In [1]:
import glob
import os
import sys

# Cluster-specific configuration: Spark submit options, the Python interpreter
# PySpark should use, and the Spark installation directory.
os.environ["PYSPARK_SUBMIT_ARGS"]='--conf spark.sql.catalogImplementation=in-memory pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Find the bundled py4j archive by pattern instead of hard-coding its version,
# so the cell keeps working when the Spark installation is upgraded.
py4j_archives = sorted(glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip')))
if not py4j_archives:
    raise ValueError('No py4j-*-src.zip archive found under ' + os.path.join(spark_home, 'python/lib'))

# Same insertion order as before: py4j ends up ahead of the pyspark sources.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, py4j_archives[0])

# Run PySpark's interactive shell bootstrap in the notebook namespace (the cell
# output reports "SparkSession available as 'spark'"). The file is read inside a
# `with` block so the handle is closed, and compiled with its real path so
# tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.2
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
spark

![kmeans](pics/kmeans.svg)

![kmeans_algo](pics/kmeans_algo.png)

In [3]:
# Import only the schema types this notebook uses, rather than a wildcard
# import, so it is clear where each name comes from.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

In [4]:
# Explicit schema for the training CSV: two string columns followed by the six
# integer toxicity labels, in file order.
schema = StructType(
    [StructField(text_col, StringType()) for text_col in ("id", "comment_text")]
    + [
        StructField(label_col, IntegerType())
        for label_col in (
            "toxic",
            "severe_toxic",
            "obscene",
            "threat",
            "insult",
            "identity_hate",
        )
    ]
)

In [5]:
# Load the training CSV using the explicit `schema`; the first line of the file is a header.
# multiLine=True and escape='"' are needed because comment_text values contain embedded
# newlines and doubled quotes (visible in the rows printed by topics.take(5) further down).
dataset = spark.read.csv("/user/pavel.klemenkov/lectures/lecture03/data/train.csv", schema=schema, header=True, multiLine=True, escape='"')

In [7]:
# Rebinds `dataset` to a repartitioned (4 partitions), cached version of itself;
# re-running this cell repartitions the already repartitioned frame again.
# NOTE(review): the partition count 4 is hard-coded — presumably chosen for this cluster; confirm.
dataset = dataset.repartition(4).cache()

In [8]:
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [9]:
# Import only the feature transformers this notebook uses, rather than a
# wildcard import, so it is clear where each name comes from.
from pyspark.ml.feature import CountVectorizer, StopWordsRemover, Tokenizer

In [10]:
# Splits `comment_text` into the `words` array column.
# Judging by the tokens printed by topics.take(5) further down (e.g. 'entirely.', 'case,'),
# the text is lowercased and punctuation stays attached to the words.
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [11]:
# Built-in English stop-word list shipped with Spark ML.
stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [12]:
# Removes the stop words from the tokenizer's output column; the result is written to `words_filtered`.
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", stopWords=stop_words)

In [13]:
# Bag-of-words term counts over the filtered tokens, written to `word_vector`.
# vocabSize=200 caps the vocabulary, so every vector has 200 dimensions
# (matching the "(200,[...],[...])" vectors in the show() output below).
count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", vocabSize=200)

In [14]:
from pyspark.ml import Pipeline

In [15]:
# Text preprocessing chain: tokenize -> drop stop words -> count terms.
preprocessing = Pipeline(stages=[tokenizer, swr, count_vectorizer])

In [16]:
# Fit the preprocessing pipeline on the full dataset; the fitted CountVectorizer
# stage is later read back via preprocessing_model.stages to get the vocabulary.
preprocessing_model = preprocessing.fit(dataset)

In [17]:
# Apply the fitted pipeline; adds the `words`, `words_filtered` and `word_vector` columns.
preprocessed_dataset = preprocessing_model.transform(dataset)

In [20]:
# Peek at the first five sparse count vectors without truncating them.
preprocessed_dataset.select("word_vector").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------+
|word_vector                                                                                                                            |
+---------------------------------------------------------------------------------------------------------------------------------------+
|(200,[0,1,5,53,55,68,101,115,129],[2.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0])                                                               |
|(200,[0,1,70,78,101,179],[6.0,1.0,1.0,1.0,1.0,1.0])                                                                                    |
|(200,[0,2,3,4,9,13,18,22,43,44,49,53,70,105,113,128,148,156],[1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(200,[0,11,49,128,131],[2.0,1.0,2.0,1.0,1.0])                                                                                          |
|(200,[7,8,22,28,31,38,42,44,46,50

In [21]:
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [22]:
from pyspark.ml.clustering import KMeans

In [23]:
# K-means estimator over the bag-of-words vectors: 7 clusters, fixed seed.
kmeans = KMeans(featuresCol="word_vector", k=7, seed=5757)

In [24]:
kmeans_model = kmeans.fit(preprocessed_dataset)

In [25]:
clustering = kmeans_model.transform(preprocessed_dataset)

In [26]:
# Columns 2..7 are the six toxicity labels (see the schema); show them next to
# the cluster assigned to each of the first ten rows.
clustering.select(clustering.columns[2:8] + ["prediction"]).take(10)

[Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=2),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=2),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=2),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=2),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=2),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=2),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=2),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=2),
 Row(toxic=1, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=2)]

In [27]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [28]:
# Clustering-quality evaluator computed on the same `word_vector` features used for K-means.
# NOTE(review): no metric is specified, so the library default applies — presumably
# the silhouette score; confirm against the PySpark version in use.
evaluator = ClusteringEvaluator(featuresCol="word_vector")

In [29]:
evaluator.evaluate(clustering)

0.37089016703400873

In [30]:
# Print five raw comments that were assigned to cluster 1, one record per block.
(
    clustering
    .where(clustering["prediction"] == 1)
    .select("comment_text")
    .show(5, truncate=False, vertical=True)
)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [31]:
# Second experiment: 2 clusters with a different seed.
# Rebinds `kmeans`, so the k=7 estimator is no longer reachable by this name.
kmeans = KMeans(featuresCol="word_vector", k=2, seed=1234)

In [32]:
# Rebinds `kmeans_model` to the k=2 model; all later cells that read
# kmeans_model.clusterCenters() refer to this model, not the k=7 one.
kmeans_model = kmeans.fit(preprocessed_dataset)

In [33]:
# Rebinds `clustering` to the k=2 assignments (adds/overwrites the `prediction` column view).
clustering = kmeans_model.transform(preprocessed_dataset)

In [34]:
evaluator.evaluate(clustering)

0.9994404484935104

In [35]:
kmeans_model.clusterCenters()

[array([3.0109565 , 0.51180268, 0.24449041, 0.17984831, 0.17243325,
        0.16500564, 0.15431867, 0.14628306, 0.14062304, 0.14034725,
        0.11886674, 0.11762567, 0.10255735, 0.10247587, 0.09564999,
        0.09480381, 0.0941394 , 0.09082362, 0.08216748, 0.08130876,
        0.07923405, 0.07139275, 0.07050896, 0.06879153, 0.06503071,
        0.06314404, 0.06249216, 0.06222264, 0.06039865, 0.05926413,
        0.05809828, 0.05765325, 0.05692616, 0.0554093 , 0.05474489,
        0.0546446 , 0.05430613, 0.05344114, 0.05312147, 0.05280807,
        0.05084618, 0.05274539, 0.05250094, 0.05196816, 0.0514103 ,
        0.05116585, 0.05008148, 0.04925411, 0.04914128, 0.04869625,
        0.04811959, 0.0476871 , 0.04764949, 0.04752413, 0.04740504,
        0.04605115, 0.04578789, 0.04533659, 0.04526138, 0.0452175 ,
        0.0446095 , 0.04420835, 0.04403284, 0.04393882, 0.04333709,
        0.04328695, 0.0428858 , 0.04227153, 0.04097405, 0.04091764,
        0.04075467, 0.04058543, 0.04028457, 0.04

In [36]:
import numpy as np

In [37]:
# Largest coordinate of the first cluster center.
kmeans_model.clusterCenters()[0].max()

3.0109564999373197

In [38]:
# Vocabulary indices of cluster 0's center ordered from the heaviest coordinate
# to the lightest (negating the values turns argsort's ascending order into descending).
np.argsort(-kmeans_model.clusterCenters()[0])

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  41,  42,  43,  44,  45,  40,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  87,  86,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 125, 124, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156,
       157, 158, 159, 153, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [39]:
# Same ordering for cluster 1's center: indices from heaviest to lightest coordinate.
np.argsort(-kmeans_model.clusterCenters()[1])

array([  0,  40, 153,   3,   7,   1,  18,  13,   6,   5, 159,   2,  36,
        55,  47,  12,  15,  16, 175, 140,  20,  48, 194,  58,  26,  29,
        23,  10,  22, 142,  96,  53,  56,  49,  50, 164, 124,  86,  88,
        91,  97, 107, 115, 128,  99,  17,  42,  41,  11,  27,  19,  24,
        85,   8,  30, 156,  14, 104,  61,  60,  44,  34, 117,  39, 134,
       109, 135,  81, 138,  35,  54, 113, 143, 106,  31,  33,  82, 166,
       133,  93,  45,  79,  63,  62,  80,  67,  68, 186, 192,  66, 177,
         4,  51,  43,  46,  73,  75,  78,  65, 137, 193, 141, 190, 130,
       129, 196, 151, 146, 148, 152, 155, 157, 158, 181, 180, 161, 162,
         9, 165, 173, 171, 188, 167, 122, 199,  74,  98,  71,  70,  32,
        69, 110, 111,  92, 101, 114,  64,  25,  59, 118,  57, 119,  37,
        38,  52,  77,  72, 170, 172,  76, 174, 169, 168, 184, 178, 179,
       182, 183, 185, 187, 189, 191, 195, 197, 176,  83, 121,  87, 125,
       126, 127,  21, 120, 131, 132, 116, 136, 112, 139, 108, 10

In [40]:
# stages[2] is the fitted CountVectorizer (third stage of the preprocessing pipeline);
# its vocabulary maps vector index -> term.
preprocessing_model.stages[2].vocabulary

['',
 '"',
 'article',
 'page',
 'please',
 'like',
 'one',
 '-',
 'wikipedia',
 'talk',
 'think',
 'see',
 'also',
 'know',
 'may',
 'edit',
 'people',
 'use',
 'get',
 'even',
 'make',
 'articles',
 'good',
 'want',
 'time',
 'it.',
 'need',
 'new',
 'thank',
 'go',
 'first',
 'information',
 'many',
 'made',
 'find',
 'page.',
 'name',
 'really',
 'thanks',
 'say',
 'fuck',
 'much',
 'used',
 'since',
 'article.',
 'user',
 'add',
 'way',
 'take',
 'help',
 'sources',
 'look',
 'someone',
 'still',
 'read',
 'section',
 'pages',
 'going',
 'two',
 'deletion',
 'you.',
 'source',
 'edits',
 'without',
 'discussion',
 'well',
 'editing',
 'wikipedia.',
 'point',
 'deleted',
 'back',
 'might',
 'work',
 'something',
 'image',
 'another',
 'added',
 'never',
 'put',
 'link',
 'seems',
 'stop',
 ',',
 'blocked',
 'feel',
 '.',
 'list',
 'block',
 'right',
 'said',
 '(utc)',
 'using',
 'ask',
 'personal',
 'fact',
 'sure',
 'article,',
 'believe',
 'hope',
 'page,',
 'note',
 'actually',


In [43]:
# Print the 40 vocabulary terms with the largest weight in cluster 1's center,
# heaviest first. The vocabulary list is looked up once before the loop instead
# of being re-read from the fitted model on every iteration.
vocabulary = preprocessing_model.stages[2].vocabulary
for i in np.argsort(-kmeans_model.clusterCenters()[1])[:40]:
    print(vocabulary[i])


fuck
|
page
-
"
get
know
one
like
(talk)
article
name
section
way
also
edit
people
u
come
make
take
text
two
need
go
want
think
good
person
article,
still
pages
help
sources
)
give
list
right
using


In [42]:
# Print the 40 vocabulary terms with the largest weight in cluster 0's center,
# heaviest first. The vocabulary list is looked up once before the loop instead
# of being re-read from the fitted model on every iteration.
vocabulary = preprocessing_model.stages[2].vocabulary
for i in np.argsort(-kmeans_model.clusterCenters()[0])[:40]:
    print(vocabulary[i])


"
article
page
please
like
one
-
wikipedia
talk
think
see
also
know
may
edit
people
use
get
even
make
articles
good
want
time
it.
need
new
thank
go
first
information
many
made
find
page.
name
really
thanks
say


## The curse of dimensionality
![curse](pics/dimensionality_vs_performance.png)

## Why is that?
![curse](pics/curseofdimensionality.png)

## LDA

![lda](pics/lda.png)

![dirichlet](pics/dirichlet.png)

In [44]:
from pyspark.ml.clustering import LDA

In [45]:
# LDA topic model over the same bag-of-words vectors: 6 topics, fixed seed.
lda = LDA(featuresCol="word_vector", seed=5757, k=6)

In [46]:
# Fit the topic model on the preprocessed comments.
lda_model = lda.fit(preprocessed_dataset)

In [47]:
# Adds the `topicDistribution` vector column (one weight per topic) to every row.
topics = lda_model.transform(preprocessed_dataset)

In [48]:
topics.take(5)

[Row(id='6fdb7b6734f8bf40', comment_text='"\n\n""Katara""\nI\'ve removed the section entirely. I don\'t care if you like to pretend that Katara and Zuko are meant for each other. It\'s still not case, and there has been no indication whatsoever. Thus, there\'s little point to actually have the section and exempt Toph and Sokka beyond the insane delusions of shippers.  "', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, words=['"', '', '""katara""', "i've", 'removed', 'the', 'section', 'entirely.', 'i', "don't", 'care', 'if', 'you', 'like', 'to', 'pretend', 'that', 'katara', 'and', 'zuko', 'are', 'meant', 'for', 'each', 'other.', "it's", 'still', 'not', 'case,', 'and', 'there', 'has', 'been', 'no', 'indication', 'whatsoever.', 'thus,', "there's", 'little', 'point', 'to', 'actually', 'have', 'the', 'section', 'and', 'exempt', 'toph', 'and', 'sokka', 'beyond', 'the', 'insane', 'delusions', 'of', 'shippers.', '', '"'], words_filtered=['"', '', '""katara""', 'remove

In [49]:
lda_model.vocabSize()

200

In [50]:
lda_model.describeTopics(maxTermsPerTopic=10).collect()

[Row(topic=0, termIndices=[7, 0, 2, 8, 1, 15, 3, 6, 131, 82], termWeights=[0.06021143367631872, 0.057629535517439316, 0.05682854781977034, 0.03824067058005817, 0.03352867303981374, 0.02674318119930803, 0.020962483900486363, 0.02014963535909523, 0.017994044037643337, 0.01603144346798472]),
 Row(topic=1, termIndices=[0, 9, 1, 3, 49, 27, 8, 105, 56, 45], termWeights=[0.20897159583709132, 0.04427843581569699, 0.04230095072263008, 0.036887525279393055, 0.017060804342550105, 0.016202617806118664, 0.015977531088498063, 0.015728332085796257, 0.015688595003576014, 0.015101081536063305]),
 Row(topic=2, termIndices=[0, 40, 2, 59, 3, 4, 14, 69, 121, 1], termWeights=[0.09088524558404357, 0.06881859611232774, 0.0572980668112808, 0.048720330500939484, 0.038817531502747846, 0.03774703589267476, 0.03561011380738992, 0.034367993715631734, 0.032703661779232465, 0.028210862997463452]),
 Row(topic=3, termIndices=[5, 16, 10, 13, 6, 19, 18, 12, 0, 32], termWeights=[0.03721827676655505, 0.028764065122983237, 

In [51]:
# Print the top terms of LDA topic 0. The term indices are read from the fitted
# model rather than pasted in from the describeTopics() output, so the cell
# stays correct if the model is refit. The vocabulary is looked up once.
vocabulary = preprocessing_model.stages[-1].vocabulary
topic_row = lda_model.describeTopics(maxTermsPerTopic=10).where("topic = 0").first()
for i in topic_row.termIndices:
    print(vocabulary[i])

-

article
wikipedia
"
edit
page
one
hi
,


In [52]:
# Print the top terms of LDA topic 1. The term indices are read from the fitted
# model rather than pasted in from the describeTopics() output, so the cell
# stays correct if the model is refit. The vocabulary is looked up once.
vocabulary = preprocessing_model.stages[-1].vocabulary
topic_row = lda_model.describeTopics(maxTermsPerTopic=10).where("topic = 1").first()
for i in topic_row.termIndices:
    print(vocabulary[i])


talk
"
page
help
new
wikipedia
•
pages
user


In [53]:
# Print the top terms of LDA topic 2. The term indices are read from the fitted
# model rather than pasted in from the describeTopics() output, so the cell
# stays correct if the model is refit. The vocabulary is looked up once.
vocabulary = preprocessing_model.stages[-1].vocabulary
topic_row = lda_model.describeTopics(maxTermsPerTopic=10).where("topic = 2").first()
for i in topic_row.termIndices:
    print(vocabulary[i])


fuck
article
deletion
page
please
may
deleted
speedy
"


In [54]:
# Print the vocabulary terms for a hand-copied list of term indices.
# NOTE(review): these indices do not match topics 0-3 in the visible
# describeTopics() output, so they presumably belong to topic 4 or 5 (that part
# of the output is truncated) — confirm, then read them from
# lda_model.describeTopics() instead of hard-coding them.
# The vocabulary is looked up once rather than on every iteration.
vocabulary = preprocessing_model.stages[-1].vocabulary
for i in [0, 4, 17, 1, 28, 74, 3, 126, 60, 38]:
    print(vocabulary[i])


please
use
"
thank
image
page
copyright
you.
thanks


## Clustering is a good dimensionality reduction technique

In [55]:
topics

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int, words: array<string>, words_filtered: array<string>, word_vector: vector, topicDistribution: vector]

In [56]:
from pyspark.sql import functions as f

In [57]:
# Binary label: 0 when all six toxicity flags are 0, 1 for every other row.
target = (
    f.when(
        (topics["toxic"] == 0)
        & (topics["severe_toxic"] == 0)
        & (topics["obscene"] == 0)
        & (topics["threat"] == 0)
        & (topics["insult"] == 0)
        & (topics["identity_hate"] == 0),
        0,
    )
    .otherwise(1)
)

In [58]:
# Keep only the row id, the binary label and the 6-dimensional topic mixture,
# and cache the result because it is reused for the train/test split below.
new_dataset = (
    topics
    .withColumn("target", target)
    .select("id", "target", "topicDistribution")
    .cache()
)

In [59]:
new_dataset.take(5)

[Row(id='6fdb7b6734f8bf40', target=0, topicDistribution=DenseVector([0.0118, 0.0116, 0.011, 0.3918, 0.5622, 0.0116])),
 Row(id='26e1b63617df36b1', target=0, topicDistribution=DenseVector([0.0128, 0.0125, 0.0118, 0.0139, 0.9363, 0.0125])),
 Row(id='85e4f353ca4b2bde', target=0, topicDistribution=DenseVector([0.5708, 0.4, 0.0065, 0.0076, 0.0082, 0.0068])),
 Row(id='9d2196265213dce8', target=0, topicDistribution=DenseVector([0.132, 0.7876, 0.0178, 0.0208, 0.0229, 0.0189])),
 Row(id='fb7a63a8e287b2d1', target=0, topicDistribution=DenseVector([0.4623, 0.0088, 0.0084, 0.0097, 0.0106, 0.5001]))]

In [60]:
from pyspark.ml.classification import LogisticRegression

In [61]:
# Logistic regression that uses the LDA topic mixture as its only features.
lr = LogisticRegression(featuresCol="topicDistribution", labelCol="target")

In [62]:
# Training set: sample with fraction 0.8 from each target class, fixed seed.
# NOTE(review): sampleBy fractions are presumably per-row sampling probabilities,
# so the resulting split is only approximately 80/20 — confirm.
train = new_dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=5757).cache()

In [63]:
# Test set: every row of new_dataset whose id does not appear in train (left anti join).
# assumes `id` uniquely identifies a row — TODO confirm
test = new_dataset.join(train, on="id", how="leftanti").cache()

In [64]:
# Fit the classifier on the training split only.
lr_model = lr.fit(train)

In [65]:
# Score the held-out rows.
predictions = lr_model.transform(test)

In [66]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [67]:
# Rebinds `evaluator` — until now it held the ClusteringEvaluator used for K-means —
# to a binary-classification evaluator with `target` as the label column.
# NOTE(review): no metricName is given, so the library default applies — presumably
# area under the ROC curve; confirm against the PySpark version in use.
evaluator = BinaryClassificationEvaluator(labelCol="target")

In [68]:
evaluator.evaluate(predictions)

0.6444604194017056

## For comparison: last time, using CountVectorizer features with a 20k-word vocabulary, we got 0.8275751487175559

In [69]:
spark.stop()