In [42]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [43]:
spark = (SparkSession.builder
                  .appName('MLBD Comment Classification')
                  .enableHiveSupport()
                  .config("spark.executor.memory", "6G")
                  .config("spark.driver.memory","18G")
                  .config("spark.executor.cores","7")
                  .config("spark.python.worker.memory","6G")
                  .config("spark.driver.maxResultSize","0")
                  .config("spark.sql.crossJoin.enabled", "true")
                  .config("spark.serializer","org.apache.spark.serializer.KryoSerializer")
                  .config("spark.default.parallelism","2")
                  .getOrCreate())

In [44]:
df = pd.read_csv('./hw4/train.csv')
df.comment_text.fillna("", inplace=True)

In [45]:
train_pd = df.sample(frac=0.75,random_state=42)
test_pd = df.drop(train_pd.index)

In [46]:
train, test = spark.createDataFrame(train_pd), spark.createDataFrame(test_pd)

In [47]:
with open('./hw4/stop_words.txt') as file:
    stop_words = list(map(lambda x: x.replace('"', '').strip(), file.readline().split(",")))

In [48]:
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [49]:
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)

22/12/11 16:17:06 WARN StopWordsRemover: Default locale set was [en_BY]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [50]:
word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="filtered", outputCol="model")

In [51]:
word2Vec.setMaxIter(10)

Word2Vec_1f1eb2e65e67

In [52]:
word2Vec.getMaxIter()

10

In [20]:
pipeline = Pipeline(stages=[tokenizer, stop_words_remover, word2Vec])
pipeline_fit = pipeline.fit(train)

dataset = pipeline_fit.transform(train)

[Stage 0:>                                                          (0 + 0) / 2]

22/12/11 15:59:04 WARN TaskSetManager: Stage 0 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/11 15:59:08 WARN TaskSetManager: Stage 2 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.




22/12/11 15:59:10 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/12/11 15:59:10 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/12/11 15:59:10 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


                                                                                

In [24]:
dataset.show(5)

22/12/11 16:10:14 WARN TaskSetManager: Stage 34 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


[Stage 34:>                                                         (0 + 1) / 1]

22/12/11 16:10:18 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 34 (TID 38): Attempting to kill Python Worker
+----------------+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+--------------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|               words|            filtered|               model|
+----------------+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+--------------------+
|7ca72b5b9c688e9e|Geez, are you for...|    0|           0|      0|     0|     0|            0|[geez,, are, you,...|[geez,, forgetful...|[-0.0745399384200...|
|c03f72fd8f8bf54f|Carioca RFA \n\nT...|    0|           0|      0|     0|     0|            0|[carioca, rfa, , ...|[carioca, rfa, su...|[0.35758345896999...|
|9e5b8e8fc1ff2e84|"\n\n Birthday \n...|    0|           0|      0|     0| 

                                                                                

In [25]:
test_res = test.select('id')
test_res.head()

22/12/11 16:11:00 WARN TaskSetManager: Stage 35 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.


[Stage 35:>                                                         (0 + 1) / 1]

22/12/11 16:11:04 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 35 (TID 39): Attempting to kill Python Worker


                                                                                

Row(id='00025465d4725e87')

In [23]:
cols_to_predict = [i for i in train.columns if i not in ['id', 'comment_text']]
cols_to_predict

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [27]:
regParam = 0.1

In [28]:
dataset_test = pipeline_fit.transform(test)

In [30]:
for col in cols_to_predict:
    logreg = LogisticRegression(featuresCol='model', labelCol=col, regParam=regParam)
    model = logreg.fit(dataset)
    res = model.transform(dataset_test)
    test_res = test_res.join(res.select('id', 'prediction'), on="id")
    test_res = test_res.withColumnRenamed("prediction", col)
    test_res.show(5)

22/12/11 16:14:14 WARN TaskSetManager: Stage 164 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/11 16:14:30 WARN TaskSetManager: Stage 165 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/11 16:14:32 WARN TaskSetManager: Stage 166 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:32 WARN TaskSetManager: Stage 167 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:33 WARN TaskSetManager: Stage 168 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:33 WARN TaskSetManager: Stage 169 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:33 WARN TaskSetManager: Stage 170 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:33 WARN TaskSetManager: Stage 171 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:33 WARN TaskSetManager: Stage 172 contains a task of very large size (25197 KiB). The maximum recommended task size is 10

[Stage 178:>  (0 + 2) / 2][Stage 179:>  (0 + 2) / 2][Stage 180:>  (0 + 2) / 2]

22/12/11 16:14:36 WARN TaskSetManager: Stage 182 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:36 WARN TaskSetManager: Stage 183 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.




+----------------+-----+------------+-------+------+------+-------------+
|              id|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+-----+------------+-------+------+------+-------------+
|002264ea4d5f2887|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|0310c62027c1cc81|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|03effbaf048d353d|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|03f1f91ce9efe2c4|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|04d0cce9eb0667a8|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
+----------------+-----+------------+-------+------+------+-------------+
only showing top 5 rows



                                                                                

In [31]:
test_res_pd = test_res.toPandas()

22/12/11 16:14:58 WARN TaskSetManager: Stage 198 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:58 WARN TaskSetManager: Stage 199 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:58 WARN TaskSetManager: Stage 200 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:58 WARN TaskSetManager: Stage 201 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:14:58 WARN TaskSetManager: Stage 202 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.


[Stage 199:>  (0 + 2) / 2][Stage 200:>  (0 + 2) / 2][Stage 201:>  (0 + 2) / 2]

22/12/11 16:15:00 WARN TaskSetManager: Stage 203 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/11 16:15:00 WARN TaskSetManager: Stage 204 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [32]:
test_res_pd = test_res_pd.sort_values(by=['id'])

In [33]:
test_res_real = test_pd.sort_values(by=['id'])

In [34]:
test_res_pd.toxic = test_res_pd.toxic.astype(int)
test_res_pd.severe_toxic = test_res_pd.severe_toxic.astype(int)
test_res_pd.obscene = test_res_pd.obscene.astype(int)
test_res_pd.threat = test_res_pd.threat.astype(int)
test_res_pd.insult = test_res_pd.insult.astype(int)
test_res_pd.identity_hate = test_res_pd.identity_hate.astype(int)

In [35]:
test_res_pd

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
36098,00013fa6fb6ef643,0,0,0,0,0,0
11656,00025465d4725e87,0,0,0,0,0,0
35243,0002eeaf4c0cdf35,0,0,0,0,0,0
18272,00040093b2687caa,0,0,0,0,0,0
3678,0005300084f90edc,0,0,0,0,0,0
...,...,...,...,...,...,...,...
27058,fff5d08a356a9fde,0,0,0,0,0,0
29825,fff880e2b149dc13,0,0,0,0,0,0
21592,fff90f6920245ab8,0,0,0,0,0,0
20232,ffff3a700c54e047,0,0,0,0,0,0


In [36]:
test_res_real

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
95737,00013fa6fb6ef643,"Wehwalt, FTR, I'm not objecting to text about ...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
95739,0002eeaf4c0cdf35,But isnt it against the rules to edit if you a...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0
10,0005300084f90edc,"""\nFair use rationale for Image:Wonju.jpg\n\nT...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
95719,fff5d08a356a9fde,"Expansion \n\nI can handle the expansion, give...",0,0,0,0,0,0
95725,fff880e2b149dc13,February 2006 (UTC))\n\nWell if you would just...,0,0,0,0,0,0
143604,fff90f6920245ab8,How about edit the broken links of Incident at...,0,0,0,0,0,0
143606,ffff3a700c54e047,"""\n\nI am glad I can amuse you. Perhaps your t...",0,0,0,0,0,0


In [37]:
print(f1_score(test_res_pd.toxic, test_res_real.toxic))
print(recall_score(test_res_pd.toxic, test_res_real.toxic))
print(precision_score(test_res_pd.toxic, test_res_real.toxic))
print(accuracy_score(test_res_pd.toxic, test_res_real.toxic))

0.30364459780030184
0.9436997319034852
0.18093035209457722
0.9190584814378462


In [38]:
print(f1_score(test_res_pd.severe_toxic, test_res_real.severe_toxic))
print(recall_score(test_res_pd.severe_toxic, test_res_real.severe_toxic))
print(precision_score(test_res_pd.severe_toxic, test_res_real.severe_toxic))
print(accuracy_score(test_res_pd.severe_toxic, test_res_real.severe_toxic))

0.02403846153846154
0.8333333333333334
0.012195121951219513
0.989822775925601


In [39]:
print(f1_score(test_res_pd.obscene, test_res_real.obscene))
print(recall_score(test_res_pd.obscene, test_res_real.obscene))
print(precision_score(test_res_pd.obscene, test_res_real.obscene))
print(accuracy_score(test_res_pd.obscene, test_res_real.obscene))

0.2390786338363781
0.8431372549019608
0.13928736695974087
0.9519715238262352


In [40]:
print(f1_score(test_res_pd.threat, test_res_real.threat))
print(recall_score(test_res_pd.threat, test_res_real.threat))
print(precision_score(test_res_pd.threat, test_res_real.threat))
print(accuracy_score(test_res_pd.threat, test_res_real.threat))

0.0
0.0
0.0
0.9967663499862132


  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
print(f1_score(test_res_pd.identity_hate, test_res_real.identity_hate))
print(recall_score(test_res_pd.identity_hate, test_res_real.identity_hate))
print(precision_score(test_res_pd.identity_hate, test_res_real.identity_hate))
print(accuracy_score(test_res_pd.identity_hate, test_res_real.identity_hate))

0.0
0.0
0.0
0.99090065926353


  _warn_prf(average, modifier, msg_start, len(result))
