In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [3]:
spark = (SparkSession.builder
                  .appName('MLBD Comment Classification')
                  .enableHiveSupport()
                  .config("spark.executor.memory", "6G")
                  .config("spark.driver.memory","18G")
                  .config("spark.executor.cores","7")
                  .config("spark.python.worker.memory","6G")
                  .config("spark.driver.maxResultSize","0")
                  .config("spark.sql.crossJoin.enabled", "true")
                  .config("spark.serializer","org.apache.spark.serializer.KryoSerializer")
                  .config("spark.default.parallelism","2")
                  .getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/13 15:53:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/13 15:53:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
df = pd.read_csv('./hw4/train.csv')
df.comment_text.fillna("", inplace=True)

In [5]:
train_pd = df.sample(frac=0.75,random_state=42)
test_pd = df.drop(train_pd.index)

In [6]:
train, test = spark.createDataFrame(train_pd), spark.createDataFrame(test_pd)

  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):


In [7]:
with open('./hw4/stop_words.txt') as file:
    stop_words = list(map(lambda x: x.replace('"', '').strip(), file.readline().split(",")))

In [8]:
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [9]:
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)

22/12/13 15:53:18 WARN StopWordsRemover: Default locale set was [en_BY]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [10]:
word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="filtered", outputCol="model")

In [11]:
word2Vec.setMaxIter(10)

Word2Vec_3906bd50b1f9

In [12]:
word2Vec.getMaxIter()

10

In [13]:
pipeline = Pipeline(stages=[tokenizer, stop_words_remover, word2Vec])
pipeline_fit = pipeline.fit(train)

dataset = pipeline_fit.transform(train)

22/12/13 15:53:19 WARN TaskSetManager: Stage 0 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:53:23 WARN TaskSetManager: Stage 2 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.




22/12/13 15:53:30 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/12/13 15:53:30 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/12/13 15:53:30 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


                                                                                

In [14]:
dataset.show(5)

22/12/13 15:55:02 WARN TaskSetManager: Stage 32 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


[Stage 32:>                                                         (0 + 1) / 1]

22/12/13 15:55:06 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 32 (TID 36): Attempting to kill Python Worker
+----------------+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+--------------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|               words|            filtered|               model|
+----------------+--------------------+-----+------------+-------+------+------+-------------+--------------------+--------------------+--------------------+
|7ca72b5b9c688e9e|Geez, are you for...|    0|           0|      0|     0|     0|            0|[geez,, are, you,...|[geez,, forgetful...|[-0.0745399384200...|
|c03f72fd8f8bf54f|Carioca RFA \n\nT...|    0|           0|      0|     0|     0|            0|[carioca, rfa, , ...|[carioca, rfa, su...|[0.35758345896999...|
|9e5b8e8fc1ff2e84|"\n\n Birthday \n...|    0|           0|      0|     0| 

                                                                                

In [15]:
test_res = test.select('id')
test_res.head()

22/12/13 15:55:06 WARN TaskSetManager: Stage 33 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.


[Stage 33:>                                                         (0 + 1) / 1]

22/12/13 15:55:10 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 33 (TID 37): Attempting to kill Python Worker


  self._sock = None


Row(id='00025465d4725e87')

In [16]:
cols_to_predict = [i for i in train.columns if i not in ['id', 'comment_text']]
cols_to_predict

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [17]:
regParam = 0.1

In [18]:
dataset_test = pipeline_fit.transform(test)

In [20]:
for col in cols_to_predict:
    logreg = LogisticRegression(featuresCol='model', labelCol=col, regParam=regParam)
    model = logreg.fit(dataset)
    res = model.transform(dataset_test)
    test_res = test_res.join(res.select('id', 'prediction'), on="id")
    test_res = test_res.withColumnRenamed("prediction", col)
    test_res.show(5)

22/12/13 15:57:01 WARN TaskSetManager: Stage 35 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:03 WARN TaskSetManager: Stage 36 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:06 WARN TaskSetManager: Stage 37 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:06 WARN TaskSetManager: Stage 38 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:06 WARN TaskSetManager: Stage 39 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:06 WARN TaskSetManager: Stage 40 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:06 WARN TaskSetManager: Stage 41 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:06 WARN TaskSetManager: Stage 42 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:06 WARN TaskSetManager: Stage 43 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.

                                                                                

+----------------+-----+
|              id|toxic|
+----------------+-----+
|002264ea4d5f2887|  0.0|
|0310c62027c1cc81|  0.0|
|03effbaf048d353d|  0.0|
|03f1f91ce9efe2c4|  0.0|
|04d0cce9eb0667a8|  0.0|
+----------------+-----+
only showing top 5 rows

22/12/13 15:57:08 WARN TaskSetManager: Stage 53 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:10 WARN TaskSetManager: Stage 54 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:17 WARN TaskSetManager: Stage 55 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:17 WARN TaskSetManager: Stage 56 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:17 WARN TaskSetManager: Stage 57 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:18 WARN TaskSetManager: Stage 58 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:18 WARN TaskSetManager: Stage 59 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:18 WARN TaskSetManager: Stage 60 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:18 WARN TaskSetManager: Stage 61 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.

                                                                                

+----------------+-----+------------+
|              id|toxic|severe_toxic|
+----------------+-----+------------+
|002264ea4d5f2887|  0.0|         0.0|
|0310c62027c1cc81|  0.0|         0.0|
|03effbaf048d353d|  0.0|         0.0|
|03f1f91ce9efe2c4|  0.0|         0.0|
|04d0cce9eb0667a8|  0.0|         0.0|
+----------------+-----+------------+
only showing top 5 rows

22/12/13 15:57:20 WARN TaskSetManager: Stage 77 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:22 WARN TaskSetManager: Stage 78 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:25 WARN TaskSetManager: Stage 79 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:25 WARN TaskSetManager: Stage 80 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:25 WARN TaskSetManager: Stage 81 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:25 WARN TaskSetManager: Stage 82 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:25 WARN TaskSetManager: Stage 83 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:25 WARN TaskSetManager: Stage 84 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:25 WARN TaskSetManager: Stage 85 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.

                                                                                

+----------------+-----+------------+-------+
|              id|toxic|severe_toxic|obscene|
+----------------+-----+------------+-------+
|002264ea4d5f2887|  0.0|         0.0|    0.0|
|0310c62027c1cc81|  0.0|         0.0|    0.0|
|03effbaf048d353d|  0.0|         0.0|    0.0|
|03f1f91ce9efe2c4|  0.0|         0.0|    0.0|
|04d0cce9eb0667a8|  0.0|         0.0|    0.0|
+----------------+-----+------------+-------+
only showing top 5 rows

22/12/13 15:57:28 WARN TaskSetManager: Stage 102 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:31 WARN TaskSetManager: Stage 103 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:33 WARN TaskSetManager: Stage 104 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:33 WARN TaskSetManager: Stage 105 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:33 WARN TaskSetManager: Stage 106 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:33 WARN TaskSetManager: Stage 107 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:33 WARN TaskSetManager: Stage 108 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:33 WARN TaskSetManager: Stage 109 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:33 WARN TaskSetManager: Stage 110 contains a task of very large size (25197 KiB). The maximum recommended task size is 10

[Stage 116:>  (0 + 2) / 2][Stage 117:>  (0 + 2) / 2][Stage 118:>  (0 + 2) / 2]

22/12/13 15:57:35 WARN TaskSetManager: Stage 120 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+----------------+-----+------------+-------+------+
|              id|toxic|severe_toxic|obscene|threat|
+----------------+-----+------------+-------+------+
|002264ea4d5f2887|  0.0|         0.0|    0.0|   0.0|
|0310c62027c1cc81|  0.0|         0.0|    0.0|   0.0|
|03effbaf048d353d|  0.0|         0.0|    0.0|   0.0|
|03f1f91ce9efe2c4|  0.0|         0.0|    0.0|   0.0|
|04d0cce9eb0667a8|  0.0|         0.0|    0.0|   0.0|
+----------------+-----+------------+-------+------+
only showing top 5 rows

22/12/13 15:57:37 WARN TaskSetManager: Stage 131 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:40 WARN TaskSetManager: Stage 132 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:42 WARN TaskSetManager: Stage 133 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:42 WARN TaskSetManager: Stage 134 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:42 WARN TaskSetManager: Stage 135 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:42 WARN TaskSetManager: Stage 136 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:42 WARN TaskSetManager: Stage 137 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:42 WARN TaskSetManager: Stage 138 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:42 WARN TaskSetManager: Stage 139 contains a task of very large size (25197 KiB). The maximum recommended task size is 10

[Stage 145:>  (0 + 2) / 2][Stage 146:>  (0 + 2) / 2][Stage 147:>  (0 + 2) / 2]

22/12/13 15:57:44 WARN TaskSetManager: Stage 149 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+----------------+-----+------------+-------+------+------+
|              id|toxic|severe_toxic|obscene|threat|insult|
+----------------+-----+------------+-------+------+------+
|002264ea4d5f2887|  0.0|         0.0|    0.0|   0.0|   0.0|
|0310c62027c1cc81|  0.0|         0.0|    0.0|   0.0|   0.0|
|03effbaf048d353d|  0.0|         0.0|    0.0|   0.0|   0.0|
|03f1f91ce9efe2c4|  0.0|         0.0|    0.0|   0.0|   0.0|
|04d0cce9eb0667a8|  0.0|         0.0|    0.0|   0.0|   0.0|
+----------------+-----+------------+-------+------+------+
only showing top 5 rows

22/12/13 15:57:45 WARN TaskSetManager: Stage 162 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:47 WARN TaskSetManager: Stage 163 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/13 15:57:49 WARN TaskSetManager: Stage 164 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:49 WARN TaskSetManager: Stage 165 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:49 WARN TaskSetManager: Stage 166 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:49 WARN TaskSetManager: Stage 167 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:49 WARN TaskSetManager: Stage 168 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:49 WARN TaskSetManager: Stage 169 contains a task of very large size (25197 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:49 WARN TaskSetManager: Stage 170 contains a task of very large size (25197 KiB). The maximum recommended task size is 10

[Stage 175:=> (1 + 1) / 2][Stage 176:>  (0 + 2) / 2][Stage 177:>  (0 + 2) / 2]

22/12/13 15:57:52 WARN TaskSetManager: Stage 180 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:57:52 WARN TaskSetManager: Stage 181 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.




+----------------+-----+------------+-------+------+------+-------------+
|              id|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+-----+------------+-------+------+------+-------------+
|002264ea4d5f2887|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|0310c62027c1cc81|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|03effbaf048d353d|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|03f1f91ce9efe2c4|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|04d0cce9eb0667a8|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
+----------------+-----+------------+-------+------+------+-------------+
only showing top 5 rows



                                                                                

In [21]:
test_res_pd = test_res.toPandas()

  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):


22/12/13 15:58:21 WARN TaskSetManager: Stage 196 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:58:21 WARN TaskSetManager: Stage 197 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:58:21 WARN TaskSetManager: Stage 198 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:58:21 WARN TaskSetManager: Stage 199 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:58:22 WARN TaskSetManager: Stage 200 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.


[Stage 197:>  (0 + 2) / 2][Stage 198:>  (0 + 2) / 2][Stage 199:>  (0 + 2) / 2]

22/12/13 15:58:23 WARN TaskSetManager: Stage 201 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.
22/12/13 15:58:24 WARN TaskSetManager: Stage 202 contains a task of very large size (8395 KiB). The maximum recommended task size is 1000 KiB.


  self._sock = None


In [52]:
test_res_pd = test_res_pd.sort_values(by=['id'])

In [53]:
test_res_real = test_pd.sort_values(by=['id'])

In [24]:
test_res_pd.toxic = test_res_pd.toxic.astype(int)
test_res_pd.severe_toxic = test_res_pd.severe_toxic.astype(int)
test_res_pd.obscene = test_res_pd.obscene.astype(int)
test_res_pd.threat = test_res_pd.threat.astype(int)
test_res_pd.insult = test_res_pd.insult.astype(int)
test_res_pd.identity_hate = test_res_pd.identity_hate.astype(int)

In [49]:
print(f1_score(test_res_pd.toxic, test_res_real.toxic))
print(recall_score(test_res_pd.toxic, test_res_real.toxic))
print(precision_score(test_res_pd.toxic, test_res_real.toxic))
print(accuracy_score(test_res_pd.toxic, test_res_real.toxic))

0.957032029700994
0.9185889084731907
0.9988333981445475
0.9190584814378462


In [48]:
print(f1_score(test_res_pd.severe_toxic, test_res_real.severe_toxic))
print(recall_score(test_res_pd.severe_toxic, test_res_real.severe_toxic))
print(precision_score(test_res_pd.severe_toxic, test_res_real.severe_toxic))
print(accuracy_score(test_res_pd.severe_toxic, test_res_real.severe_toxic))

0.9948847171475369
0.9898463158422544
0.9999746726439227
0.989822775925601


In [42]:
print(f1_score(test_res_pd.obscene, test_res_real.obscene))
print(recall_score(test_res_pd.obscene, test_res_real.obscene))
print(precision_score(test_res_pd.obscene, test_res_real.obscene))
print(accuracy_score(test_res_pd.obscene, test_res_real.obscene))

0.9752031889009681
0.9529542695265075
0.998515848616559
0.9519715238262352


In [41]:
print(f1_score(test_res_pd.threat, test_res_real.threat))
print(recall_score(test_res_pd.threat, test_res_real.threat))
print(precision_score(test_res_pd.threat, test_res_real.threat))
print(accuracy_score(test_res_pd.threat, test_res_real.threat))

0.9983805566365794
0.9967663499862132
1.0
0.9967663499862132


In [40]:
print(f1_score(test_res_pd.identity_hate, test_res_real.identity_hate))
print(recall_score(test_res_pd.identity_hate, test_res_real.identity_hate))
print(precision_score(test_res_pd.identity_hate, test_res_real.identity_hate))
print(accuracy_score(test_res_pd.identity_hate, test_res_real.identity_hate))

0.995429535524974
0.99090065926353
1.0
0.99090065926353


HashingTF - Этот подход страдает от потенциальных коллизий хэшей, но экономит место на больших корпусах слов. Представляет собой трансформер.

Word2Vec - модель, преобразует слово в код для дальнейшей обработки.

В данном задании Word2Vec показала себя лучше на первых столбцах, HashingTF на последних