In [26]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

from sklearn.metrics import f1_score

In [2]:
spark = (SparkSession.builder
                  .appName('MLBD Comment Classification')
                  .enableHiveSupport()
                  .config("spark.executor.memory", "6G")
                  .config("spark.driver.memory","18G")
                  .config("spark.executor.cores","7")
                  .config("spark.python.worker.memory","6G")
                  .config("spark.driver.maxResultSize","0")
                  .config("spark.sql.crossJoin.enabled", "true")
                  .config("spark.serializer","org.apache.spark.serializer.KryoSerializer")
                  .config("spark.default.parallelism","2")
                  .getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/10 19:13:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = pd.read_csv('./train.csv')
df.comment_text.fillna("", inplace=True)

In [4]:
percent = int(len(df) * 0.75)

In [5]:
train, test = spark.createDataFrame(df.iloc[:percent]), spark.createDataFrame(df.iloc[percent:])

In [6]:
with open('./stop_words.txt') as file:
    stop_words = list(map(lambda x: x.replace('"', '').strip(), file.readline().split(",")))

In [7]:
tokenizer = RegexTokenizer(inputCol="comment_text", outputCol="words", pattern="\\W")

In [8]:
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)

22/12/10 19:13:58 WARN StopWordsRemover: Default locale set was [en_BY]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [9]:
hashingTF = HashingTF(inputCol="filtered", outputCol="filtered_hashed")

In [10]:
idf = IDF(inputCol="filtered_hashed", outputCol="features")

In [11]:
pipeline = Pipeline(stages=[tokenizer, stop_words_remover, hashingTF, idf])
pipeline_fit = pipeline.fit(train)

dataset = pipeline_fit.transform(train)

[Stage 0:>                                                          (0 + 0) / 2]

22/12/10 19:14:00 WARN TaskSetManager: Stage 0 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [12]:
dataset.select("features").head(1)

22/12/10 19:14:04 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:04 WARN TaskSetManager: Stage 1 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


[Stage 1:>                                                          (0 + 1) / 1]

22/12/10 19:14:08 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 1 (TID 2): Attempting to kill Python Worker


                                                                                

[Row(features=SparseVector(262144, {6240: 8.5145, 10214: 6.233, 11680: 4.7144, 23032: 4.851, 25000: 5.4047, 26144: 3.5575, 37479: 5.9755, 66299: 7.5982, 68213: 8.4345, 72125: 1.7795, 77971: 7.3618, 79300: 6.3647, 103863: 6.605, 110510: 5.3808, 146687: 6.3647, 151751: 8.648, 154643: 3.9257, 167363: 6.3794, 190344: 5.1943, 213145: 6.8723, 223402: 3.0647, 229137: 4.2037}))]

In [13]:
regParam = 0.1

In [14]:
dataset_test = pipeline_fit.transform(test)

In [15]:
test_res = test.select('id')
test_res.head()

22/12/10 19:14:10 WARN TaskSetManager: Stage 2 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.


[Stage 2:>                                                          (0 + 1) / 1]

22/12/10 19:14:14 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 2 (TID 3): Attempting to kill Python Worker


                                                                                

Row(id='7fe13f3ea7605f02')

In [16]:
cols_to_predict = [i for i in train.columns if i not in ['id', 'comment_text']]

In [17]:
for col in cols_to_predict:
    logreg = LogisticRegression(featuresCol='features', labelCol=col, regParam=regParam)
    model = logreg.fit(dataset)
    res = model.transform(dataset_test)
    test_res = test_res.join(res.select('id', 'prediction'), on="id")
    test_res = test_res.withColumnRenamed("prediction", col)
    test_res.show(5)

22/12/10 19:14:17 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:17 WARN TaskSetManager: Stage 3 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:14:20 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
22/12/10 19:14:20 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:20 WARN TaskSetManager: Stage 4 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


[Stage 4:>                                                          (0 + 2) / 2]

22/12/10 19:14:22 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/12/10 19:14:22 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/12/10 19:14:22 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/12/10 19:14:22 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/12/10 19:14:22 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:22 WARN TaskSetManager: Stage 5 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:14:23 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:23 WARN TaskSetManager: Stage 6 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:23 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:23 WARN TaskSetManager: Stage 7 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:23 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:23 WARN TaskSetManager: Stage 8 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:23 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:23 WARN TaskSetManager: Stage 9 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:23 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22

22/12/10 19:14:27 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:27 WARN TaskSetManager: Stage 42 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:27 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:27 WARN TaskSetManager: Stage 43 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:27 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:27 WARN TaskSetManager: Stage 44 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:28 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:28 WARN TaskSetManager: Stage 45 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:28 WARN DAGScheduler: Broadcasting large task binary with size 4.1 Mi

                                                                                

+----------------+-----+
|              id|toxic|
+----------------+-----+
|805d0ddb7e79b173|  0.0|
|80fb26ac41c0a3e0|  0.0|
|81de7451eafefeb3|  0.0|
|82c3fd5493e4873e|  1.0|
|83a835e1c77f1ab8|  0.0|
+----------------+-----+
only showing top 5 rows

22/12/10 19:14:32 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:32 WARN TaskSetManager: Stage 63 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:14:36 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:36 WARN TaskSetManager: Stage 64 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:14:45 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:45 WARN TaskSetManager: Stage 65 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:45 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:45 WARN TaskSetManager: Stage 66 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:45 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:45 WARN TaskSetManager: Stage 67 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:46 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:46 WARN TaskSetManager: Stage 68 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:46 WARN DAGScheduler: Broadcasting large task binary with size 4.1 Mi

22/12/10 19:14:51 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:51 WARN TaskSetManager: Stage 101 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:51 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:51 WARN TaskSetManager: Stage 102 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:52 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:52 WARN TaskSetManager: Stage 103 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:52 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:52 WARN TaskSetManager: Stage 104 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:14:52 WARN DAGScheduler: Broadcasting large task binary with size 4.

                                                                                

+----------------+-----+------------+
|              id|toxic|severe_toxic|
+----------------+-----+------------+
|805d0ddb7e79b173|  0.0|         0.0|
|80fb26ac41c0a3e0|  0.0|         0.0|
|81de7451eafefeb3|  0.0|         0.0|
|82c3fd5493e4873e|  1.0|         0.0|
|83a835e1c77f1ab8|  0.0|         0.0|
+----------------+-----+------------+
only showing top 5 rows

22/12/10 19:14:57 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:14:57 WARN TaskSetManager: Stage 125 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:15:01 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:01 WARN TaskSetManager: Stage 126 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:15:05 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:05 WARN TaskSetManager: Stage 127 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:05 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:05 WARN TaskSetManager: Stage 128 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:05 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:05 WARN TaskSetManager: Stage 129 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:05 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:05 WARN TaskSetManager: Stage 130 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:05 WARN DAGScheduler: Broadcasting large task binary with size 4.

22/12/10 19:15:11 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:11 WARN TaskSetManager: Stage 164 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:11 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:11 WARN TaskSetManager: Stage 165 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:11 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:11 WARN TaskSetManager: Stage 166 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:12 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:12 WARN TaskSetManager: Stage 167 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:12 WARN DAGScheduler: Broadcasting large task binary with size 4.

                                                                                

+----------------+-----+------------+-------+
|              id|toxic|severe_toxic|obscene|
+----------------+-----+------------+-------+
|805d0ddb7e79b173|  0.0|         0.0|    0.0|
|80fb26ac41c0a3e0|  0.0|         0.0|    0.0|
|81de7451eafefeb3|  0.0|         0.0|    0.0|
|82c3fd5493e4873e|  1.0|         0.0|    1.0|
|83a835e1c77f1ab8|  0.0|         0.0|    0.0|
+----------------+-----+------------+-------+
only showing top 5 rows

22/12/10 19:15:20 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:20 WARN TaskSetManager: Stage 197 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:15:24 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:24 WARN TaskSetManager: Stage 198 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:15:28 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:28 WARN TaskSetManager: Stage 199 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:29 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:29 WARN TaskSetManager: Stage 200 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:29 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:29 WARN TaskSetManager: Stage 201 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:29 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:29 WARN TaskSetManager: Stage 202 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:29 WARN DAGScheduler: Broadcasting large task binary with size 4.

22/12/10 19:15:35 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:35 WARN TaskSetManager: Stage 235 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:35 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:35 WARN TaskSetManager: Stage 236 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:36 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:36 WARN TaskSetManager: Stage 237 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:36 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:36 WARN TaskSetManager: Stage 238 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:36 WARN DAGScheduler: Broadcasting large task binary with size 4.

[Stage 240:>                (0 + 2) / 2][Stage 241:>                (0 + 2) / 2][Stage 240:>  (0 + 2) / 2][Stage 241:>  (0 + 2) / 2][Stage 242:>  (0 + 2) / 2]

22/12/10 19:15:37 WARN TaskSetManager: Stage 244 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+----------------+-----+------------+-------+------+
|              id|toxic|severe_toxic|obscene|threat|
+----------------+-----+------------+-------+------+
|805d0ddb7e79b173|  0.0|         0.0|    0.0|   0.0|
|80fb26ac41c0a3e0|  0.0|         0.0|    0.0|   0.0|
|81de7451eafefeb3|  0.0|         0.0|    0.0|   0.0|
|82c3fd5493e4873e|  1.0|         0.0|    1.0|   0.0|
|83a835e1c77f1ab8|  0.0|         0.0|    0.0|   0.0|
+----------------+-----+------------+-------+------+
only showing top 5 rows

22/12/10 19:15:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:44 WARN TaskSetManager: Stage 255 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:15:48 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:48 WARN TaskSetManager: Stage 256 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:15:56 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:56 WARN TaskSetManager: Stage 257 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:56 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:56 WARN TaskSetManager: Stage 258 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:57 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:57 WARN TaskSetManager: Stage 259 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:57 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:15:57 WARN TaskSetManager: Stage 260 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:15:57 WARN DAGScheduler: Broadcasting large task binary with size 4.

22/12/10 19:16:00 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:00 WARN TaskSetManager: Stage 293 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:00 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:00 WARN TaskSetManager: Stage 294 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:00 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:00 WARN TaskSetManager: Stage 295 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:00 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:00 WARN TaskSetManager: Stage 296 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:01 WARN DAGScheduler: Broadcasting large task binary with size 4.

[Stage 310:>                (0 + 2) / 2][Stage 311:>                (0 + 2) / 2][Stage 311:>  (0 + 2) / 2][Stage 312:>  (0 + 2) / 2][Stage 313:>  (0 + 2) / 2]

22/12/10 19:16:04 WARN TaskSetManager: Stage 314 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:08 WARN TaskSetManager: Stage 315 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+----------------+-----+------------+-------+------+------+
|              id|toxic|severe_toxic|obscene|threat|insult|
+----------------+-----+------------+-------+------+------+
|805d0ddb7e79b173|  0.0|         0.0|    0.0|   0.0|   0.0|
|80fb26ac41c0a3e0|  0.0|         0.0|    0.0|   0.0|   0.0|
|81de7451eafefeb3|  0.0|         0.0|    0.0|   0.0|   0.0|
|82c3fd5493e4873e|  1.0|         0.0|    1.0|   0.0|   1.0|
|83a835e1c77f1ab8|  0.0|         0.0|    0.0|   0.0|   0.0|
+----------------+-----+------------+-------+------+------+
only showing top 5 rows

22/12/10 19:16:10 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:10 WARN TaskSetManager: Stage 328 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:16:14 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:14 WARN TaskSetManager: Stage 329 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:16:18 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:18 WARN TaskSetManager: Stage 330 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:18 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:18 WARN TaskSetManager: Stage 331 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:18 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:18 WARN TaskSetManager: Stage 332 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:18 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:18 WARN TaskSetManager: Stage 333 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:18 WARN DAGScheduler: Broadcasting large task binary with size 4.

22/12/10 19:16:21 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:21 WARN TaskSetManager: Stage 367 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:21 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:21 WARN TaskSetManager: Stage 368 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:21 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:21 WARN TaskSetManager: Stage 369 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:21 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
22/12/10 19:16:21 WARN TaskSetManager: Stage 370 contains a task of very large size (25385 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:21 WARN DAGScheduler: Broadcasting large task binary with size 4.

[Stage 383:>  (0 + 2) / 2][Stage 384:>  (0 + 2) / 2][Stage 385:>  (0 + 2) / 2]

22/12/10 19:16:26 WARN TaskSetManager: Stage 387 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:26 WARN TaskSetManager: Stage 388 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+----------------+-----+------------+-------+------+------+-------------+
|              id|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+-----+------------+-------+------+------+-------------+
|805d0ddb7e79b173|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|80fb26ac41c0a3e0|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|81de7451eafefeb3|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|82c3fd5493e4873e|  1.0|         0.0|    1.0|   0.0|   1.0|          0.0|
|83a835e1c77f1ab8|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
+----------------+-----+------------+-------+------+------+-------------+
only showing top 5 rows



In [18]:
test_res_pd = test_res.toPandas()

22/12/10 19:16:36 WARN TaskSetManager: Stage 403 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:36 WARN DAGScheduler: Broadcasting large task binary with size 5.4 MiB
22/12/10 19:16:36 WARN DAGScheduler: Broadcasting large task binary with size 5.4 MiB
22/12/10 19:16:36 WARN TaskSetManager: Stage 404 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:37 WARN DAGScheduler: Broadcasting large task binary with size 5.4 MiB
22/12/10 19:16:37 WARN TaskSetManager: Stage 405 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:37 WARN TaskSetManager: Stage 406 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:16:37 WARN DAGScheduler: Broadcasting large task binary with size 5.4 MiB
22/12/10 19:16:37 WARN TaskSetManager: Stage 407 contains a task of very large size 

[Stage 404:>                (0 + 2) / 2][Stage 405:>                (0 + 2) / 2][Stage 404:>  (0 + 2) / 2][Stage 405:>  (0 + 2) / 2][Stage 406:>  (0 + 2) / 2]

22/12/10 19:16:41 WARN TaskSetManager: Stage 408 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.


[Stage 404:=> (1 + 1) / 2][Stage 405:>  (0 + 2) / 2][Stage 406:>  (0 + 2) / 2]

22/12/10 19:16:41 WARN TaskSetManager: Stage 409 contains a task of very large size (8263 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [28]:
test_res_pd = test_res_pd.sort_values(by=['id'])

In [27]:
test_res_real = df.iloc[percent:].sort_values(by=['id'])

In [43]:
test_res_pd

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
36775,00024b59235015f3,1.0,0.0,1.0,0.0,1.0,0.0
21280,000ccfe27151b36f,0.0,0.0,0.0,0.0,0.0,0.0
21595,001293279f17e968,0.0,0.0,0.0,0.0,0.0,0.0
24732,00144b2c8a4299dd,0.0,0.0,0.0,0.0,0.0,0.0
26475,001619e06b493525,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
35880,fff77cd19fb47eb0,0.0,0.0,0.0,0.0,0.0,0.0
21498,fff90f6920245ab8,0.0,0.0,0.0,0.0,0.0,0.0
25654,fffedeecd0364534,0.0,0.0,0.0,0.0,0.0,0.0
20169,ffff3a700c54e047,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
test_res_real

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
143608,00024b59235015f3,Virgin\nMy only warning? You'll block me? Well...,1,0,1,0,1,0
143609,000ccfe27151b36f,Someone else took care of this one.,0,0,0,0,0,0
143610,001293279f17e968,El Dorado's Fountain of Youth?\nIt was my impr...,0,0,0,0,0,0
143611,00144b2c8a4299dd,"""\n\nI did, actually. Even if you grant the fo...",0,0,0,0,0,0
143612,001619e06b493525,"WikiProject Paranormal \n\nHey there, just won...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
143603,fff77cd19fb47eb0,No personal attacks,0,0,0,0,0,0
143604,fff90f6920245ab8,How about edit the broken links of Incident at...,0,0,0,0,0,0
143605,fffedeecd0364534,to be driven away and die,1,0,0,0,0,0
143606,ffff3a700c54e047,"""\n\nI am glad I can amuse you. Perhaps your t...",0,0,0,0,0,0


In [36]:
f1_score(test_res_pd.toxic, test_res_real.toxic)

0.25897435897435894

In [38]:
f1_score(test_res_pd.severe_toxic, test_res_real.severe_toxic)

0.01878307535590946

In [39]:
f1_score(test_res_pd.obscene, test_res_real.obscene)

0.18507949449653485

In [41]:
f1_score(test_res_pd.threat, test_res_real.threat)

0.00570085512826924

In [42]:
f1_score(test_res_pd.identity_hate, test_res_real.identity_hate)

0.0437956204379562