# preprocessing

We use serverless Dataproc to handle the processing here.


In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from erisk.utils import get_spark

# Resources requested for this notebook's Spark session, kept in one place so
# they are easy to tune.
spark_resources = {"cores": 8, "memory": "28g"}

# get_spark is a project helper; the object it returns is used below as the
# Spark session (spark.read, spark.stop).
spark = get_spark(**spark_resources)
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/17 18:27:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/17 18:27:23 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
# Root of the GCS bucket; every dataset and model path below is built from it.
bucket = "gs://dsgt-clef-erisk-2024"

# Load the task 1 test split from parquet and preview the first rows.
test_path = f"{bucket}/task1/parquet/test"
test_df = spark.read.parquet(test_path)
test_df.show()

                                                                                

+--------+--------------------+--------------------+--------------------+---------------+--------+
|   DOCNO|                POST|                 PRE|                TEXT|_corrupt_record|filename|
+--------+--------------------+--------------------+--------------------+---------------+--------+
|   0_0_6|I'm trying to wor...|\n\nAlthough most...|I will not tell m...|           NULL|s_0.trec|
|456_1_12|You're not like '...|In general though...|Oh, and if you're...|           NULL|s_1.trec|
| 764_1_5|Maybe it's one of...|My past experienc...|But I still want ...|           NULL|s_1.trec|
|651_0_28|\n\nWe all run ba...|\n\nSo this woman...|I couldn't even i...|           NULL|s_1.trec|
| 268_1_3| Both were great,...|\n\nI've only had...|One a couple year...|           NULL|s_1.trec|
|364_0_12|I started opening...|Even though I too...|Which I can under...|           NULL|s_1.trec|
|765_0_33|Nowhere on my inv...|I ask about the t...|Words can not exp...|           NULL|s_1.trec|
|409_0_18|

In [6]:
# Show any rows that failed to parse. NULL must be tested with IS NOT NULL:
# in Spark SQL `col <> null` evaluates to NULL for every row, so the previous
# filter always returned an empty result regardless of the data.
test_df.where("_corrupt_record IS NOT NULL").show()

+-----+----+---+----+---------------+--------+
|DOCNO|POST|PRE|TEXT|_corrupt_record|filename|
+-----+----+---+----+---------------+--------+
+-----+----+---+----+---------------+--------+



In [7]:
# Number of test rows contributed by each source .trec file.
rows_per_file = test_df.groupby("filename").count()
rows_per_file.show()



+----------+-----+
|  filename|count|
+----------+-----+
|s_153.trec|27966|
|s_196.trec|26988|
|s_194.trec|28570|
|s_265.trec|26173|
|s_165.trec|28915|
|s_272.trec|29164|
| s_15.trec|29058|
|s_220.trec|27630|
|s_118.trec|27238|
|s_133.trec|26989|
|s_111.trec|26887|
|s_157.trec|26094|
|s_277.trec|26536|
|s_180.trec|26234|
|s_193.trec|28355|
|s_115.trec|30587|
|s_158.trec|27811|
|s_268.trec|28074|
|  s_1.trec|27087|
|s_280.trec|27301|
+----------+-----+
only showing top 20 rows



                                                                                

In [8]:
# Total number of rows in the test split.
test_row_count = test_df.count()
test_row_count

15542200

In [9]:
# Load the task 1 training split as well, preview it, and report its row count.
train_path = f"{bucket}/task1/parquet/train"
train_df = spark.read.parquet(train_path)
train_df.show()

train_row_count = train_df.count()
train_row_count

                                                                                

+--------+--------------------+--------+
|   DOCNO|                TEXT|filename|
+--------+--------------------+--------+
| s_0_0_0|    1.ye katiliyorum|s_0.trec|
| s_0_1_0|ok haklsn abi gol...|s_0.trec|
| s_0_2_0| almanca yarrak gibi|s_0.trec|
| s_0_3_0|hani u oyunlarn e...|s_0.trec|
| s_0_3_1|dead cellste ygda...|s_0.trec|
| s_0_3_2|bunlarn bir dili ...|s_0.trec|
| s_0_4_0|lnce diriltiyor s...|s_0.trec|
| s_0_6_0|       ziya gzel sal|s_0.trec|
| s_0_7_0|  artk dedem deilsin|s_0.trec|
| s_0_8_0|sorma bizim matem...|s_0.trec|
| s_0_9_0|240 Volt FUCKMAST...|s_0.trec|
|s_0_10_0|bunlar nerden evi...|s_0.trec|
|s_0_11_0|beynine gidecek k...|s_0.trec|
|s_0_12_0|semeyen vizyonsuz...|s_0.trec|
|s_0_13_0|       ok haklsn abi|s_0.trec|
|s_0_14_0|ilkokul zamanlari...|s_0.trec|
|s_0_15_0|iliki kurmakta zo...|s_0.trec|
|s_0_15_1|liseye giden bir ...|s_0.trec|
|s_0_15_2|tipimin ve kiilii...|s_0.trec|
|s_0_15_3|ben insanlarla ko...|s_0.trec|
+--------+--------------------+--------+
only showing top

4264693

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Word2Vec
from pyspark.sql import functions as F

# Feature dimensions. Only hashing_features is consumed by the pipeline below;
# word2vec_features is kept for the Word2Vec stage, which is currently disabled.
hashing_features = 256
word2vec_features = 256

# TF-IDF over the TEXT column:
#   TEXT -> words (Tokenizer) -> hashingtf (HashingTF) -> tfidf (IDF)
tokenizer = Tokenizer(inputCol="TEXT", outputCol="words")
hashingTF = HashingTF(
    numFeatures=hashing_features,
    inputCol=tokenizer.getOutputCol(),
    outputCol="hashingtf",
)
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tfidf")

# NOTE: Word2Vec is NOT part of the pipeline at the moment. To re-enable it, add
# a stage Word2Vec(vectorSize=word2vec_features, minCount=0,
# inputCol=tokenizer.getOutputCol(), outputCol="word2vec") to the list below.
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf])

# Trial run: fit on the training split only and preview the generated columns.
pipeline_model = pipeline.fit(train_df)
train_features_preview = pipeline_model.transform(train_df)
train_features_preview.show()



+--------+--------------------+--------+--------------------+--------------------+--------------------+
|   DOCNO|                TEXT|filename|               words|           hashingtf|               tfidf|
+--------+--------------------+--------+--------------------+--------------------+--------------------+
| s_0_0_0|    1.ye katiliyorum|s_0.trec| [1.ye, katiliyorum]|(256,[18,181],[1....|(256,[18,181],[3....|
| s_0_1_0|ok haklsn abi gol...|s_0.trec|[ok, haklsn, abi,...|(256,[53,77,118,1...|(256,[53,77,118,1...|
| s_0_2_0| almanca yarrak gibi|s_0.trec|[almanca, yarrak,...|(256,[78,108,148]...|(256,[78,108,148]...|
| s_0_3_0|hani u oyunlarn e...|s_0.trec|[hani, u, oyunlar...|(256,[41,47,50,71...|(256,[41,47,50,71...|
| s_0_3_1|dead cellste ygda...|s_0.trec|[dead, cellste, y...|(256,[2,11,47,53,...|(256,[2,11,47,53,...|
| s_0_3_2|bunlarn bir dili ...|s_0.trec|[bunlarn, bir, di...|(256,[8,26,44,83,...|(256,[8,26,44,83,...|
| s_0_4_0|lnce diriltiyor s...|s_0.trec|[lnce, diriltiyor...|(25

                                                                                

In [16]:
# Build a single dataframe holding the text of both splits, tagged with a
# `dataset` column ("train" / "test") so they can be separated again later.
#
# The test split stores each document as PRE / TEXT / POST segments. They are
# joined with a single space via concat_ws so that the last word of one segment
# and the first word of the next do not fuse into one token when the text is
# tokenized. concat_ws skips NULL segments, so no coalesce is needed and the
# result is never NULL (all-NULL input yields an empty string, as before).
#
# unionByName matches columns by name rather than position, so the union stays
# correct even if the select order of either side is changed later.

total_df = (
    train_df.select(
        "DOCNO",
        "TEXT",
        "filename",
        F.lit("train").alias("dataset"),
    )
    .unionByName(
        test_df.select(
            "DOCNO",
            F.concat_ws(
                " ",
                F.col("PRE"),
                F.col("TEXT"),
                F.col("POST"),
            ).alias("TEXT"),
            "filename",
            F.lit("test").alias("dataset"),
        ),
    )
    # Drop rows that cannot be attributed to a source file or have no text.
    .where("filename is not null")
    .where("TEXT is not null")
)
total_df.show()
total_df.count()

                                                                                

+--------+--------------------+--------+-------+
|   DOCNO|                TEXT|filename|dataset|
+--------+--------------------+--------+-------+
| s_0_0_0|    1.ye katiliyorum|s_0.trec|  train|
| s_0_1_0|ok haklsn abi gol...|s_0.trec|  train|
| s_0_2_0| almanca yarrak gibi|s_0.trec|  train|
| s_0_3_0|hani u oyunlarn e...|s_0.trec|  train|
| s_0_3_1|dead cellste ygda...|s_0.trec|  train|
| s_0_3_2|bunlarn bir dili ...|s_0.trec|  train|
| s_0_4_0|lnce diriltiyor s...|s_0.trec|  train|
| s_0_6_0|       ziya gzel sal|s_0.trec|  train|
| s_0_7_0|  artk dedem deilsin|s_0.trec|  train|
| s_0_8_0|sorma bizim matem...|s_0.trec|  train|
| s_0_9_0|240 Volt FUCKMAST...|s_0.trec|  train|
|s_0_10_0|bunlar nerden evi...|s_0.trec|  train|
|s_0_11_0|beynine gidecek k...|s_0.trec|  train|
|s_0_12_0|semeyen vizyonsuz...|s_0.trec|  train|
|s_0_13_0|       ok haklsn abi|s_0.trec|  train|
|s_0_14_0|ilkokul zamanlari...|s_0.trec|  train|
|s_0_15_0|iliki kurmakta zo...|s_0.trec|  train|
|s_0_15_1|liseye gid

                                                                                

19806893

In [17]:
from pyspark.ml.functions import vector_to_array

# Destinations for the fitted pipeline and for the feature table.
model_path = f"{bucket}/task1/models/pipeline_tfidf"
features_path = f"{bucket}/task1/parquet/combined_tfidf"

# Refit the pipeline on the combined train + test text and persist it,
# replacing any model previously saved at the same location.
pipeline_model = pipeline.fit(total_df)
pipeline_model.write().overwrite().save(model_path)
print("pipeline model saved")

# Apply the fitted pipeline, then convert both ML vector columns to plain
# arrays of doubles before writing.
res_df = pipeline_model.transform(total_df)
for vector_col in ("hashingtf", "tfidf"):
    res_df = res_df.withColumn(vector_col, vector_to_array(F.col(vector_col)))
res_df.printSchema()

# Persist the feature table, overwriting any previous run's output.
res_df.write.mode("overwrite").parquet(features_path)
print("combined tfidf saved")

                                                                                

root
 |-- DOCNO: string (nullable = true)
 |-- TEXT: string (nullable = true)
 |-- filename: string (nullable = true)
 |-- dataset: string (nullable = false)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hashingtf: array (nullable = false)
 |    |-- element: double (containsNull = false)
 |-- tfidf: array (nullable = false)
 |    |-- element: double (containsNull = false)



                                                                                

In [19]:
# Shell out to gcloud to list the size of every object written under the
# combined_tfidf parquet directory; the bucket path is interpolated from the
# notebook's `bucket` variable.
! gcloud storage du --readable-sizes {bucket}/task1/parquet/combined_tfidf

0B           gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/
0B           gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/_SUCCESS
1.04kiB      gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00000-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
472.95MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00002-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
299.32MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00006-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
409.05MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00008-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
405.82MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00009-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
408.61MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00010-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
406.09MiB    gs://dsgt-clef-erisk-2024

In [20]:
# Read the saved feature table back to verify the write. It is bound to a new
# name instead of overwriting `total_df`: the saved table also carries the
# words / hashingtf / tfidf columns, so rebinding `total_df` would make the
# pipeline-fitting cell above operate on a different schema if it were re-run.
combined_df = spark.read.parquet(f"{bucket}/task1/parquet/combined_tfidf")
combined_df.show()

[Stage 64:>                                                         (0 + 1) / 1]

+--------+--------------------+--------+-------+--------------------+--------------------+--------------------+
|   DOCNO|                TEXT|filename|dataset|               words|           hashingtf|               tfidf|
+--------+--------------------+--------+-------+--------------------+--------------------+--------------------+
| s_0_0_0|    1.ye katiliyorum|s_0.trec|  train| [1.ye, katiliyorum]|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|
| s_0_1_0|ok haklsn abi gol...|s_0.trec|  train|[ok, haklsn, abi,...|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|
| s_0_2_0| almanca yarrak gibi|s_0.trec|  train|[almanca, yarrak,...|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|
| s_0_3_0|hani u oyunlarn e...|s_0.trec|  train|[hani, u, oyunlar...|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|
| s_0_3_1|dead cellste ygda...|s_0.trec|  train|[dead, cellste, y...|[0.0, 0.0, 1.0, 0...|[0.0, 0.0, 1.1335...|
| s_0_3_2|bunlarn bir dili ...|s_0.trec|  train|[bunlarn, bir, di...|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0

                                                                                

In [21]:
# Shut down the Spark session now that the model and features have been written.
spark.stop()