# preprocessing

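Let's add word2vec to our preprocessing pipeline: tokenize the `TEXT` column, train a 256-dimensional word2vec model on the combined train and test posts, and save both the fitted pipeline and the embedded dataset.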


In [1]:
# autoreload mode 2: re-import edited modules before each cell executes,
# so changes to imported project code are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [2]:
from erisk.utils import get_spark

# resources requested from the project's get_spark helper
spark_resources = {"cores": 8, "memory": "28g"}
spark = get_spark(**spark_resources)
# bare expression so the notebook renders the session summary
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/17 19:06:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/17 19:06:13 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
from pyspark.sql import functions as F

# tokenize the TEXT column and embed each document with word2vec
from pyspark.ml.feature import Tokenizer, Word2Vec
from pyspark.ml import Pipeline

bucket = "gs://dsgt-clef-erisk-2024"
test_df = spark.read.parquet(f"{bucket}/task1/parquet/test")
train_df = spark.read.parquet(f"{bucket}/task1/parquet/train")

word2vec_features = 256
tokenizer = Tokenizer(inputCol="TEXT", outputCol="words")
word2vec = Word2Vec(
    vectorSize=word2vec_features,
    minCount=0,
    inputCol=tokenizer.getOutputCol(),
    outputCol="word2vec",
)
pipeline = Pipeline(stages=[tokenizer, word2vec])

pipeline_model = pipeline.fit(train_df)
%time pipeline_model.transform(train_df).show()

[Stage 5:>                                                          (0 + 1) / 1]

In [None]:
# Stack train and test into one frame with the columns
# DOCNO, TEXT, filename, dataset ("train" / "test").

# For test rows, TEXT is the concatenation of PRE, TEXT and POST, with a
# null piece replaced by the empty string so the concat never yields null.
test_text = F.concat(
    *[F.coalesce(F.col(name), F.lit("")) for name in ("PRE", "TEXT", "POST")]
).alias("TEXT")

train_rows = train_df.select(
    "DOCNO",
    "TEXT",
    "filename",
    F.lit("train").alias("dataset"),
)
test_rows = test_df.select(
    "DOCNO",
    test_text,
    "filename",
    F.lit("test").alias("dataset"),
)

# union is positional, so both selects above must keep the same column order
total_df = (
    train_rows.union(test_rows)
    .where("filename is not null")
    .where("TEXT is not null")
)

                                                                                

+--------+--------------------+--------+-------+
|   DOCNO|                TEXT|filename|dataset|
+--------+--------------------+--------+-------+
| s_0_0_0|    1.ye katiliyorum|s_0.trec|  train|
| s_0_1_0|ok haklsn abi gol...|s_0.trec|  train|
| s_0_2_0| almanca yarrak gibi|s_0.trec|  train|
| s_0_3_0|hani u oyunlarn e...|s_0.trec|  train|
| s_0_3_1|dead cellste ygda...|s_0.trec|  train|
| s_0_3_2|bunlarn bir dili ...|s_0.trec|  train|
| s_0_4_0|lnce diriltiyor s...|s_0.trec|  train|
| s_0_6_0|       ziya gzel sal|s_0.trec|  train|
| s_0_7_0|  artk dedem deilsin|s_0.trec|  train|
| s_0_8_0|sorma bizim matem...|s_0.trec|  train|
| s_0_9_0|240 Volt FUCKMAST...|s_0.trec|  train|
|s_0_10_0|bunlar nerden evi...|s_0.trec|  train|
|s_0_11_0|beynine gidecek k...|s_0.trec|  train|
|s_0_12_0|semeyen vizyonsuz...|s_0.trec|  train|
|s_0_13_0|       ok haklsn abi|s_0.trec|  train|
|s_0_14_0|ilkokul zamanlari...|s_0.trec|  train|
|s_0_15_0|iliki kurmakta zo...|s_0.trec|  train|
|s_0_15_1|liseye gid

                                                                                

19806893

In [None]:
from pyspark.ml.functions import vector_to_array

model_path = f"{bucket}/task1/models/pipeline_word2vec"
combined_path = f"{bucket}/task1/parquet/combined_word2vec"

# refit the tokenizer + word2vec pipeline on the combined train/test frame
# and persist it, replacing any previously saved model at the same path
pipeline_model = pipeline.fit(total_df)
model_writer = pipeline_model.write().overwrite()
model_writer.save(model_path)
print("pipeline model saved")

# convert the ML vector column to a plain array of doubles before writing
embedded_df = pipeline_model.transform(total_df)
res_df = embedded_df.withColumn("word2vec", vector_to_array(F.col("word2vec")))
res_df.printSchema()

# write the embedded train + test rows, replacing any earlier output
res_df.write.mode("overwrite").parquet(combined_path)
print("combined word2vec saved")

                                                                                

root
 |-- DOCNO: string (nullable = true)
 |-- TEXT: string (nullable = true)
 |-- filename: string (nullable = true)
 |-- dataset: string (nullable = false)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hashingtf: array (nullable = false)
 |    |-- element: double (containsNull = false)
 |-- tfidf: array (nullable = false)
 |    |-- element: double (containsNull = false)



                                                                                

In [None]:
# report the storage used under the combined_word2vec output prefix,
# with sizes in human-readable units
! gcloud storage du --readable-sizes {bucket}/task1/parquet/combined_word2vec

0B           gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/
0B           gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/_SUCCESS
1.04kiB      gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00000-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
472.95MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00002-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
299.32MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00006-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
409.05MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00008-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
405.82MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00009-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
408.61MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf/part-00010-4ee3bfbe-fc38-48f5-a93a-1d50237f7fef-c000.snappy.parquet
406.09MiB    gs://dsgt-clef-erisk-2024

In [None]:
# sanity check: read the written dataset back and print the first rows.
# note: this rebinds total_df to the saved frame
combined_word2vec_path = f"{bucket}/task1/parquet/combined_word2vec"
total_df = spark.read.parquet(combined_word2vec_path)
total_df.show()

[Stage 64:>                                                         (0 + 1) / 1]

+--------+--------------------+--------+-------+--------------------+--------------------+--------------------+
|   DOCNO|                TEXT|filename|dataset|               words|           hashingtf|               tfidf|
+--------+--------------------+--------+-------+--------------------+--------------------+--------------------+
| s_0_0_0|    1.ye katiliyorum|s_0.trec|  train| [1.ye, katiliyorum]|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|
| s_0_1_0|ok haklsn abi gol...|s_0.trec|  train|[ok, haklsn, abi,...|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|
| s_0_2_0| almanca yarrak gibi|s_0.trec|  train|[almanca, yarrak,...|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|
| s_0_3_0|hani u oyunlarn e...|s_0.trec|  train|[hani, u, oyunlar...|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0, 0...|
| s_0_3_1|dead cellste ygda...|s_0.trec|  train|[dead, cellste, y...|[0.0, 0.0, 1.0, 0...|[0.0, 0.0, 1.1335...|
| s_0_3_2|bunlarn bir dili ...|s_0.trec|  train|[bunlarn, bir, di...|[0.0, 0.0, 0.0, 0...|[0.0, 0.0, 0.0

                                                                                

In [None]:
# shut down the Spark session now that the notebook's work is done
spark.stop()