# preprocessing

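This notebook adds word2vec embeddings to our preprocessing pipeline: it tokenizes the `TEXT` column, fits a `Word2Vec` model (first on the train split as a sanity check, then on the combined train and test splits), and saves both the fitted pipeline and the embedded dataset to the project bucket.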


In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell runs; edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from erisk.utils import get_spark

# Create the Spark session through the project helper.
# NOTE(review): presumably `cores` / `memory` size the local driver (8 cores,
# 28 GB) -- confirm against erisk.utils.get_spark.
spark = get_spark(cores=8, memory="28g")
# Bare last expression: the notebook renders the session object as cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/17 19:06:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/17 19:06:13 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [6]:
from pyspark.sql import functions as F

# let's apply tf-idf to the text column, and also include word2vec
from pyspark.ml.feature import Tokenizer, Word2Vec
from pyspark.ml import Pipeline

bucket = "gs://dsgt-clef-erisk-2024"
test_df = spark.read.parquet(f"{bucket}/task1/parquet/test")
train_df = spark.read.parquet(f"{bucket}/task1/parquet/train")

word2vec_features = 64
tokenizer = Tokenizer(inputCol="TEXT", outputCol="words")
word2vec = Word2Vec(
    vectorSize=word2vec_features,
    numPartitions=8,
    inputCol=tokenizer.getOutputCol(),
    outputCol="word2vec",
)
pipeline = Pipeline(stages=[tokenizer, word2vec])

pipeline_model = pipeline.fit(train_df)
%time pipeline_model.transform(train_df.repartition(8)).show()



+-------------+--------------------+-----------+--------------------+--------------------+
|        DOCNO|                TEXT|   filename|               words|            word2vec|
+-------------+--------------------+-----------+--------------------+--------------------+
| s_1773_565_0| Yes the pot has ...|s_1773.trec|[, yes, the, pot,...|[-0.1233540048392...|
|s_1418_1347_1|Is the show faith...|s_1418.trec|[is, the, show, f...|[-0.1363150575530...|
| s_1447_631_0|The feeling is al...|s_1447.trec|[the, feeling, is...|[0.04741903394460...|
|s_1452_832_14|I didn't know abo...|s_1452.trec|[i, didn't, know,...|[0.08563407436013...|
|s_2302_231_12|This implies that...|s_2302.trec|[this, implies, t...|[-0.0261339815167...|
|  s_195_798_1|Too bad there isn...| s_195.trec|[too, bad, there,...|[0.00197446951642...|
| s_1419_351_0|Get $5 when you s...|s_1419.trec|[get, $5, when, y...|[-0.1175157446414...|
|s_1452_1763_2|I wish he was com...|s_1452.trec|[i, wish, he, was...|[-0.0113339025499...|

                                                                                

In [7]:
# Build a single dataframe holding the documents of both splits, tagged with
# the split each row came from.

# Train rows already expose their text in a single TEXT column.
train_docs = train_df.select(
    "DOCNO", "TEXT", "filename", F.lit("train").alias("dataset")
)

# Test rows spread their text over PRE / TEXT / POST; missing pieces are
# treated as empty strings and the three are concatenated into one TEXT column.
# NOTE(review): the pieces are joined with no separator, so words at the seams
# fuse unless the fields carry their own whitespace -- confirm against the data.
test_text = F.concat(
    *[F.coalesce(F.col(part), F.lit("")) for part in ("PRE", "TEXT", "POST")]
).alias("TEXT")
test_docs = test_df.select(
    "DOCNO", test_text, "filename", F.lit("test").alias("dataset")
)

# union() matches columns by position, so both selects list them in the same
# order. Rows without a filename or text are dropped before repartitioning.
total_df = (
    train_docs.union(test_docs)
    .where(F.col("filename").isNotNull())
    .where(F.col("TEXT").isNotNull())
    .repartition(16)
)

In [8]:
from pyspark.ml.functions import vector_to_array

# Destinations for the fitted pipeline and for the embedded dataset.
model_path = f"{bucket}/task1/models/pipeline_word2vec"
combined_path = f"{bucket}/task1/parquet/combined_word2vec"

# Refit the pipeline on the combined train + test frame and persist it,
# replacing any previously saved model at the same location.
pipeline_model = pipeline.fit(total_df)
model_writer = pipeline_model.write().overwrite()
model_writer.save(model_path)
print("pipeline model saved")

# Embed every document, then convert the ML vector column into a plain array
# of doubles so the parquet output does not depend on Spark's vector type.
embedded_df = pipeline_model.transform(total_df)
res_df = embedded_df.withColumn("word2vec", vector_to_array(F.col("word2vec")))
res_df.printSchema()

# Persist the embedded dataset, overwriting any earlier run.
res_df.write.mode("overwrite").parquet(combined_path)
print("combined word2vec saved")

24/03/17 20:56:41 WARN TaskSetManager: Stage 42 contains a task of very large size (41237 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

pipeline model saved
root
 |-- DOCNO: string (nullable = true)
 |-- TEXT: string (nullable = true)
 |-- filename: string (nullable = true)
 |-- dataset: string (nullable = false)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- word2vec: array (nullable = false)
 |    |-- element: double (containsNull = false)



                                                                                

combined word2vec saved


In [9]:
# Shell out to gcloud to list the size of each parquet part file just written;
# `{bucket}` is interpolated from the Python variable of the same name.
! gcloud storage du --readable-sizes {bucket}/task1/parquet/combined_word2vec

0B           gs://dsgt-clef-erisk-2024/task1/parquet/combined_word2vec/
0B           gs://dsgt-clef-erisk-2024/task1/parquet/combined_word2vec/_SUCCESS
993.76MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_word2vec/part-00000-9b14448d-8d9c-4882-a2a1-4bcdbd4d01de-c000.snappy.parquet
993.65MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_word2vec/part-00001-9b14448d-8d9c-4882-a2a1-4bcdbd4d01de-c000.snappy.parquet
993.82MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_word2vec/part-00002-9b14448d-8d9c-4882-a2a1-4bcdbd4d01de-c000.snappy.parquet
993.31MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_word2vec/part-00003-9b14448d-8d9c-4882-a2a1-4bcdbd4d01de-c000.snappy.parquet
993.74MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_word2vec/part-00004-9b14448d-8d9c-4882-a2a1-4bcdbd4d01de-c000.snappy.parquet
993.86MiB    gs://dsgt-clef-erisk-2024/task1/parquet/combined_word2vec/part-00005-9b14448d-8d9c-4882-a2a1-4bcdbd4d01de-c000.snappy.parquet
993.92MiB    g

In [10]:
# Read the saved dataset back to verify the write.
# It gets its own name instead of rebinding `total_df`: the saved frame
# already contains the `words` and `word2vec` columns, so reusing the name
# would make a re-run of the fit/transform cell operate on already-transformed
# data (the tokenizer rejects an input frame whose output column exists).
combined_df = spark.read.parquet(f"{bucket}/task1/parquet/combined_word2vec")
combined_df.show()

[Stage 49:>                                                         (0 + 1) / 1]

+-------------+--------------------+-----------+-------+--------------------+--------------------+
|        DOCNO|                TEXT|   filename|dataset|               words|            word2vec|
+-------------+--------------------+-----------+-------+--------------------+--------------------+
| s_1065_569_8|I did not die or ...|s_1065.trec|  train|[i, did, not, die...|[-0.0302185551686...|
|  s_1483_86_1|My wife left me 5...|s_1483.trec|  train|[my, wife, left, ...|[-0.0837481777582...|
| s_191_268_30|Either discovery ...| s_191.trec|  train|[either, discover...|[0.03546374198049...|
|  s_1785_23_1|Sorry I phrased i...|s_1785.trec|  train|[sorry, i, phrase...|[-0.1627588227391...|
| s_2587_200_0| You'll always be...|s_2587.trec|  train|[, you'll, always...|[-0.0674093064541...|
|s_2242_640_14|\n\nMy point in s...|s_2242.trec|  train|[, , my, point, i...|[-0.0760117996911...|
| s_2731_623_3|The robot, of cou...|s_2731.trec|  train|[the, robot,, of,...|[-0.1646860837936...|
|s_2544_10

                                                                                

In [11]:
# Shut down the Spark session now that all outputs have been written.
spark.stop()