In [2]:
# rsync the data locally
! gcloud storage rsync -r \
    gs://dsgt-clef-erisk-2024/task1/processed/data/count/v3/data \
    /mnt/data/erisk/task1/processed/data/count/v3/data

At gs://dsgt-clef-erisk-2024/task1/processed/data/count/v3/data/**, worker process 225984 thread 140166221711168 listed 501...
At file:///mnt/data/erisk/task1/processed/data/count/v3/data/**, worker process 225984 thread 140166221711168 listed 501...
  Completed files 0 | 0B                                                       


In [11]:
# root of the project's GCS bucket; prepended to remote dataset paths below
gcs_prefix = "gs://dsgt-clef-erisk-2024"
# root of the local data directory; prepended to local dataset paths below
local_prefix = "/mnt/data/erisk"

In [1]:
from erisk.utils import get_spark

# Create the Spark handle through the project helper. The option key contains
# dots, so it cannot be written as a keyword argument and is passed through
# dict unpacking instead.
spark = get_spark(
    memory="30g",
    **{
        # presumably forwarded to the Spark config to turn off the vectorized
        # parquet reader -- confirm against erisk.utils.get_spark
        "spark.sql.parquet.enableVectorizedReader": False,
    },
)

# Build the input path from the shared `local_prefix` (defined in the config
# cell) instead of repeating the absolute path, so this cell stays consistent
# with the cells that write and read the score tables.
df = spark.read.parquet(f"{local_prefix}/task1/processed/data/count/v3/data")

# Quick look at the data: schema, first rows, and the total row count.
df.printSchema()
df.show()
df.count()

24/04/01 05:25:30 WARN Utils: Your hostname, daphne-major resolves to a loopback address: 127.0.1.1; using 172.28.199.217 instead (on interface eth0)
24/04/01 05:25:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/01 05:25:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/04/01 05:25:31 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


root
 |-- docid: string (nullable = true)
 |-- text: string (nullable = true)
 |-- filename: string (nullable = true)
 |-- dataset: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- counttf: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- tfidf: array (nullable = true)
 |    |-- element: double (containsNull = true)

+------------+--------------------+-----------+-------+--------------------+--------------------+--------------------+--------------------+
|       docid|                text|   filename|dataset|               words|      filtered_words|             counttf|               tfidf|
+------------+--------------------+-----------+-------+--------------------+--------------------+--------------------+--------------------+
|   52118_1_2|Someone who doesn...|  s_53.trec|   test|[someone, who, do...

                                                                                

19806893

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 38280)
Traceback (most recent call last):
  File "/home/anthony/.pyenv/versions/3.11.6/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/home/anthony/.pyenv/versions/3.11.6/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/home/anthony/.pyenv/versions/3.11.6/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/home/anthony/.pyenv/versions/3.11.6/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/home/anthony/omscs/dsgt-clef/venv/lib/python3.11/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/home/anthony/omscs/dsgt-clef/venv/lib/python3.11/site-packages/pyspark/accumulators.py", line 

In [4]:
# how many distinct document ids does the dataset contain?
distinct_docids = df.select("docid").distinct()
distinct_docids.count()

                                                                                

19802904

In [5]:
# check the average length of the text and the tokens
from pyspark.sql import functions as F

# summary statistics (count / mean / stddev / min / max) for the character
# length of the text and the sizes of the words, filtered_words and counttf arrays
length_columns = [
    F.length("text"),
    F.size("words"),
    F.size("filtered_words"),
    F.size("counttf"),
]
df.select(*length_columns).describe().show()

24/04/01 04:23:50 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+------------------+------------------+--------------------+-------------+
|summary|      length(text)|       size(words)|size(filtered_words)|size(counttf)|
+-------+------------------+------------------+--------------------+-------------+
|  count|          19806893|          19806893|            19806893|     19806893|
|   mean| 229.3834227306625| 44.56283496861421|   22.29600740509882|      10000.0|
| stddev|172.28785433552613|28.756539502576018|  15.776512221755395|          0.0|
|    min|                 0|                 0|                   0|        10000|
|    max|             39999|              8000|                8000|        10000|
+-------+------------------+------------------+--------------------+-------------+



                                                                                

In [6]:
# number of rows whose text is longer than 10 characters
has_long_text = F.length("text") > 10
df.filter(has_long_text).count()

                                                                                

19802881

In [7]:
# rows that did not pass the `length(text) > 10` filter:
# total row count minus the filtered row count (both copied from the outputs above)
total_rows = 19806893
long_text_rows = 19802881
total_rows - long_text_rows

4012

In [2]:
# can we find documents that have an unusual amount of repeated words?
# the naive way would be to explode the count vectorizer, and find one where the mass of the words is concentrated in a few words
import numpy as np
import pandas as pd
from pyspark.sql import functions as F


def _topk_share(counts: pd.Series, k: int = 3):
    """Compute the smoothed top-k share for every count vector in a batch.

    Shared by both scoring UDFs so the arithmetic lives in one place.

    Parameters
    ----------
    counts : pd.Series
        One array-like of term counts per row; all rows must have the same
        length so they can be stacked into a 2-D array.
    k : int
        Number of largest entries per row to sum.

    Returns
    -------
    (share, total) : tuple of np.ndarray
        ``total`` is the row sum plus one, and ``share`` is
        ``(sum of the k largest entries + 1) / total``. The ``+ 1`` on both
        terms keeps the ratio defined for all-zero rows.
    """
    # np.stack raises on an empty sequence, so answer empty batches directly
    if len(counts) == 0:
        empty = np.empty(0, dtype=float)
        return empty, empty
    arr = np.stack(counts.values, axis=0)
    total = arr.sum(axis=1) + 1
    # ascending sort along each row; the last k columns are the k largest counts
    topk = np.sort(arr, axis=1)[:, -k:].sum(axis=1) + 1
    return topk / total, total


@F.pandas_udf("float")
def repeated_score(counts: pd.Series) -> pd.Series:
    """Share of each row's (smoothed) count mass held by its 3 largest entries."""
    share, _ = _topk_share(counts, k=3)
    return pd.Series(share)


@F.pandas_udf("float")
def repeated_score_modified(counts: pd.Series) -> pd.Series:
    """Top-3 share multiplied by the natural log of the smoothed total.

    The log factor scales the share by document size, so an all-zero row
    (total == 1) scores 0.
    """
    share, total = _topk_share(counts, k=3)
    return pd.Series(share * np.log(total))


# preview both scoring variants next to the untruncated text,
# plain ratio first, then the log-weighted version
for score_udf in (repeated_score, repeated_score_modified):
    preview = df.select(score_udf("counttf").alias("repeated_score"), "text")
    preview.show(truncate=False)

                                                                                

+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|repeated_score|text                                                                                                                                                                                                                                                                                    

In [9]:
# let's try the scores again, but using pure spark constructs
from functools import reduce

# Column expressions for the intermediate quantities, named up front so the
# pipeline below reads as a list of steps.
#   sorted_counttf : counttf sorted ascending, then reversed (largest first)
#   total          : sum of all counts, plus one
#   topk           : sum of the first three sorted counts, plus one
counts_desc = F.reverse(F.array_sort("counttf"))
total_expr = F.expr("AGGREGATE(sorted_counttf, 0D, (acc, x) -> acc + x)") + 1
top_items = [F.col("sorted_counttf").getItem(idx) for idx in range(3)]
topk_expr = reduce(lambda left, right: left + right, top_items) + 1

# score = topk / total * log(total), computed with Spark expressions only
scored = (
    df.withColumn("sorted_counttf", counts_desc)
    .withColumn("total", total_expr)
    .withColumn("topk", topk_expr)
    .withColumn("score", F.expr("topk / total * log(total)"))
    .select("docid", "score", "text")
)
scored.show()

+------------+------------------+--------------------+
|       docid|             score|                text|
+------------+------------------+--------------------+
|   52118_1_2|0.7942490404307401|Someone who doesn...|
| 249565_0_10|1.0949641200838114|\n\nDoes anyone h...|
| 431404_1_32|0.8179550128510825|Is this silent tr...|
|s_1463_688_2|0.9210340371976184| Now think about ...|
|s_1030_652_0| 1.111948656603036| Such a simplisti...|
|  277184_0_6|0.7518684318511113|\n\nNow, I do thi...|
|  377940_0_2|0.9012852309630962|I have a feeling ...|
|  535145_0_3|0.6816291773759021|we Regulate who c...|
|  390061_0_7| 0.954280848326263|But then I starte...|
|s_1697_168_5|               0.0|https://challonge...|
|s_1233_571_2|1.0397207708399179|The most used and...|
| 346571_0_60|0.9425204748625923|Mind you, this is...|
|s_978_1716_0|1.2875503299472804|We can find a lot...|
|  124489_0_6|0.8958797346140275|Thankfully though...|
|   18301_0_0|0.6437751649736402|So my gf and I ha...|
| 141331_4

In [27]:
# rank every (docid, score) pair from highest to lowest score and cache the result
sorted_scores = scored.select("docid", "score").orderBy(F.desc("score")).cache()

# attach the text of the 20 highest-scoring rows and display them
# NOTE(review): docid is not unique in df (row count exceeds the distinct docid
# count above), so this join may return more than 20 rows -- confirm acceptable
top_scores = sorted_scores.limit(20)
top_docs = (
    df.join(top_scores, "docid")
    .orderBy(F.desc("score"))
    .select("docid", "score", "text")
)
top_docs.show(truncate=200)

24/04/01 06:47:46 WARN CacheManager: Asked to cache already cached data.

+------------+-----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|       docid|            score|                                                                                                                                                                                                    text|
+------------+-----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|s_1721_303_1|8.987321812850125|fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fu...|
|   35552_0_2| 8.77855097543215|but you cant get more boners. \n

                                                                                

In [16]:
# persist the scores as parquet (32 partitions, replacing any previous run),
# then rebind sorted_scores to the copy read back from disk
scores_path = f"{local_prefix}/task1/processed/data/count_scores/v1"
sorted_scores.repartition(32).write.mode("overwrite").parquet(scores_path)
sorted_scores = spark.read.parquet(scores_path)

                                                                                

In [22]:
# dry run: `echo` only prints the upload command with the prefixes filled in
# (nothing is synced); remove `echo` to actually push the scores to the bucket
! echo gcloud storage rsync -r --delete-unmatched-destination-objects {local_prefix}/task1/processed/data/count_scores/v1 {gcs_prefix}/task1/processed/data/count_scores/v1

gcloud storage rsync -r --delete-unmatched-destination-objects /mnt/data/erisk/task1/processed/data/count_scores/v1 gs://dsgt-clef-erisk-2024/task1/processed/data/count_scores/v1


In [29]:
# Show up to 100 randomly sampled scored documents with their text.
# The sample is seeded so that re-running the cell shows the same rows;
# without a seed every execution draws a different sample.
(
    df.join(sorted_scores.sample(fraction=0.1, seed=42).limit(100), "docid")
    .orderBy(F.desc("score"))
    .select("docid", "score", "text")
).show(n=100, truncate=200)



+-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|        docid|             score|                                                                                                                                                                                                    text|
+-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| s_1721_301_0| 8.502891406705377|fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fu...|
|  503650_0_21| 7.983891468132409|I am willing to do wha

                                                                                

In [25]:
# score distribution: overall summary statistics first ...
sorted_scores.describe().show()

# ... then a histogram of scores rounded to the nearest integer
score_bucket = F.round("score", 0).alias("score_round")
sorted_scores.groupBy(score_bucket).count().orderBy("score_round").show()

                                                                                

+-------+--------+------------------+
|summary|   docid|             score|
+-------+--------+------------------+
|  count|19802903|          19806893|
|   mean|    NULL|0.8918288414078448|
| stddev|    NULL|0.2744943296143926|
|    min|   0_0_0|               0.0|
|    max|s_9_99_7| 8.987321812850125|
+-------+--------+------------------+





+-----------+--------+
|score_round|   count|
+-----------+--------+
|        0.0|  532047|
|        1.0|18929558|
|        2.0|  334028|
|        3.0|    7761|
|        4.0|    2959|
|        5.0|     352|
|        6.0|      67|
|        7.0|      64|
|        8.0|      53|
|        9.0|       4|
+-----------+--------+



                                                                                

In [30]:
# Show up to 50 randomly sampled documents from the high-score tail (score > 2).
# The sample is seeded so that re-running the cell shows the same rows;
# without a seed every execution draws a different sample.
(
    df.join(
        sorted_scores.where("score > 2").sample(fraction=0.1, seed=42).limit(50),
        "docid",
    )
    .orderBy(F.desc("score"))
    .select("docid", "score", "text")
).show(n=50, truncate=200)



+-------------+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|        docid|             score|                                                                                                                                                                                                                                                               text|
+-------------+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|  503650_0_20| 7.984909251348568|                                                           \n\nSo, Reddit, what c

                                                                                

In [32]:
# and what about filtering out stuff that suffered in the tokenization process?

# Same high-score sample (score > 2), now with the character length of the raw
# text next to the length of the space-joined tokens, to spot documents that
# changed size during tokenization.
# The sample is seeded so that re-running the cell shows the same rows;
# without a seed every execution draws a different sample.
(
    df.join(
        sorted_scores.where("score > 2").sample(fraction=0.1, seed=42).limit(50),
        "docid",
    )
    .orderBy(F.desc("score"))
    .select(
        "docid",
        "score",
        F.length("text").alias("text_length"),
        F.length(F.concat_ws(" ", "words")).alias("words_length"),
        "text",
    )
).show(n=50, truncate=200)



+-------------+------------------+-----------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|        docid|             score|text_length|words_length|                                                                                                                                                                                                    text|
+-------------+------------------+-----------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    35552_0_2|  8.77855097543215|      39969|       39969|but you cant get more boners. \n\nBONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONER BONE

                                                                                