In [1]:
from erisk.utils import get_spark

spark = get_spark()

gcs_prefix = "gs://dsgt-clef-erisk-2024"
local_prefix = "/mnt/data/erisk"

res = spark.read.parquet(f"{gcs_prefix}/task1/processed/data/word2vec_relevant/v1")
res.groupBy("dataset").count().show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/01 20:36:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/04/01 20:36:45 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
                                                                                

+-------+-----+
|dataset|count|
+-------+-----+
|  train|16148|
|   test|59867|
+-------+-----+



In [2]:
res.printSchema()

root
 |-- docid: string (nullable = true)
 |-- dataset: string (nullable = true)
 |-- text: string (nullable = true)
 |-- word2vec: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [6]:
# try compressing the text to get entropy
from pyspark.sql import functions as F
import zlib


@F.udf("integer")
def compress_text_len(text):
    return len(zlib.compress(text.encode()))


stats = (
    res.withColumn("text_compressed_len", compress_text_len(F.col("text")))
    .withColumn("text_length", F.length(F.col("text")))
    .withColumn(
        "compression_ratio", F.col("text_compressed_len") / F.col("text_length")
    )
    .select("docid", "text_length", "text_compressed_len", "compression_ratio", "text")
).cache()

# high compression ratio is bad
stats.orderBy("compression_ratio").show()



+------------+-----------+-------------------+--------------------+--------------------+
|       docid|text_length|text_compressed_len|   compression_ratio|                text|
+------------+-----------+-------------------+--------------------+--------------------+
|s_1799_242_1|      30168|                184|0.006099177936886768| (_/ )_/ hail the...|
|  358876_0_0|       9779|                 94|0.009612434809285203|\nmeow meow meowm...|
|s_1799_241_0|       6046|                 84|0.013893483294740324| (/ )/ hail the b...|
|s_1799_240_0|       6046|                 84|0.013893483294740324| (/ )/ hail the b...|
|s_1799_239_0|       6066|                 86|0.014177382129904385| (/ )/ hail the b...|
|  122062_7_0|       9705|                144|0.014837712519319939|Best in the world...|
| s_3015_62_1|       1621|                 42|0.025909932140653916| **OH MY GOD OH M...|
|  119008_1_0|       1127|                 32| 0.02839396628216504|DESU DESU DESU DE...|
|  356713_1_0|       

                                                                                

In [7]:
stats.describe().show()

24/04/01 20:42:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 8:>                                                          (0 + 8) / 8]

+-------+----------+------------------+-------------------+--------------------+--------------------+
|summary|     docid|       text_length|text_compressed_len|   compression_ratio|                text|
+-------+----------+------------------+-------------------+--------------------+--------------------+
|  count|     76015|             76015|              76015|               76015|               76015|
|   mean|      NULL| 142.1566532921134| 107.21177399197526|  0.8966455729908611|                NULL|
| stddev|      NULL|249.55529307024085|  72.07795776050841| 0.21003221132928318|                NULL|
|    min|100013_1_0|                 5|                 12|0.006099177936886768|\tArabian Nights:...|
|    max|s_99_408_0|             30168|               4742|                 2.6|”I want to die, I...|
+-------+----------+------------------+-------------------+--------------------+--------------------+



                                                                                

In [23]:
stats.where("compression_ratio < 0.5").sample(1.0).show(truncate=False)

+-------------+-----------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
stats.groupBy(F.round("compression_ratio", 1).alias("ratio")).count().orderBy(
    "ratio"
).show()

+-----+-----+
|ratio|count|
+-----+-----+
|  0.0|   14|
|  0.1|   39|
|  0.2|  156|
|  0.3|   55|
|  0.4|  157|
|  0.5|  274|
|  0.6| 4002|
|  0.7|15299|
|  0.8|17307|
|  0.9|13033|
|  1.0| 8847|
|  1.1| 7066|
|  1.2| 4863|
|  1.3| 2588|
|  1.4| 1177|
|  1.5|  579|
|  1.6|  334|
|  1.7|  219|
|  1.8|    3|
|  1.9|    1|
+-----+-----+
only showing top 20 rows



In [24]:
spark.stop()