In [1]:
%load_ext autoreload
%autoreload 2
from longeval.spark import get_spark

# Obtain the Spark session used by every cell below from the project helper,
# passing the resource settings as keyword arguments.
spark = get_spark(
    cores=8,
    memory="28g",
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/25 00:50:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/25 00:50:48 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [2]:
from longeval.collection import ParquetCollection
from pathlib import Path

# Root directory of the parquet export; "~" is expanded to the user's home.
data_root = Path("~/shared/longeval/2025/parquet/").expanduser()

# Open the collection stored in the "train" subdirectory and print the schema
# of its documents DataFrame.
train = ParquetCollection(spark, data_root.joinpath("train"))
train.documents.printSchema()

root
 |-- contents: string (nullable = true)
 |-- docid: string (nullable = true)
 |-- split: string (nullable = true)
 |-- language: string (nullable = true)
 |-- date: string (nullable = true)



In [3]:
# Open the collection stored in the "test" subdirectory and print the schema
# of its documents DataFrame (same columns as the train collection).
test = ParquetCollection(spark, data_root.joinpath("test"))
test.documents.printSchema()

root
 |-- contents: string (nullable = true)
 |-- docid: string (nullable = true)
 |-- split: string (nullable = true)
 |-- language: string (nullable = true)
 |-- date: string (nullable = true)



In [7]:
# Histogram of duplicated documents: for every (split, date), count how many
# docids occur exactly n times, keeping only n > 1.
from pyspark.sql import functions as F

(
    (train.documents.union(test.documents))
    # n = number of rows sharing one docid inside a single (split, date)
    .groupBy("split", "date", "docid")
    .agg(F.count("*").alias("n"))
    # Filter on the aliased count column instead of re-stating the aggregate
    # inside where(); this does not depend on Spark rewriting the predicate as
    # a HAVING clause and matches the F.col("n") filter used later on.
    .where(F.col("n") > 1)
    # freq = number of docids that have multiplicity n
    .groupBy("split", "date", "n")
    .agg(F.count("*").alias("freq"))
    .orderBy("split", "date", "n")
).show()



+-----+-------+---+------+
|split|   date|  n|  freq|
+-----+-------+---+------+
| test|2023-03|  2|212342|
| test|2023-03|  3| 70942|
| test|2023-03|  4| 25630|
| test|2023-03|  5|  8785|
| test|2023-03|  6|  2838|
| test|2023-03|  7|   725|
| test|2023-03|  8|   223|
| test|2023-03|  9|    75|
| test|2023-03| 10|    26|
| test|2023-03| 11|     5|
| test|2023-03| 12|     1|
| test|2023-03| 13|     1|
| test|2023-03| 15|     1|
| test|2023-03| 28|     1|
| test|2023-04|  2|206784|
| test|2023-04|  3| 67915|
| test|2023-04|  4| 24136|
| test|2023-04|  5|  8031|
| test|2023-04|  6|  2493|
| test|2023-04|  7|   668|
+-----+-------+---+------+
only showing top 20 rows



                                                                                

In [17]:
# How common is each multiplicity n (rows per docid within one split/date)?
# For every n, report the average and standard deviation of the per-date
# frequencies, with train and test side by side. Rows with n == 1 are docids
# that are not duplicated.
doc_multiplicity = (
    train.documents.union(test.documents)
    .groupBy("split", "date", "docid")
    .agg(F.count("*").alias("n"))
)

# One row per (split, date, n): how many docids have that multiplicity.
multiplicity_freq = doc_multiplicity.groupBy("split", "date", "n").agg(
    F.count("*").alias("freq")
)

# Pivot on split so each n becomes a single row; the statistics are cast to
# integers to keep the printed table compact.
(
    multiplicity_freq.groupBy("n")
    .pivot("split", ["train", "test"])
    .agg(
        F.avg("freq").cast("integer").alias("avg_freq"),
        F.std("freq").cast("integer").alias("std_freq"),
    )
    .orderBy("n")
).show()



+---+--------------+--------------+-------------+-------------+
|  n|train_avg_freq|train_std_freq|test_avg_freq|test_std_freq|
+---+--------------+--------------+-------------+-------------+
|  1|       1592567|        297225|      1643971|       181625|
|  2|        158678|         51089|       198806|        30419|
|  3|         40312|         23807|        64502|        14372|
|  4|         12405|         10174|        22669|         6487|
|  5|          4002|          3694|         7577|         2638|
|  6|          1200|          1234|         2382|          917|
|  7|           300|           321|          616|          257|
|  8|            94|            98|          185|           79|
|  9|            31|            34|           63|           27|
| 10|            18|             5|           21|            9|
| 11|             3|             2|            5|            0|
| 12|             1|             0|            1|            0|
| 13|             1|             0|     

                                                                                

In [14]:
# Pick a single docid that occurs more than 3 times in train (the first one in
# docid order) and print every row for it so the copies can be compared.
# NOTE(review): the count here is per docid only, not per (split, date) as in
# the earlier cells, so rows from different dates are counted together —
# confirm this is intended.
duplicate = (
    train.documents
    .groupBy("docid")
    .agg(F.count("*").alias("n"))
    .filter(F.col("n") > 3)
    .sort("docid")
    .limit(1)
)
train.documents.join(duplicate, on="docid").show(truncate=100, vertical=True)



-RECORD 0--------------------------------------------------------------------------------------------------------
 docid    | doc10                                                                                                
 contents | WWW.SAURCLIENT.FR ESPACE CLIENT WWW.SAURCLIENT.FR ESPACE CLIENT WWW.SAURCLIENT.FR ESPACE CLIENT L... 
 split    | train                                                                                                
 language | French                                                                                               
 date     | 2023-02                                                                                              
 n        | 9                                                                                                    
-RECORD 1--------------------------------------------------------------------------------------------------------
 docid    | doc10                                                                       

                                                                                

In [None]:
# Are the copies the same document? Every row has the same content length,
# which suggests so (equal length alone does not prove identical contents).
(
    train.documents
    .join(duplicate, on="docid")
    .select(F.length("contents"))
).show()



+----------------+
|length(contents)|
+----------------+
|            2477|
|            2477|
|            2477|
|            2477|
|            2477|
|            2477|
|            2477|
|            2477|
|            2477|
+----------------+



                                                                                