In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: re-import modules automatically before each cell is executed.
%autoreload 2

In [2]:
from longeval.spark import get_spark
from longeval.collection import ParquetCollection
from pathlib import Path

# Resolve the parquet root under the user's home directory before starting
# Spark.
data_root = Path("~/shared/longeval/2025/parquet/").expanduser()

# Spark session sized for this notebook (8 cores, 20g memory), and the
# collection wrapper that exposes queries / qrels / documents as DataFrames.
spark = get_spark(cores=8, memory="20g")
train = ParquetCollection(spark, data_root)

# Print the schema of the query and relevance-judgement tables, in that order.
for frame_name in ("queries", "qrels"):
    getattr(train, frame_name).printSchema()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/22 11:11:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/22 11:11:38 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


root
 |-- qid: string (nullable = true)
 |-- query: string (nullable = true)

root
 |-- qid: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- docid: string (nullable = true)
 |-- rel: integer (nullable = true)
 |-- split: string (nullable = true)
 |-- language: string (nullable = true)
 |-- date: string (nullable = true)



In [3]:
# Schema of the document table.
train.documents.printSchema()

# Number of documents in each `date` value, listed in ascending date order.
docs_per_date = train.documents.groupBy("date").count()
docs_per_date.orderBy("date").show()

root
 |-- contents: string (nullable = true)
 |-- docid: string (nullable = true)
 |-- split: string (nullable = true)
 |-- language: string (nullable = true)
 |-- date: string (nullable = true)



                                                                                

+-------+-------+
|   date|  count|
+-------+-------+
|2022-06|1775681|
|2022-07|1777616|
|2022-08|1787018|
|2022-09|1210186|
|2022-10|2418103|
|2022-11|2433787|
|2022-12|2534242|
|2023-01|2537565|
|2023-02|2526382|
+-------+-------+



In [5]:
# Load the stored retrieval output. Per the schema printed below, each row
# carries a qid together with parallel `docids` / `scores` arrays, a `qrel`
# array and a `date`.
retrieval_root = Path("~/shared/longeval/2025/bm25/retrieval").expanduser()
retrieval = spark.read.parquet(retrieval_root.as_posix())

# Structure plus a small sample of rows.
retrieval.printSchema()
retrieval.show(3)

# Kept as the final expression so the notebook displays the row count.
retrieval.count()

root
 |-- qid: string (nullable = true)
 |-- total: long (nullable = true)
 |-- max_score: double (nullable = true)
 |-- docids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- scores: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- qrel: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- docid: string (nullable = true)
 |    |    |-- rel: integer (nullable = true)
 |-- date: string (nullable = true)

+-----+-----+------------------+--------------------+--------------------+--------------------+-------+
|  qid|total|         max_score|              docids|              scores|                qrel|   date|
+-----+-----+------------------+--------------------+--------------------+--------------------+-------+
|10006|  100| 9.093899726867676|[doc1688336, doc1...|[9.09389972686767...|[{2867767, 2}, {1...|2023-01|
| 1001|  100|10.286600112915039|[doc3356420, doc1...|[10.2866001129150...|[{1642418, 

                                                                                

92875

In [16]:
from pyspark.sql import functions as F

# Flatten the per-query ranked lists: pair each docid with its score
# position-wise and emit one row per (qid, date, docid, score). Only `docids`
# and `scores` are zipped; the `qrel` column is not carried along.
ranked_pairs = F.explode(F.arrays_zip("docids", "scores")).alias("hit")
hits = retrieval.select("qid", "date", ranked_pairs).select(
    "qid",
    "date",
    F.col("hit.docids").alias("docid"),
    F.col("hit.scores").alias("score"),
)

# Text to attach: document contents matched on (docid, date), query text
# matched on qid alone (the queries table has no date column).
doc_texts = train.documents.select("docid", "date", "contents")
query_texts = train.queries.select("qid", "query")

# NOTE(review): both joins are inner joins. The retrieval sample shows docids
# with a "doc" prefix while the qrel docids have none -- confirm that
# train.documents.docid uses the prefixed form, otherwise rows are silently
# dropped here. Also confirm qid is unique in train.queries, or rows will be
# duplicated.
joined = hits.join(doc_texts, on=["docid", "date"]).join(query_texts, on=["qid"])
joined.printSchema()

root
 |-- qid: string (nullable = true)
 |-- docid: string (nullable = true)
 |-- date: string (nullable = true)
 |-- score: double (nullable = true)
 |-- contents: string (nullable = true)
 |-- query: string (nullable = true)



In [None]:
# Persist the flattened (query, document, score) rows as parquet, one
# partition directory per `date`. Any existing output at this path is replaced.
joined_root = Path("~/shared/longeval/2025/bm25/retrieval_joined").expanduser()
joined.write.mode("overwrite").partitionBy("date").parquet(joined_root.as_posix())

