In [1]:
# Reload edited modules automatically before each cell executes, so changes to
# imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## retrieval

In [2]:
from longeval.spark import get_spark
from pathlib import Path

# Spark session from the project helper, requested with 8 cores and 20g of memory.
spark = get_spark(cores=8, memory="20g")

# Parquet directory read below; the path suggests it holds the BM25 retrieval
# output. "~" is expanded to the current user's home directory.
data_root = Path("~/shared/longeval/2025/bm25/retrieval").expanduser()
retrieval = spark.read.parquet(data_root.as_posix())
# Print column names and types as a quick check of what was loaded.
retrieval.printSchema()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/26 04:08:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/26 04:08:29 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in standalone/kubernetes and LOCAL_DIRS in YARN).
25/05/26 04:08:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


root
 |-- qid: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- docid: string (nullable = true)
 |-- score: double (nullable = true)
 |-- date: string (nullable = true)
 |-- sample_id: integer (nullable = true)



In [3]:
# Total number of rows in the retrieval table (all dates and queries combined).
retrieval.count()

                                                                                

20976626

In [4]:
from pyspark.sql import functions as F

# One row per (date, qid): n = count of docid values retrieved for that query,
# k = count of distinct docid values. Cached because later cells reuse it.
df = retrieval.groupBy("date", "qid").agg(
    F.count(F.col("docid")).alias("n"),
    F.countDistinct(F.col("docid")).alias("k"),
)
df = df.cache()
df.show(10)



+-------+-----+---+---+
|   date|  qid|  n|  k|
+-------+-----+---+---+
|2022-08|19306|100|100|
|2022-08|11159|100|100|
|2022-08|13052|100|100|
|2022-08|13431|100|100|
|2022-08|21240|100|100|
|2022-08|18721|100|100|
|2022-08| 7362|100|100|
|2022-08| 9710|100|100|
|2022-08|12726|100|100|
|2022-08| 6942|100|100|
+-------+-----+---+---+
only showing top 10 rows


                                                                                

In [5]:
# How many queries contain duplicated retrieval rows? Turns out there aren't any.
# `df` was built with groupBy("date", "qid"), so it already has exactly one row
# per (date, qid); regrouping it by the same keys can never produce a count
# above 1 and would report 0 no matter what the data looks like.
# A duplicated query (or a docid returned twice for one query) instead shows up
# as n (retrieved rows) exceeding k (distinct docids), so compare those columns.
df.filter(F.col("n") > F.col("k")).count()

                                                                                

0

In [6]:
# Summary statistics of the per-(date, qid) counts; n and k are the columns of interest.
# qid and date are string columns, so their min/max are lexicographic
# (hence qid min "10" and max "9999") and the qid mean/stddev are not meaningful.
df.describe().show()

25/05/26 04:09:07 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+-------+------------------+------------------+------------------+
|summary|   date|               qid|                 n|                 k|
+-------+-------+------------------+------------------+------------------+
|  count| 213402|            213402|            213402|            213402|
|   mean|   NULL| 29197.23128180617|  98.2962952549648|  98.2962952549648|
| stddev|   NULL|21621.549424554232|12.094725117718163|12.094725117718163|
|    min|2022-06|                10|                 1|                 1|
|    max|2023-08|              9999|               100|               100|
+-------+-------+------------------+------------------+------------------+



                                                                                

In [7]:
# Per date: number of queries that got fewer than 100 distinct docids back
# (100 is the largest n/k value seen in the describe() summary).
(
    df.filter(F.col("k") < 100)
    .groupBy("date")
    .count()
    .orderBy("date")
    .show()
)

+-------+-----+
|   date|count|
+-------+-----+
|2022-06|  711|
|2022-07|  716|
|2022-08|  763|
|2022-09|  138|
|2022-10|  197|
|2022-11|  227|
|2022-12|  239|
|2023-01|  271|
|2023-02|  153|
|2023-03|  133|
|2023-04|  229|
|2023-05|  198|
|2023-06|  165|
|2023-07|  197|
|2023-08|  225|
+-------+-----+



In [8]:
# Per date: number of queries that got at least 100 distinct docids back
# (the complement of the k < 100 breakdown).
(
    df.filter(F.col("k") >= 100)
    .groupBy("date")
    .count()
    .orderBy("date")
    .show()
)

+-------+-----+
|   date|count|
+-------+-----+
|2022-06|23431|
|2022-07|24225|
|2022-08|26957|
|2022-09| 7604|
|2022-10|11891|
|2022-11|14650|
|2022-12|14991|
|2023-01|15606|
|2023-02| 7747|
|2023-03| 5474|
|2023-04|14217|
|2023-05|11350|
|2023-06| 8519|
|2023-07| 9563|
|2023-08|12615|
+-------+-----+



                                                                                

## evaluation

In [9]:
# NOTE: this rebinds `data_root`, which previously pointed at the retrieval
# directory; from this cell on it refers to the evaluation directory.
data_root = Path("~/shared/longeval/2025/bm25/evaluation").expanduser()
evaluation = spark.read.parquet(data_root.as_posix())
# Print column names and types as a quick check of what was loaded.
evaluation.printSchema()

root
 |-- map: double (nullable = true)
 |-- ndcg: double (nullable = true)
 |-- ndcg_cut_10: double (nullable = true)
 |-- ndcg_rel: double (nullable = true)
 |-- qid: string (nullable = true)
 |-- date: string (nullable = true)



In [12]:
# Summary statistics for every column of the evaluation table.
# NOTE(review): in the recorded output all four metric columns (map, ndcg,
# ndcg_cut_10, ndcg_rel) are exactly 0.0 for mean, stddev, min and max. The
# latest date is 2023-02 although the retrieval table runs to 2023-08, and there
# are 150517 rows here versus 213402 (date, qid) pairs in retrieval. This looks
# like an upstream problem (e.g. in how scores were computed or matched to
# relevance judgments) rather than a real result — TODO confirm before use.
# qid and date are strings, so their min/max are lexicographic.
evaluation.describe().show()



+-------+------+------+-----------+--------+------------------+-------+
|summary|   map|  ndcg|ndcg_cut_10|ndcg_rel|               qid|   date|
+-------+------+------+-----------+--------+------------------+-------+
|  count|150517|150517|     150517|  150517|            150517| 150517|
|   mean|   0.0|   0.0|        0.0|     0.0| 24729.62055448886|   NULL|
| stddev|   0.0|   0.0|        0.0|     0.0|18165.424782909155|   NULL|
|    min|   0.0|   0.0|        0.0|     0.0|                10|2022-06|
|    max|   0.0|   0.0|        0.0|     0.0|              9999|2023-02|
+-------+------+------+-----------+--------+------------------+-------+



                                                                                