In [1]:
%load_ext autoreload
%autoreload 2

## retrieval

In [3]:
from longeval.spark import get_spark
from pathlib import Path

spark = get_spark(cores=8, memory="20g")

data_root = Path("~/shared/longeval/2025/bm25/retrieval").expanduser()
retrieval = spark.read.parquet(data_root.as_posix())
retrieval.printSchema()

25/05/25 22:51:44 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


root
 |-- qid: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- docid: string (nullable = true)
 |-- score: double (nullable = true)
 |-- date: string (nullable = true)



In [4]:
retrieval.count()

                                                                                

22483047

In [20]:
from pyspark.sql import functions as F

df = (
    retrieval.groupBy("date", "qid")
    .agg(F.count("docid").alias("n"), F.countDistinct("docid").alias("k"))
    .cache()
)
df.show(10)



+-------+-----+---+---+
|   date|  qid|  n|  k|
+-------+-----+---+---+
|2022-08|22805|100|100|
|2022-08|23673|100|100|
|2022-08|24139|100|100|
|2022-08|16216|100|100|
|2022-08|17195|100|100|
|2022-08| 8735|100|100|
|2022-08| 9710|100|100|
|2022-07|27129|100|100|
|2022-08|33618|100|100|
|2022-08|33673|100|100|
+-------+-----+---+---+
only showing top 10 rows


                                                                                

In [27]:
# how many duplicate queries? turns out there aren't any
df.groupBy("date", "qid").count().filter(F.col("count") > 1).count()

0

In [28]:
df.describe().show()

+-------+-------+------------------+------------------+------------------+
|summary|   date|               qid|                 n|                 k|
+-------+-------+------------------+------------------+------------------+
|  count| 213402|            213402|            213402|            213402|
|   mean|   NULL| 29197.23128180617|105.35537155228161| 95.17458130664193|
| stddev|   NULL|21621.549424554232| 28.65202909187452|13.964482794544134|
|    min|2022-06|                10|                 1|                 1|
|    max|2023-08|              9999|               200|               100|
+-------+-------+------------------+------------------+------------------+



In [29]:
df.where("k < 100").groupBy("date").count().orderBy("date").show()

+-------+-----+
|   date|count|
+-------+-----+
|2022-06|  711|
|2022-07|  716|
|2022-08|  763|
|2022-09|  138|
|2022-10|11942|
|2022-11|14661|
|2022-12|  239|
|2023-01|15819|
|2023-02|  153|
|2023-03|  133|
|2023-04|  229|
|2023-05|  198|
|2023-06|  165|
|2023-07|  197|
|2023-08|  225|
+-------+-----+



In [16]:
df.where("k >= 100").groupBy("date").count().orderBy("date").show()

+-------+-----+
|   date|count|
+-------+-----+
|2022-06|23431|
|2022-07|24225|
|2022-08|26957|
|2022-09| 7604|
|2022-10|  146|
|2022-11|  216|
|2022-12|14991|
|2023-01|   58|
|2023-02| 7747|
|2023-03| 5474|
|2023-04|14217|
|2023-05|11350|
|2023-06| 8519|
|2023-07| 9563|
|2023-08|12615|
+-------+-----+



## evaluation

In [31]:
data_root = Path("~/shared/longeval/2025/bm25/evaluation").expanduser()
evaluation = spark.read.parquet(data_root.as_posix())
evaluation.printSchema()

root
 |-- map: double (nullable = true)
 |-- ndcg: double (nullable = true)
 |-- ndcg_cut_10: double (nullable = true)
 |-- ndcg_rel: double (nullable = true)
 |-- qid: string (nullable = true)
 |-- date: string (nullable = true)



In [36]:
evaluation.where("map > 0").show()

+---+----+-----------+--------+---+----+
|map|ndcg|ndcg_cut_10|ndcg_rel|qid|date|
+---+----+-----------+--------+---+----+
+---+----+-----------+--------+---+----+

