In [1]:
# Reload imported modules before each cell executes, so edits to the project
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from longeval.spark import get_spark
from longeval.collection import ParquetCollection
from pathlib import Path

# Start (or reuse) the Spark session with 8 cores and 20g of memory.
spark = get_spark(cores=8, memory="20g")

# Open the parquet collection stored under the user's home directory.
data_root = Path("~/shared/longeval/2025/parquet/").expanduser()
train = ParquetCollection(spark, data_root)

# Print the schema of the queries first, then of the relevance judgments.
for frame in (train.queries, train.qrels):
    frame.printSchema()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/19 16:24:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/19 16:24:42 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


root
 |-- qid: string (nullable = true)
 |-- query: string (nullable = true)

root
 |-- qid: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- docid: string (nullable = true)
 |-- rel: integer (nullable = true)
 |-- split: string (nullable = true)
 |-- language: string (nullable = true)
 |-- date: string (nullable = true)



In [3]:
# Check how often each docid recurs in the collection: print the documents
# schema, then summarise the number of rows per docid.
documents = train.documents
documents.printSchema()

rows_per_docid = documents.groupBy("docid").count()
rows_per_docid.summary().show()

25/04/19 16:24:56 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


root
 |-- contents: string (nullable = true)
 |-- docid: string (nullable = true)
 |-- split: string (nullable = true)
 |-- language: string (nullable = true)
 |-- date: string (nullable = true)



                                                                                

+-------+---------+------------------+
|summary|    docid|             count|
+-------+---------+------------------+
|  count|  3436116|           3436116|
|   mean|     NULL| 5.529667799340884|
| stddev|     NULL|4.0605147611216585|
|    min|     doc1|                 1|
|    25%|     NULL|                 3|
|    50%|     NULL|                 5|
|    75%|     NULL|                 8|
|    max|doc999999|               311|
+-------+---------+------------------+



In [4]:
# Count document rows for each date value, ordered by date.
rows_per_date = train.documents.groupBy("date").count().orderBy("date")
rows_per_date.show()



+-------+-------+
|   date|  count|
+-------+-------+
|2022-06|1775681|
|2022-07|1777616|
|2022-08|1787018|
|2022-09|1210186|
|2022-10|2418103|
|2022-11|2433787|
|2022-12|2534242|
|2023-01|2537565|
|2023-02|2526382|
+-------+-------+



                                                                                

This analysis tells us that we should build a separate index for each date. Each document also appears at multiple dates: a docid occurs about 5.5 times on average across the collection.

There are other things that we might want to detect -- is a page English or is it French? This is from a French internet index, but I'm pretty sure most of the content on the internet is English anyway.

Regardless, everything is very slow right now because there are a lot of documents. Here we're still working with the full collection: roughly 19 million document rows across nine monthly snapshots.

In [5]:
from longeval.experiment.bm25.evaluation import (
    prepare_queries,
    run_search,
    score_search,
)

# Prepare the queries from the collection, then inspect the resulting schema
# and the first five rows.
queries = prepare_queries(train)
queries.printSchema()
queries.show(n=5)

  from .autonotebook import tqdm as notebook_tqdm


root
 |-- date: string (nullable = true)
 |-- qid: string (nullable = true)
 |-- query: string (nullable = true)
 |-- qrel: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- docid: string (nullable = true)
 |    |    |-- rel: integer (nullable = true)



                                                                                

+-------+----+--------------------+--------------------+
|   date| qid|               query|                qrel|
+-------+----+--------------------+--------------------+
|2022-09|   1|        101boyvideos|[{20007, 0}, {273...|
|2022-10|  10|    a vendre chateau|[{1592834, 0}, {1...|
|2022-08| 100|   appli pole emploi|[{7753, 0}, {3619...|
|2022-07|1000|ent mon bureau nu...|[{10529, 1}, {749...|
|2022-10|1000|ent mon bureau nu...|[{7496, 2}, {1052...|
+-------+----+--------------------+--------------------+
only showing top 5 rows



In [8]:
# Index location handed to run_search.
index_path = Path("~/scratch/longeval/temp/bm25/index").expanduser().as_posix()

# Evaluate only the earliest date present in the queries. A min aggregate
# ignores null dates and avoids a distinct + global sort just to read one value.
date = queries.agg({"date": "min"}).first()[0]
if date is None:
    # Fail with a clear message instead of an AttributeError on a missing row.
    raise ValueError("queries contains no dated rows to evaluate")

# Filter with a column expression rather than an interpolated SQL string, so
# the date value never needs quoting or escaping.
search_df = run_search(queries.where(queries["date"] == date), index_path, k=100)
eval_df = score_search(search_df)

# NOTE(review): the saved output of this cell shows a dict keyed by date, while
# the following cell calls eval_df.show(); re-run from a fresh kernel to confirm
# what score_search currently returns.
eval_df

25/04/19 16:57:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/04/19 16:57:19 WARN TaskSetManager: Stage 60 contains a task of very large size (2364 KiB). The maximum recommended task size is 1000 KiB.
25/04/19 16:57:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


{Row(date='2022-06'): DataFrame[map: double, ndcg: double, ndcg_cut_10: double, ndcg_rel: double, qid: string],
 '2022-06': DataFrame[map: double, ndcg: double, ndcg_cut_10: double, ndcg_rel: double, qid: string]}

In [9]:
# Display the first rows of the per-query metrics.
# NOTE(review): every metric in the saved output is 0.0. The qrel docids shown
# by queries.show() look numeric (e.g. 20007) while the documents summary shows
# ids like "doc1" -- possibly an id-format mismatch; confirm before trusting
# these scores.
eval_df.show()

+---+----+-----------+--------+-----+
|map|ndcg|ndcg_cut_10|ndcg_rel|  qid|
+---+----+-----------+--------+-----+
|0.0| 0.0|        0.0|     0.0|10010|
|0.0| 0.0|        0.0|     0.0|10012|
|0.0| 0.0|        0.0|     0.0|10038|
|0.0| 0.0|        0.0|     0.0|10077|
|0.0| 0.0|        0.0|     0.0| 1008|
|0.0| 0.0|        0.0|     0.0|10088|
|0.0| 0.0|        0.0|     0.0|10096|
|0.0| 0.0|        0.0|     0.0|10104|
|0.0| 0.0|        0.0|     0.0|10114|
|0.0| 0.0|        0.0|     0.0| 1012|
|0.0| 0.0|        0.0|     0.0|10124|
|0.0| 0.0|        0.0|     0.0|10133|
|0.0| 0.0|        0.0|     0.0|10138|
|0.0| 0.0|        0.0|     0.0|10177|
|0.0| 0.0|        0.0|     0.0|10179|
|0.0| 0.0|        0.0|     0.0|10191|
|0.0| 0.0|        0.0|     0.0|10195|
|0.0| 0.0|        0.0|     0.0|10203|
|0.0| 0.0|        0.0|     0.0| 1021|
|0.0| 0.0|        0.0|     0.0|10250|
+---+----+-----------+--------+-----+
only showing top 20 rows

