In [17]:
from longeval.spark import get_spark
from longeval.collection import ParquetCollection
from pyspark.sql import functions as F

# Open the 2023_01 English training split from the integration-test fixtures,
# using the Spark session handed out by the project helper.
root = "../../tests/integration"
spark = get_spark()
collection = ParquetCollection(spark, f"{root}/parquet/train/2023_01/English")

# Peek at the raw inputs: schema plus the first three rows of each frame.
collection.queries.printSchema()
collection.queries.show(3)

collection.qrels.printSchema()
collection.qrels.show(3)

# Collapse the positively judged qrels (rel > 0) into one row per query that
# carries the de-duplicated set of relevant document ids.
rel_docids_by_qid = (
    collection.qrels.where("rel > 0")
    .groupBy("qid")
    .agg(F.collect_set("docid").alias("rel_docids"))
)

# Default (inner) join: queries without any positive judgment drop out here.
relevant_queries = (
    collection.queries
    .join(rel_docids_by_qid, on="qid")
    .select("qid", "query", "rel_docids")
)

relevant_queries.printSchema()
relevant_queries.show(3)

25/03/15 19:43:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


root
 |-- qid: string (nullable = true)
 |-- query: string (nullable = true)

+--------+--------------------+
|     qid|               query|
+--------+--------------------+
| q012318|case over the border|
| q012396|      water atlantic|
|q0123180|blanquette de vea...|
+--------+--------------------+
only showing top 3 rows

root
 |-- qid: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- docid: string (nullable = true)
 |-- rel: integer (nullable = true)

+-------+----+---------------+---+
|    qid|rank|          docid|rel|
+-------+----+---------------+---+
|q012318|   0|doc012303114898|  0|
|q012318|   0|doc012307806130|  1|
|q012318|   0|doc012311314092|  0|
+-------+----+---------------+---+
only showing top 3 rows

root
 |-- qid: string (nullable = true)
 |-- query: string (nullable = true)
 |-- rel_docids: array (nullable = false)
 |    |-- element: string (containsNull = false)

+-----------------+-------------------+--------------------+
|              qid|   

In [16]:
# Collect the joined query / relevant-docids frame onto the driver as a pandas
# DataFrame so it can be iterated row by row when building search requests.
pdf = relevant_queries.toPandas()
# Use the notebook's rich display (as the later cells do) rather than print():
# print() falls back to the plain-text repr, which wraps wide frames across
# several column blocks and is harder to read.
display(pdf.head())

                 qid                 query  \
0  q0123103079215124   bill of sale vessel   
1  q0123103079215188      areches beaufort   
2  q0123103079215846        schengen space   
3  q0123103079215871          office chair   
4  q0123103079215891  fontainebleau cinema   

                                          rel_docids             index_name  
0  [doc012308214224, doc012311713704, doc01230130...  train-english-2023_01  
1  [doc012312405753, doc012304816793, doc01230430...  train-english-2023_01  
2  [doc012300218460, doc012303114703, doc01231200...  train-english-2023_01  
3  [doc012300401944, doc012300814609, doc01230430...  train-english-2023_01  
4  [doc012301119210, doc012303612581, doc01230890...  train-english-2023_01  


In [92]:
from opensearchpy import OpenSearch


def generate_bulk_query(df, index_name: str) -> list[dict]:
    """Build the flat header/body list for a multi-search request.

    Each row of ``df`` (walked with ``itertuples``) contributes two
    consecutive dicts: a header ``{"index": index_name}`` followed by a search
    body that runs a ``match`` query for ``row.query`` against the
    ``contents`` field with ``_source`` switched off.

    Args:
        df: Frame-like object exposing ``itertuples()`` whose rows have a
            ``query`` attribute.
        index_name: Value placed in every header's ``"index"`` entry.

    Returns:
        A list of ``2 * number_of_rows`` dicts in row order; empty when ``df``
        has no rows.
    """
    header_and_body_pairs = (
        (
            {"index": index_name},
            {
                "query": {"match": {"contents": {"query": record.query}}},
                "_source": False,
            },
        )
        for record in df.itertuples()
    )
    # Flatten [(header, body), ...] into [header, body, header, body, ...].
    return [part for pair in header_and_body_pairs for part in pair]


index_name = "train-english-2023_01"
client = OpenSearch("http://localhost:9200")

# Send every query in a single multi-search round trip.
msearch_body = generate_bulk_query(pdf, index_name)
results = client.msearch(msearch_body)

# Tag each sub-response with the qid of the query that produced it. The
# pairing is purely positional, so it relies on the responses coming back in
# the same order as the rows of `pdf`.
for qid, response in zip(pdf["qid"], results["responses"]):
    response["qid"] = qid

In [95]:
import pandas as pd

# Top-level envelope of the multi-search reply.
print(results.keys())
print(results["took"])

# One row per sub-response (including the qid tagged on after the search).
resp = pd.DataFrame(results["responses"])
display(resp.head())

# Expand the per-response shard statistics into their own columns.
shards = pd.DataFrame(list(resp["_shards"]))
display(shards.head())

# Same for the `hits` payload; qid is attached by index alignment with `pdf`.
hits = pd.DataFrame(list(resp["hits"])).assign(qid=pdf["qid"])
display(hits.head())

# First three hit entries returned for the first query.
display(hits["hits"].iloc[0][:3])

dict_keys(['took', 'responses'])
519


Unnamed: 0,took,timed_out,_shards,hits,status,qid
0,12,False,"{'total': 1, 'successful': 1, 'skipped': 0, 'f...","{'total': {'value': 1191, 'relation': 'eq'}, '...",200,q0123103079215124
1,8,False,"{'total': 1, 'successful': 1, 'skipped': 0, 'f...","{'total': {'value': 2, 'relation': 'eq'}, 'max...",200,q0123103079215188
2,16,False,"{'total': 1, 'successful': 1, 'skipped': 0, 'f...","{'total': {'value': 138, 'relation': 'eq'}, 'm...",200,q0123103079215846
3,6,False,"{'total': 1, 'successful': 1, 'skipped': 0, 'f...","{'total': {'value': 136, 'relation': 'eq'}, 'm...",200,q0123103079215871
4,4,False,"{'total': 1, 'successful': 1, 'skipped': 0, 'f...","{'total': {'value': 35, 'relation': 'eq'}, 'ma...",200,q0123103079215891


Unnamed: 0,total,successful,skipped,failed
0,1,1,0,0
1,1,1,0,0
2,1,1,0,0
3,1,1,0,0
4,1,1,0,0


Unnamed: 0,total,max_score,hits,qid
0,"{'value': 1191, 'relation': 'eq'}",10.457349,"[{'_index': 'train-english-2023_01', '_id': 'S...",q0123103079215124
1,"{'value': 2, 'relation': 'eq'}",7.458131,"[{'_index': 'train-english-2023_01', '_id': 'V...",q0123103079215188
2,"{'value': 138, 'relation': 'eq'}",5.751015,"[{'_index': 'train-english-2023_01', '_id': 'o...",q0123103079215846
3,"{'value': 136, 'relation': 'eq'}",9.848803,"[{'_index': 'train-english-2023_01', '_id': 'w...",q0123103079215871
4,"{'value': 35, 'relation': 'eq'}",8.403374,"[{'_index': 'train-english-2023_01', '_id': 'K...",q0123103079215891


[{'_index': 'train-english-2023_01',
  '_id': 'SLq_mJUB7FcVRN93gFpa',
  '_score': 10.457349},
 {'_index': 'train-english-2023_01',
  '_id': 'Nrq_mJUB7FcVRN93gVuG',
  '_score': 9.1460285},
 {'_index': 'train-english-2023_01',
  '_id': 'irq_mJUB7FcVRN93f1im',
  '_score': 8.8720665}]

In [96]:
# Let Spark infer the schema from the raw response dicts. As the printed
# schema and rows show, `hits` is inferred as map<string, map<string, long>>,
# so the nested hit list, `max_score` and `relation` all come out as NULL.
tmp = spark.createDataFrame(data=results["responses"])
tmp.printSchema()
tmp.select(F.col("hits")).show(n=5, truncate=100)

root
 |-- _shards: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)
 |-- hits: map (nullable = true)
 |    |-- key: string
 |    |-- value: map (valueContainsNull = true)
 |    |    |-- key: string
 |    |    |-- value: long (valueContainsNull = true)
 |-- qid: string (nullable = true)
 |-- status: long (nullable = true)
 |-- timed_out: boolean (nullable = true)
 |-- took: long (nullable = true)

+-----------------------------------------------------------------------------+
|                                                                         hits|
+-----------------------------------------------------------------------------+
|{hits -> NULL, total -> {value -> 1191, relation -> NULL}, max_score -> NULL}|
|   {hits -> NULL, total -> {value -> 2, relation -> NULL}, max_score -> NULL}|
| {hits -> NULL, total -> {value -> 138, relation -> NULL}, max_score -> NULL}|
| {hits -> NULL, total -> {value -> 136, relation -> NULL}, max_score -> NUL

In [99]:
# Explicit DDL schema for the parts of each response we care about. Declaring
# `hits` as a struct keeps the per-hit array and scores intact; response keys
# not listed here do not appear in the resulting frame.
schema = """
qid: string,
hits: struct<
    total: struct<value: long, relation: string>,
    max_score: double,
    hits: array<struct<_index: string, _id: string, _score: double>>
>
"""
resp = spark.createDataFrame(data=results["responses"], schema=schema)
# Promote the fields of the `hits` struct to top-level columns next to qid.
resp = resp.select("qid", "hits.*")

resp.printSchema()
resp.show()

root
 |-- qid: string (nullable = true)
 |-- total: struct (nullable = true)
 |    |-- value: long (nullable = true)
 |    |-- relation: string (nullable = true)
 |-- max_score: double (nullable = true)
 |-- hits: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _index: string (nullable = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- _score: double (nullable = true)

+-----------------+----------+----------+--------------------+
|              qid|     total| max_score|                hits|
+-----------------+----------+----------+--------------------+
|q0123103079215124|{1191, eq}| 10.457349|[{train-english-2...|
|q0123103079215188|   {2, eq}|  7.458131|[{train-english-2...|
|q0123103079215846| {138, eq}| 5.7510147|[{train-english-2...|
|q0123103079215871| {136, eq}|  9.848803|[{train-english-2...|
|q0123103079215891|  {35, eq}|  8.403374|[{train-english-2...|
|q0123103079215932| {360, eq}| 17.849213|[{train-english-2...|
|q012