In [1]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import ast

  from .autonotebook import tqdm as notebook_tqdm


Combine relevant files

In [None]:
data_preprocess_eda_dir = os.getcwd()
data_dir = os.path.abspath(os.path.join(data_preprocess_eda_dir, "..", "data"))

queries_file = os.path.join(data_dir, "antique_train_queries.csv")
qrels_file = os.path.join(data_dir, "antique_train_qrels.csv")
docs_file = os.path.join(data_dir, "antique_train_docs.csv")

queries_df = pd.read_csv(queries_file, dtype={"query_id": str})
qrels_df = pd.read_csv(qrels_file, dtype={"query_id": str, "doc_id": str})
docs_df = pd.read_csv(docs_file, dtype={"doc_id": str})

In [4]:
merged_df = qrels_df.merge(queries_df, on="query_id", how="left")

In [5]:
merged_df = merged_df.merge(docs_df, on="doc_id", how="left")

In [6]:
merged_df.rename(columns={"text_x": "query_text", "text_y": "doc_text"}, inplace=True)

In [None]:
merged_file = os.path.join(data_dir, "antique_train_merged.csv")
merged_df.to_csv(merged_file, index=False)

print(f"Merged dataset saved to {merged_file}")
display(merged_df.head())

Merged dataset saved to c:\Users\karishma\OneDrive\Projects\qCLEF\data\antique_train_merged.csv


Unnamed: 0,query_id,doc_id,relevance,iteration,query_text,doc_text
0,2531329,2531329_0,4,U0,Why do some men spit into the urinal before ur...,I do it all the time. It is kind of a ritual ...
1,2531329,2531329_5,4,Q0,Why do some men spit into the urinal before ur...,To clear out the mucus deep down in the throat...
2,2531329,2531329_4,3,Q0,Why do some men spit into the urinal before ur...,"maybe they want a target to hit. Well, I gues..."
3,2531329,2531329_7,3,Q0,Why do some men spit into the urinal before ur...,Where else would we spit?... Apart from sports...
4,2531329,2531329_6,3,Q0,Why do some men spit into the urinal before ur...,Because they have a cough or phlegm and hacked...


Examine data

In [9]:
merged_file_path = os.path.join(data_dir, "antique_train_merged.csv")

train_df = pd.read_csv(merged_file_path)
train_df.head()

Unnamed: 0,query_id,doc_id,relevance,iteration,query_text,doc_text
0,2531329,2531329_0,4,U0,Why do some men spit into the urinal before ur...,I do it all the time. It is kind of a ritual ...
1,2531329,2531329_5,4,Q0,Why do some men spit into the urinal before ur...,To clear out the mucus deep down in the throat...
2,2531329,2531329_4,3,Q0,Why do some men spit into the urinal before ur...,"maybe they want a target to hit. Well, I gues..."
3,2531329,2531329_7,3,Q0,Why do some men spit into the urinal before ur...,Where else would we spit?... Apart from sports...
4,2531329,2531329_6,3,Q0,Why do some men spit into the urinal before ur...,Because they have a cough or phlegm and hacked...


In [10]:
train_df['doc_text'][0]

'I do it all the time.  It is kind of a ritual for me, like you have when you are shooting free throws.  I guess it relaxes me and lets me get ready to take care of business.'

Encode data

In [13]:
output_csv = os.path.join(data_dir, "antique_train_with_embeddings.csv")

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def encode_texts(texts):
    return model.encode(texts, batch_size=32, convert_to_numpy=True, show_progress_bar=True)

query_embeddings = encode_texts(train_df["query_text"].tolist())
doc_embeddings = encode_texts(train_df["doc_text"].tolist())

train_df["query_embedding"] = query_embeddings.tolist()
train_df["doc_embedding"] = doc_embeddings.tolist()

train_df.to_csv(output_csv, index=False, float_format="%.18f")

print(f"Train DataFrame with embeddings saved to {output_csv}")
display(train_df.head())

Batches: 100%|██████████| 857/857 [00:07<00:00, 118.09it/s]
Batches: 100%|██████████| 857/857 [00:12<00:00, 66.77it/s] 


Train DataFrame with embeddings saved to c:\Users\karishma\OneDrive\Projects\qCLEF\data\antique_train_with_embeddings.csv


Unnamed: 0,query_id,doc_id,relevance,iteration,query_text,doc_text,query_embedding,doc_embedding
0,2531329,2531329_0,4,U0,Why do some men spit into the urinal before ur...,I do it all the time. It is kind of a ritual ...,"[0.06218868866562843, -0.015421810559928417, 0...","[0.0625365749001503, 0.04067419469356537, -0.0..."
1,2531329,2531329_5,4,Q0,Why do some men spit into the urinal before ur...,To clear out the mucus deep down in the throat...,"[0.06218868866562843, -0.015421810559928417, 0...","[0.08527150005102158, -0.03753829002380371, 0...."
2,2531329,2531329_4,3,Q0,Why do some men spit into the urinal before ur...,"maybe they want a target to hit. Well, I gues...","[0.06218868866562843, -0.015421810559928417, 0...","[0.13832318782806396, -0.0030896335374563932, ..."
3,2531329,2531329_7,3,Q0,Why do some men spit into the urinal before ur...,Where else would we spit?... Apart from sports...,"[0.06218868866562843, -0.015421810559928417, 0...","[0.0586501806974411, 0.019921783357858658, 0.0..."
4,2531329,2531329_6,3,Q0,Why do some men spit into the urinal before ur...,Because they have a cough or phlegm and hacked...,"[0.06218866631388664, -0.015421760268509388, 0...","[0.08216521143913269, 0.040984902530908585, 0...."
