In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import os
from ast import literal_eval

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locate the data folder relative to the current working directory
# (the "data" directory is expected to sit one level above it).
data_preprocess_eda_dir = os.getcwd()
data_dir = os.path.abspath(os.path.join(data_preprocess_eda_dir, "..", "data"))

queries_file = os.path.join(data_dir, "antique_test_queries.csv")
qrels_file = os.path.join(data_dir, "antique_test_qrels.csv")
docs_file = os.path.join(data_dir, "antique_test_docs.csv")

# IDs are forced to str so pandas does not re-infer them as numbers.
queries_df = pd.read_csv(queries_file, dtype={"query_id": str})
qrels_df = pd.read_csv(qrels_file, dtype={"query_id": str, "doc_id": str})
docs_df = pd.read_csv(docs_file, dtype={"doc_id": str})

# Give each "text" column its final name *before* joining instead of relying
# on pandas' automatic "_x"/"_y" collision suffixes and an in-place rename
# afterwards. `rename` returns a copy, so queries_df / docs_df are untouched.
# Both joins are left joins on the qrels rows: a qrel whose query or document
# is absent from the lookup tables is kept with NaN text.
merged_df = (
    qrels_df
    .merge(queries_df.rename(columns={"text": "query_text"}), on="query_id", how="left")
    .merge(docs_df.rename(columns={"text": "doc_text"}), on="doc_id", how="left")
)

merged_file = os.path.join(data_dir, "antique_test_merged.csv")
merged_df.to_csv(merged_file, index=False)

print(f"Merged test dataset saved to {merged_file}")

Merged test dataset saved to c:\Users\karishma\OneDrive\Projects\qCLEF\data\antique_test_merged.csv


In [3]:
# Preview the first rows of the merged qrels/queries/docs frame.
merged_df.head()

Unnamed: 0,query_id,doc_id,relevance,iteration,query_text,doc_text
0,1964316,1964316_5,4,U0,"What do you mean by ""weed""?",Weed could mean the bad thing that grow in ur ...
1,1964316,1674088_11,1,Q0,"What do you mean by ""weed""?",sell weed
2,1964316,1218838_13,2,Q0,"What do you mean by ""weed""?",My weed!!
3,1964316,1519022_15,2,Q0,"What do you mean by ""weed""?",because we dont know what the hell to make leg...
4,1964316,3059341_5,2,Q0,"What do you mean by ""weed""?",Its a weed.


In [4]:
# Data folder resolved relative to the current working directory.
data_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "data"))
output_csv = os.path.join(data_dir, "antique_test_with_embeddings.csv")

# Read the IDs back as strings, matching how the source CSVs are loaded
# earlier in this notebook; without an explicit dtype pandas would re-infer
# an all-digit ID column as integers.
test_df = pd.read_csv(
    os.path.join(data_dir, "antique_test_merged.csv"),
    dtype={"query_id": str, "doc_id": str},
)

# Sentence-embedding model used for both queries and documents.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def encode_texts(texts, batch_size=32):
    """Embed a list of texts with the notebook-level ``model``.

    Parameters
    ----------
    texts : list of str
        Texts to embed.
    batch_size : int, default 32
        Forwarded to ``model.encode`` as its ``batch_size`` argument.

    Returns
    -------
    numpy.ndarray
        The array returned by ``model.encode`` with ``convert_to_numpy=True``
        (presumably one embedding row per input text -- TODO confirm).
    """
    return model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=True,
    )

# Text cells that are empty in the CSV are read back as float NaN, which is
# not valid text input for the encoder. Replace them with empty strings and
# say so, rather than passing NaN through unnoticed.
n_missing = int(test_df[["query_text", "doc_text"]].isna().sum().sum())
if n_missing:
    print(f"Warning: {n_missing} missing text value(s) will be embedded as empty strings")

query_texts = test_df["query_text"].fillna("").tolist()
doc_texts = test_df["doc_text"].fillna("").tolist()

query_embeddings = encode_texts(query_texts)
doc_embeddings = encode_texts(doc_texts)

# Store each embedding as a plain Python list in a single DataFrame cell.
test_df["query_embedding"] = query_embeddings.tolist()
test_df["doc_embedding"] = doc_embeddings.tolist()

# NOTE(review): float_format applies to float-dtype columns; the embedding
# columns hold Python lists (object dtype) and are written via their string
# form, so this setting presumably does not affect them -- confirm.
test_df.to_csv(output_csv, index=False, float_format="%.18f")

print(f"Test DataFrame with embeddings saved to {output_csv}")

Batches: 100%|██████████| 206/206 [00:01<00:00, 129.30it/s]
Batches: 100%|██████████| 206/206 [00:03<00:00, 66.88it/s] 


Test DataFrame with embeddings saved to c:\Users\karishma\OneDrive\Projects\qCLEF\data\antique_test_with_embeddings.csv


In [5]:
# Preview the first rows, now including the two embedding columns.
test_df.head()

Unnamed: 0,query_id,doc_id,relevance,iteration,query_text,doc_text,query_embedding,doc_embedding
0,1964316,1964316_5,4,U0,"What do you mean by ""weed""?",Weed could mean the bad thing that grow in ur ...,"[0.03988630697131157, 0.007177216000854969, -0...","[0.031502485275268555, 0.021122073754668236, -..."
1,1964316,1674088_11,1,Q0,"What do you mean by ""weed""?",sell weed,"[0.03988630697131157, 0.007177216000854969, -0...","[0.05643288791179657, 0.04761527478694916, -0...."
2,1964316,1218838_13,2,Q0,"What do you mean by ""weed""?",My weed!!,"[0.03988630697131157, 0.007177216000854969, -0...","[-0.02230476588010788, 0.05732327699661255, -0..."
3,1964316,1519022_15,2,Q0,"What do you mean by ""weed""?",because we dont know what the hell to make leg...,"[0.03988630697131157, 0.007177216000854969, -0...","[0.09742356091737747, -0.10034129023551941, 0...."
4,1964316,3059341_5,2,Q0,"What do you mean by ""weed""?",Its a weed.,"[0.03988630697131157, 0.007177216000854969, -0...","[0.03170603886246681, 0.03499998152256012, -0...."
