In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset

# --- Configuration (values a reader may want to tune) ---
KAGGLE_TRAIN_CSV = "train.csv"               # path to the Kaggle QQP CSV
QUESTION_COLS = ["question1", "question2"]   # text columns that must both be present
VAL_FRACTION = 0.2                           # share of Kaggle rows held out for validation
SEED = 42                                    # makes the train/validation split repeatable
MODEL_NAME = "all-MiniLM-L6-v2"              # SBERT checkpoint; single source for load + log

# --- Load Kaggle dataset ---
# Drop rows where either question is missing so every remaining pair has text on both sides.
df_kaggle = pd.read_csv(KAGGLE_TRAIN_CSV).dropna(subset=QUESTION_COLS)
train_df_kaggle, val_df_kaggle = train_test_split(
    df_kaggle, test_size=VAL_FRACTION, random_state=SEED
)
y_val_kaggle = val_df_kaggle["is_duplicate"].values
print("Kaggle QQP loaded:", len(df_kaggle), "samples")

# --- Load Hugging Face GLUE QQP dataset ---
# Only the "validation" split is used here; it gets the same missing-question filter.
dataset = load_dataset("glue", "qqp")
df_hf = dataset["validation"].to_pandas().dropna(subset=QUESTION_COLS)
y_val_hf = df_hf["label"].values
print("Hugging Face GLUE QQP loaded:", len(df_hf), "samples")

# --- Load SBERT model ---
# The log line is derived from MODEL_NAME so it can never disagree with what was loaded.
model = SentenceTransformer(MODEL_NAME)
print("Model loaded:", MODEL_NAME)


Kaggle QQP loaded: 404287 samples
Hugging Face GLUE QQP loaded: 40430 samples
Model loaded: all-MiniLM-L6-v2


In [None]:
print("Encoding Kaggle QQP validation set...")
# Encode both question columns with identical settings; the tuple order guarantees
# question1 is encoded first and lands in emb1_kaggle, question2 in emb2_kaggle.
emb1_kaggle, emb2_kaggle = (
    model.encode(
        val_df_kaggle[column].tolist(),
        convert_to_tensor=True,
        show_progress_bar=True,
    )
    for column in ("question1", "question2")
)


Encoding Kaggle QQP validation set...


Batches: 100%|██████████| 2527/2527 [03:09<00:00, 13.36it/s]
Batches: 100%|██████████| 2527/2527 [03:10<00:00, 13.25it/s]


In [None]:
import torch
import numpy as np
from tqdm import tqdm
from sentence_transformers import util

# Function to compute cosine similarities in smaller batches
def batched_cosine_sim(emb1, emb2, batch_size=512):
    """Return the cosine similarity of each aligned row pair ``(emb1[i], emb2[i])``.

    Rows are scored ``batch_size`` pairs at a time. Each batch is L2-normalised
    row by row and the similarity is the row-wise dot product, so no
    batch-by-batch similarity matrix is ever materialised.

    Args:
        emb1: 2-D embeddings, one row per item (a torch tensor, or anything
            ``torch.as_tensor`` accepts, such as a NumPy array).
        emb2: Embeddings paired row-for-row with ``emb1``; must contain the
            same number of rows.
        batch_size: Number of row pairs scored per step; must be positive.

    Returns:
        A 1-D NumPy array holding one similarity per row pair. It is empty
        when the inputs contain no rows.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if the two inputs
            have a different number of rows (pairs would be misaligned).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    emb1 = torch.as_tensor(emb1)
    emb2 = torch.as_tensor(emb2)

    n_rows = len(emb1)
    if n_rows != len(emb2):
        raise ValueError(
            f"emb1 and emb2 must have the same number of rows, got {n_rows} and {len(emb2)}"
        )
    if n_rows == 0:
        # torch.cat refuses an empty list, so short-circuit with an empty result.
        return torch.empty(0).numpy()

    sims = []
    for start in tqdm(range(0, n_rows, batch_size), desc="Computing cosine similarities"):
        stop = start + batch_size
        # Unit-length rows turn the row-wise dot product into the cosine similarity.
        left = torch.nn.functional.normalize(emb1[start:stop], p=2, dim=1)
        right = torch.nn.functional.normalize(emb2[start:stop], p=2, dim=1)
        # Move each finished batch to the CPU so results do not accumulate on the device.
        sims.append((left * right).sum(dim=1).cpu())
    return torch.cat(sims).numpy()

# --- Kaggle evaluation ---
# Cut-off on cosine similarity: a pair scoring strictly above it is predicted duplicate (1).
threshold = 0.75  # can tune later

# One similarity score per validation pair, computed in batches of 512 rows.
cosine_scores_kaggle = batched_cosine_sim(emb1_kaggle, emb2_kaggle, batch_size=512)
preds_kaggle = np.greater(cosine_scores_kaggle, threshold).astype(int)

print("\n=== Kaggle QQP Results ===")
print("Accuracy:", accuracy_score(y_true=y_val_kaggle, y_pred=preds_kaggle))
print("F1:", f1_score(y_true=y_val_kaggle, y_pred=preds_kaggle))


Computing cosine similarities: 100%|██████████| 158/158 [00:00<00:00, 749.77it/s]


=== Kaggle QQP Results ===
Accuracy: 0.7713646145093869
F1: 0.7385037554634567





In [None]:
print("\nEncoding Hugging Face GLUE QQP validation set...")
# Encode both question columns with identical settings; the tuple order guarantees
# question1 is encoded first and lands in emb1_hf, question2 in emb2_hf.
emb1_hf, emb2_hf = (
    model.encode(
        df_hf[column].tolist(),
        convert_to_tensor=True,
        show_progress_bar=True,
    )
    for column in ("question1", "question2")
)

# Score each question pair with the same batched helper used for the Kaggle split.
cosine_scores_hf = batched_cosine_sim(emb1_hf, emb2_hf, batch_size=512)

# Reuse the existing `threshold`: scores strictly above it are predicted duplicate (1).
preds_hf = (cosine_scores_hf > threshold).astype(int)

print("\n=== Hugging Face GLUE QQP Results ===")
print("Accuracy:", accuracy_score(y_true=y_val_hf, y_pred=preds_hf))
print("F1:", f1_score(y_true=y_val_hf, y_pred=preds_hf))



Encoding Hugging Face GLUE QQP validation set...


Batches: 100%|██████████| 1264/1264 [01:35<00:00, 13.18it/s]
Batches: 100%|██████████| 1264/1264 [01:37<00:00, 13.01it/s]



=== Hugging Face GLUE QQP Results ===
Accuracy: 0.7708632203809053
F1: 0.736982567713361
