In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Kaggle QQP: read the CSV, then discard rows where either question is missing.
df_kaggle = pd.read_csv("train.csv")
df_kaggle = df_kaggle.dropna(subset=["question1", "question2"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_df_kaggle, val_df_kaggle = train_test_split(
    df_kaggle,
    test_size=0.2,
    random_state=42,
)
print("Kaggle QQP loaded:", len(df_kaggle), "samples")


Kaggle QQP loaded: 404287 samples


In [13]:
from datasets import load_dataset

# GLUE "qqp" configuration from Hugging Face; only its "train" split is used here.
dataset = load_dataset("glue", "qqp")
df_hf = dataset["train"].to_pandas()
# Discard rows where either question is missing.
df_hf = df_hf.dropna(subset=["question1", "question2"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_df_hf, val_df_hf = train_test_split(
    df_hf,
    test_size=0.2,
    random_state=42,
)
print("Hugging Face GLUE QQP loaded:", len(df_hf), "samples")


Hugging Face GLUE QQP loaded: 363846 samples


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd

def run_tfidf_baseline(train_df: pd.DataFrame, val_df: pd.DataFrame,
                       label_col: str = 'is_duplicate',
                       max_features: int = 5000) -> None:
    """Train and evaluate a one-feature TF-IDF cosine-similarity baseline.

    A TF-IDF vocabulary is fitted on every training question (both columns),
    each question pair is reduced to the cosine similarity of its two TF-IDF
    vectors, and a logistic regression is trained on that single feature.
    Accuracy and F1 on the validation frame are printed.

    Parameters
    ----------
    train_df, val_df : pd.DataFrame
        Frames with 'question1' and 'question2' text columns plus `label_col`.
    label_col : str
        Name of the column holding the target labels.
    max_features : int
        Vocabulary size cap passed to `TfidfVectorizer`.

    Returns
    -------
    None
        Results are reported via `print` only.
    """
    print("\nRunning TF-IDF baseline...")
    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    # Fit on training questions only so validation text does not influence
    # the vocabulary or the IDF weights.
    tfidf.fit(pd.concat([train_df['question1'], train_df['question2']]))

    def pairwise_cosine(df):
        """Return an (n, 1) array: cosine similarity of each row's question pair."""
        q1 = tfidf.transform(df['question1'])
        q2 = tfidf.transform(df['question2'])
        # Row-wise dot products and norms computed on the sparse matrices in
        # one shot, instead of one cosine_similarity call per row.
        dots = np.asarray(q1.multiply(q2).sum(axis=1), dtype=float).ravel()
        q1_norms = np.sqrt(np.asarray(q1.multiply(q1).sum(axis=1), dtype=float).ravel())
        q2_norms = np.sqrt(np.asarray(q2.multiply(q2).sum(axis=1), dtype=float).ravel())
        denom = q1_norms * q2_norms
        # An all-zero TF-IDF row (no in-vocabulary terms) gets similarity 0,
        # matching what cosine_similarity returns for a zero vector.
        sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)
        return sims.reshape(-1, 1)

    X_train = pairwise_cosine(train_df)
    y_train = train_df[label_col].values
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    X_val = pairwise_cosine(val_df)
    y_val = val_df[label_col].values
    preds = clf.predict(X_val)

    print(f"Accuracy: {accuracy_score(y_val, preds):.4f}")
    print(f"F1 Score: {f1_score(y_val, preds):.4f}")

# --- Evaluate the baseline on each dataset in turn ---
# Each entry: (header to print, training frame, validation frame, label column).
baseline_runs = [
    ("=== Kaggle QQP ===", train_df_kaggle, val_df_kaggle, 'is_duplicate'),
    ("\n=== Hugging Face GLUE QQP ===", train_df_hf, val_df_hf, 'label'),
]
for run_header, run_train, run_val, run_label in baseline_runs:
    print(run_header)
    run_tfidf_baseline(run_train, run_val, label_col=run_label)


=== Kaggle QQP ===

Running TF-IDF baseline...
Accuracy: 0.6466
F1 Score: 0.4468

=== Hugging Face GLUE QQP ===

Running TF-IDF baseline...
Accuracy: 0.6485
F1 Score: 0.4483
