In [1]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models,util
from torch.utils.data import DataLoader
import pandas as pd
from random import choice
import numpy as np
from rank_bm25 import BM25Okapi
from tqdm import tqdm

In [2]:
# Build training data: one (tweet, positive abstract, random negative abstract) triplet per query.
# NOTE(review): this model is not used anywhere in this cell and `model` is reassigned by the
# later training/evaluation cells; the load is kept only so the cell's side effects are unchanged.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Fixed seed so the sampled negatives (and therefore training_data.csv) are reproducible.
NEGATIVE_SAMPLING_SEED = 42
rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)

# Load datasets
df_queries = pd.read_csv("subtask4b_query_tweets_train.tsv", sep="\t")
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df_cord = pd.read_pickle("subtask4b_collection_data.pkl")

# Build a dict of cord_uid → abstract, skipping papers whose abstract is missing or blank.
# A NaN abstract would otherwise be written to the CSV and break string handling downstream.
cord_abstract_map = {
    uid: abstract
    for uid, abstract in zip(df_cord['cord_uid'], df_cord['abstract'])
    if isinstance(abstract, str) and abstract.strip()
}

# Clean out any queries whose paper has no usable abstract
df_queries = df_queries[df_queries['cord_uid'].isin(cord_abstract_map)]

all_uids = list(cord_abstract_map.keys())
if len(all_uids) < 2:
    raise ValueError("Need at least two abstracts to sample a negative for each query.")

# Position of every uid in all_uids, so the positive can be excluded without rebuilding a list.
uid_position = {uid: position for position, uid in enumerate(all_uids)}

train_rows = []

for query, pos_uid in zip(df_queries['tweet_text'], df_queries['cord_uid']):
    pos_abstract = cord_abstract_map[pos_uid]

    # Sample a random negative (any non-matching abstract), uniformly over the other uids:
    # draw an index in [0, n-1) and step over the positive's slot.
    neg_idx = int(rng.integers(len(all_uids) - 1))
    if neg_idx >= uid_position[pos_uid]:
        neg_idx += 1
    neg_uid = all_uids[neg_idx]
    neg_abstract = cord_abstract_map[neg_uid]

    train_rows.append({
        "tweet_text": query,
        "pos_abstract": pos_abstract,
        "neg_abstract": neg_abstract
    })

df_train = pd.DataFrame(train_rows)
df_train.to_csv("training_data.csv", index=False)
print(f"Generated {len(df_train)} training rows with random negatives.")

Generated 12853 training rows with random negatives.


In [None]:

# Fine-tune the multi-qa MiniLM bi-encoder on (tweet, positive abstract, negative abstract) triplets.
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Load the training data. Empty text fields come back from CSV as NaN, which cannot be
# used as training text, so rows with any missing text are dropped.
text_columns = ["tweet_text", "pos_abstract", "neg_abstract"]
df = pd.read_csv("training_data.csv").dropna(subset=text_columns)

# Format: query | positive_passage | negative_passage
# The texts are passed through unprefixed: the evaluation cells encode raw tweet and document
# text, so adding "query: " / "passage: " here would make training inputs differ from
# inference inputs.
train_examples = [
    InputExample(texts=[query, positive, negative])
    for query, positive, negative in zip(
        df["tweet_text"], df["pos_abstract"], df["neg_abstract"]
    )
]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)

train_loss = losses.MultipleNegativesRankingLoss(model=model)

num_epochs = 3
# Warm the learning rate up over the first 10% of all optimisation steps.
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
    output_path="fine-tuned-multi-qa-MiniLM-L6-cos-v1"
)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


KeyboardInterrupt: 

In [15]:
# Dense retrieval on the dev queries: top-5 documents per tweet (scored with MRR@5 below)
import torch

PATH_COLLECTION_DATA = 'subtask4b_collection_data.pkl'
PATH_QUERY_TRAIN_DATA = 'subtask4b_query_tweets_train.tsv'
PATH_QUERY_DEV_DATA = 'subtask4b_query_tweets_dev.tsv'

model_name = "fine-tuned-all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

df_collection = pd.read_pickle(PATH_COLLECTION_DATA)
df_query_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep='\t')
df_query_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep='\t')

# Each document is represented by its title followed by its abstract; missing parts become "".
df_collection['full_text'] = df_collection['title'].fillna('') + " " + df_collection['abstract'].fillna('')

# Plain-list views of the columns used below (also read by the ensemble cell).
doc_texts = df_collection['full_text'].tolist()
doc_uids = df_collection['cord_uid'].tolist()
tweet_texts = df_query_dev['tweet_text'].tolist()
tweet_ids = df_query_dev['post_id'].tolist()
true_labels = df_query_dev['cord_uid'].tolist()

doc_embeddings = model.encode(doc_texts, show_progress_bar=True, convert_to_tensor=True)
tweet_embeddings = model.encode(tweet_texts, convert_to_tensor=True, show_progress_bar=True)

predictions = []
for query_idx in tqdm(range(len(tweet_embeddings))):
    # Cosine similarity of this tweet against every document, then keep the 5 best.
    similarities = util.cos_sim(tweet_embeddings[query_idx], doc_embeddings)[0]
    best_positions = torch.topk(similarities, k=5).indices.tolist()

    predictions.append({
        'post_id': tweet_ids[query_idx],
        'tweet_text': tweet_texts[query_idx],
        'true': true_labels[query_idx],
        'preds': [doc_uids[position] for position in best_positions]
    })

# MRR@5 Evaluation
def mrr_at_k(predictions, k=5):
    """Mean reciprocal rank of the true document within each query's top-k predictions.

    Args:
        predictions: iterable of dicts, each with a 'true' key (the gold id) and a
            'preds' key (ranked candidate ids, best first).
        k: rank cutoff; a gold id ranked below position k counts as a miss.

    Returns:
        The average over all queries of 1/rank for hits within the top k and 0 for
        misses. Returns 0.0 when there are no predictions.
    """
    predictions = list(predictions)
    if not predictions:
        # Avoid dividing by zero on an empty run.
        return 0.0

    total_mrr = 0.0
    for pred in predictions:
        # Only the first k candidates count, even if more were supplied.
        top_k = list(pred['preds'])[:k]
        if pred['true'] in top_k:
            rank = top_k.index(pred['true']) + 1
            total_mrr += 1 / rank
    return total_mrr / len(predictions)

# Score the dense-retrieval run and label it with the checkpoint that was actually loaded
# (model_name), rather than a hard-coded model name that can drift from the evaluated one.
mrr5 = mrr_at_k(predictions, k=5)
print(f"MRR@5 for {model_name}: {mrr5:.4f}")

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

100%|██████████| 1400/1400 [00:00<00:00, 2125.04it/s]

MRR@5 for multi-qa-MiniLM-L6-cos-v1: 0.6067





In [13]:
# MRR@k evaluation for the combined model: average the cosine scores of two bi-encoders.
# This cell reuses tweet_embeddings, doc_embeddings, tweet_ids, true_labels, doc_uids,
# df_collection, df_query_dev, torch and mrr_at_k from the preceding evaluation cell,
# so that cell must be run first.
# NOTE(review): the recorded execution counts show this cell was last run before the
# preceding cell; re-run both in order before trusting the printed score.
model_name2 = "fine-tuned-multi-qa-MiniLM-L6-cos-v1"
model2 = SentenceTransformer(model_name2)
doc_embeddings2 = model2.encode(df_collection['full_text'].tolist(), show_progress_bar=True, convert_to_tensor=True)
tweet_embeddings2 = model2.encode(df_query_dev['tweet_text'].tolist(), convert_to_tensor=True, show_progress_bar=True)

predictions = []
k = 5  # single cutoff shared by retrieval, the metric and the printed label

for i, (tweet_id, true_uid) in enumerate(zip(tweet_ids, true_labels)):
    # Per-model cosine similarity of this tweet against every document.
    scores1 = util.cos_sim(tweet_embeddings[i], doc_embeddings)[0]
    scores2 = util.cos_sim(tweet_embeddings2[i], doc_embeddings2)[0]

    # Unweighted mean of the two models' scores.
    avg_scores = (scores1 + scores2) / 2.0

    top_k = torch.topk(avg_scores, k=k)
    top_indices = top_k.indices.tolist()
    top_cord_uids = [doc_uids[idx] for idx in top_indices]

    predictions.append({
        'post_id': tweet_id,
        'true': true_uid,
        'preds': top_cord_uids
    })

# Pass the same k used for retrieval so the metric always matches the label below.
mrr_score = mrr_at_k(predictions, k=k)
print(f"MRR@{k} (score average): {mrr_score:.4f}")

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

MRR@5 (score average): 0.6071
