In [6]:
BASE_PATH = "/content/drive/MyDrive/nlp_7th_sem/lab5"

CORPUS_FILE = f"{BASE_PATH}/validation_corpus.csv"
QUERIES_FILE = f"{BASE_PATH}/validation_queries.csv"
TRIPLET_DATASET_NAME = f"{BASE_PATH}/train_triplet.csv"

In [7]:
import pandas as pd
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

def load_validation_data(corpus_path, queries_path):
    print("Loading Validation Corpus and Queries...")

    df_corpus = pd.read_csv(corpus_path, sep="|")

    df_corpus['display_name'] = df_corpus['display name'].fillna('')
    df_corpus['description'] = df_corpus['description'].fillna('')

    df_corpus['combined_text'] = (
        df_corpus['display_name'] + " " + df_corpus['description']
    )

    print(f"Corpus loaded: {len(df_corpus)} documents.")

    df_queries = pd.read_csv(
        queries_path,
        sep="|",
        converters={'expected_results': pd.eval}
    )
    print(f"Queries loaded: {len(df_queries)} queries.")

    return df_corpus, df_queries


def load_training_data(dataset_path):
    print(f"Loading Training Data...")

    df_train = pd.read_csv(dataset_path)

    train_examples = []
    for _, row in df_train.iterrows():
        train_examples.append(
            InputExample(
                texts=[row['anchor'], row['positive'], row['negative']]
            )
        )

    print(f"Training examples prepared: {len(train_examples)}")

    return DataLoader(train_examples, shuffle=True, batch_size=32)


In [14]:
df_corpus, df_validation_queries = load_validation_data(CORPUS_FILE, QUERIES_FILE)

train_dataloader = load_training_data(TRIPLET_DATASET_NAME)

Loading Validation Corpus and Queries...
Corpus loaded: 5000 documents.
Queries loaded: 55 queries.
Loading Training Data...
Training examples prepared: 10000


In [9]:
from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)

pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode='mean'
)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
from sentence_transformers import losses

import os
os.environ['WANDB_DISABLED'] = 'true'

num_epochs = 3
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
train_loss = losses.TripletLoss(model=model, distance_metric=losses.TripletDistanceMetric.COSINE, triplet_margin=0.8)

print(f"Starting training for {num_epochs} epochs...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=f'{BASE_PATH}/sentence_transformer_model',
    show_progress_bar=True,

)
print("Training complete. Model saved.")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting training for 3 epochs...


Step,Training Loss
500,0.161


Training complete. Model saved.


In [20]:
from sklearn.metrics.pairwise import cosine_similarity


print("\nGenerating corpus embeddings...")
corpus_embeddings = model.encode(df_corpus["combined_text"].tolist(), show_progress_bar=True, convert_to_tensor=True)

print("Generating query embeddings...")
query_embeddings = model.encode(df_validation_queries['queries'].tolist(), show_progress_bar=True, convert_to_tensor=True)

print("Calculating cosine similarities...")
cos_scores = cosine_similarity(query_embeddings.cpu().numpy(), corpus_embeddings.cpu().numpy())


Generating corpus embeddings...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Generating query embeddings...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating cosine similarities...


In [21]:
import numpy as np
from tqdm.autonotebook import tqdm

TOP_K = 10
all_results = []
for i in tqdm(range(len(df_validation_queries['queries'])), desc="Retrieving top results"):
    top_indices = np.argsort(cos_scores[i])[-TOP_K:][::-1]
    all_results.append(top_indices.tolist())

df_validation_queries['retrieved_results'] = all_results

Retrieving top results:   0%|          | 0/55 [00:00<?, ?it/s]

In [33]:
def calculate_metrics(df, k=5):
    precision_list, recall_list, f_score_list, avg_precision_list = [], [], [], []

    for _, row in df.iterrows():
        expected = set(row['expected_results'])
        retrieved = row['retrieved_results'][:k]
        print(f"Expected: {expected}, Retrieved: {retrieved}")

        relevant_retrieved = len(expected.intersection(retrieved))

        # Precision@k: (Relevant Retrieved) / k
        P_k = relevant_retrieved / k

        # Recall@k: (Relevant Retrieved) / (Total Relevant)
        R_k = relevant_retrieved / len(expected) if len(expected) > 0 else 0

        # F-score@:
        F_k = (2 * P_k * R_k) / (P_k + R_k) if (P_k + R_k) > 0 else 0

        precision_list.append(P_k)
        recall_list.append(R_k)
        f_score_list.append(F_k)

        # Average Precision (AP) for MAP calculation (using all retrieved results)
        ap = 0
        hits = 0
        for rank, doc_index in enumerate(row['retrieved_results']):
            if doc_index in expected:
                hits += 1
                ap += hits / (rank + 1)

        AP = ap / len(expected) if len(expected) > 0 else 0
        avg_precision_list.append(AP)


    return {
        f'Precision@{k}': np.mean(precision_list),
        f'Recall@{k}': np.mean(recall_list),
        f'F-score@{k}': np.mean(f_score_list),
        'MAP': np.mean(avg_precision_list)
    }

metrics = calculate_metrics(df_validation_queries, k=10)
print("\n--- Training Results (Post-Training) ---")
print(metrics)

Expected: {4544, 546, 1666, 4581, 357, 76, 3628, 1178, 1051, 2012}, Retrieved: [937, 1281, 2674, 1543, 371, 4400, 4991, 10, 4499, 3858]
Expected: {1865, 937, 2807, 4204, 3152, 4629, 3446, 4791, 697, 3383}, Retrieved: [3849, 2372, 854, 2698, 2325, 3230, 2719, 3647, 3167, 4870]
Expected: {1088, 225, 1732, 1412, 8, 3368, 1866, 4751, 3795, 29}, Retrieved: [254, 3614, 2918, 1951, 4384, 4499, 2358, 3746, 2890, 2440]
Expected: {3365, 3525, 2794, 492, 2735, 1745, 2645, 3286, 2937, 2650}, Retrieved: [4977, 1281, 2459, 48, 2494, 937, 3746, 1415, 1388, 285]
Expected: {997, 840, 778, 4461, 4813, 240, 1622, 1912, 3388, 989}, Retrieved: [1953, 4583, 4187, 1690, 1360, 3605, 2711, 1460, 1869, 163]
Expected: {1732, 1513, 2794, 2094, 2288, 3473, 4372, 3286, 471, 3038}, Retrieved: [3038, 1732, 4926, 1654, 636, 3486, 2794, 874, 492, 2943]
Expected: {2371, 4483, 4325, 1732, 2856, 2993, 4562, 2358, 1913, 637}, Retrieved: [4325, 721, 4185, 353, 1768, 236, 4595, 2705, 1553, 4991]
Expected: {4961, 290, 834, 14

In [None]:
import umap
import matplotlib.pyplot as plt
import random

# Choose a random query to visualize