In [9]:
import pandas as pd
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

# Root folder for every file this notebook reads or writes
# (looks like a Google Drive mount inside Colab — adjust when running elsewhere).
BASE_PATH = "/content/drive/MyDrive/nlp_7th_sem/lab5"

# Input files: retrieval corpus, validation queries, and training triplets.
CORPUS_FILE = BASE_PATH + "/validation_corpus.csv"
QUERIES_FILE = BASE_PATH + "/validation_queries.csv"
TRIPLET_DATASET_NAME = BASE_PATH + "/train_triplet.csv"

In [10]:
def load_validation_data(corpus_path, queries_path):
    """Load the validation corpus and the validation queries.

    The corpus file must provide the columns 'display name' and 'description'.
    It is read as pipe-delimited first; if that read fails, or succeeds but does
    not expose the required columns (the usual result of parsing a
    comma-delimited file with sep="|"), the default comma delimiter is tried.
    Each document's text is "<display name> <description>", with missing values
    treated as empty strings.

    Args:
        corpus_path: Path to the corpus CSV file.
        queries_path: Path to the pipe-delimited queries CSV file. Its
            'expected_results' column, when present, is parsed from its string
            form by ``pd.eval``.

    Returns:
        tuple: ``(corpus_documents, df_queries)`` where ``corpus_documents`` is
        a list of combined document strings and ``df_queries`` is the queries
        DataFrame.

    Raises:
        KeyError: If the corpus lacks a required column under both delimiters.
    """
    print("Loading Validation Corpus and Queries...")

    required_columns = ['display name', 'description']

    def _has_required(frame):
        return all(col in frame.columns for col in required_columns)

    try:
        df_corpus = pd.read_csv(corpus_path, sep="|")
    except Exception:
        # Pipe-delimited parsing failed outright; retry with the default delimiter.
        df_corpus = pd.read_csv(corpus_path)
    else:
        if not _has_required(df_corpus):
            # A comma-delimited file parsed with sep="|" usually loads without
            # error as one merged column, so the exception path never runs.
            try:
                df_comma = pd.read_csv(corpus_path)
            except Exception:
                df_comma = None
            if df_comma is not None and _has_required(df_comma):
                df_corpus = df_comma

    missing = [col for col in required_columns if col not in df_corpus.columns]
    if missing:
        raise KeyError(
            f"Corpus file '{corpus_path}' is missing required column(s): {missing}"
        )

    # Missing values become empty strings so concatenation never yields NaN.
    display_names = df_corpus['display name'].fillna('').astype(str)
    descriptions = df_corpus['description'].fillna('').astype(str)
    corpus_documents = (display_names + " " + descriptions).tolist()

    print(f"Corpus loaded: {len(corpus_documents)} documents (Combined 'display name' and 'description').")

    # NOTE(review): pd.eval evaluates an expression taken from the file. If this
    # file can come from an untrusted source, switch to ast.literal_eval, which
    # only accepts plain Python literals.
    df_queries = pd.read_csv(
        queries_path,
        sep="|",
        converters={'expected_results': pd.eval}
    )
    print(f"Queries loaded: {len(df_queries)} queries.")

    return corpus_documents, df_queries


def load_and_prepare_triplet_data(dataset_path, batch_size=32):
    """Read a triplet CSV and wrap it in a shuffled DataLoader of InputExamples.

    The CSV must contain the columns 'anchor', 'positive' and 'negative'. Rows
    with a missing value in any of them are skipped, and remaining values are
    converted to ``str`` before being placed in ``InputExample.texts``.

    Failures are reported by printing a message and returning ``(None, 0)``;
    no exception is raised for an unreadable file, a missing column, or a file
    with no usable rows.

    Args:
        dataset_path: Path to the comma-delimited triplet CSV file.
        batch_size: Batch size of the returned DataLoader (default 32).

    Returns:
        tuple: ``(train_dataloader, num_examples)`` on success, otherwise
        ``(None, 0)``.
    """
    print(f"\nLoading and preparing Triplet Training Data from: {dataset_path}...")

    try:
        df_train = pd.read_csv(dataset_path)
    except FileNotFoundError:
        print(f"Error: Training file '{dataset_path}' not found. Cannot proceed with training.")
        return None, 0
    except Exception as e:
        print(f"Error reading training file '{dataset_path}': {e}")
        return None, 0

    # Validate the schema once, up front, instead of per row inside the loop.
    required_columns = ['anchor', 'positive', 'negative']
    missing = [col for col in required_columns if col not in df_train.columns]
    if missing:
        missing_repr = ", ".join(repr(col) for col in missing)
        print(f"Error: Missing required column {missing_repr}. Please ensure columns are 'anchor', 'positive', 'negative'.")
        return None, 0

    # Empty CSV cells are read as NaN (a float); drop such rows so every text
    # handed to InputExample is a real string.
    triplets = df_train[required_columns].dropna()
    skipped = len(df_train) - len(triplets)
    if skipped:
        print(f"Warning: Skipped {skipped} rows with missing anchor/positive/negative text.")

    if triplets.empty:
        print(f"Error: Training file '{dataset_path}' contains no usable triplets.")
        return None, 0

    train_examples = [
        InputExample(texts=[str(anchor), str(positive), str(negative)])
        for anchor, positive, negative in triplets.itertuples(index=False, name=None)
    ]

    print(f"Training examples prepared: {len(train_examples)} triplets.")

    # Create a PyTorch DataLoader
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    print(f"DataLoader created with batch size: {batch_size}")

    return train_dataloader, len(train_examples)


In [12]:
# Load the evaluation data (corpus texts + queries) and the training triplets.
corpus_documents, df_validation_queries = load_validation_data(CORPUS_FILE, QUERIES_FILE)

train_dataloader, num_train_examples = load_and_prepare_triplet_data(TRIPLET_DATASET_NAME)

# load_and_prepare_triplet_data signals failure by returning (None, 0) after
# printing the reason. Stop here so the problem is not rediscovered later as an
# opaque TypeError when the training cell calls len(train_dataloader).
if train_dataloader is None:
    raise RuntimeError(
        f"Could not prepare training data from '{TRIPLET_DATASET_NAME}'; see the message printed above."
    )


Loading Validation Corpus and Queries...
Corpus loaded: 5000 documents (Combined 'display name' and 'description').
Queries loaded: 55 queries.

Loading and preparing Triplet Training Data from: /content/drive/MyDrive/nlp_7th_sem/lab5/train_triplet.csv...
Training examples prepared: 10000 triplets.
DataLoader created with batch size: 32


In [None]:
from sentence_transformers import SentenceTransformer, models

# Token-level encoder: pretrained 'bert-base-uncased' with max_seq_length=256.
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)

# Mean pooling turns the per-token vectors into a single sentence vector;
# it has to be sized to the encoder's word-embedding dimension.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim, pooling_mode='mean')

# Final sentence encoder: transformer followed by pooling.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
from sentence_transformers import losses

num_epochs = 4

# len(train_dataloader) is the number of batches per epoch; warm up over the
# first 10% of all training steps.
steps_per_epoch = len(train_dataloader)
warmup_steps = int(steps_per_epoch * num_epochs * 0.1)

# NOTE(review): TripletLoss is created with its default distance settings while
# retrieval later ranks documents by cosine similarity — confirm this pairing
# is intended.
train_loss = losses.TripletLoss(model=model)

# The fine-tuned model is written next to the input data.
model_output_path = f'{BASE_PATH}/sentence_transformer_model'

print(f"Starting training for {num_epochs} epochs...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_output_path,
    show_progress_bar=True,
)
print("Training complete. Model saved.")

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.autonotebook import tqdm

# Embed every corpus document once; the same embeddings serve all queries.
print("\nGenerating corpus embeddings...")
corpus_embeddings = model.encode(corpus_documents, show_progress_bar=True, convert_to_tensor=True)

# Embed the validation queries with the same model.
query_texts = df_validation_queries['queries'].tolist()
print("Generating query embeddings...")
query_embeddings = model.encode(query_texts, show_progress_bar=True, convert_to_tensor=True)

# Score every query against every document. scikit-learn works on NumPy
# arrays, so the tensors are moved to the CPU first.
# cos_scores[i][j] is the similarity between query i and corpus document j.
print("Calculating cosine similarities...")
query_matrix = query_embeddings.cpu().numpy()
corpus_matrix = corpus_embeddings.cpu().numpy()
cos_scores = cosine_similarity(query_matrix, corpus_matrix)

# For each query keep the positions of the TOP_K highest-scoring documents,
# best first: argsort is ascending, so take the tail and reverse it.
TOP_K = 10
all_results = [
    np.argsort(cos_scores[query_idx])[-TOP_K:][::-1].tolist()
    for query_idx in tqdm(range(len(query_texts)), desc="Retrieving top results")
]

# Store the ranked corpus positions alongside each query.
df_validation_queries['retrieved_results'] = all_results

In [None]:
# Helper function for metrics
def calculate_metrics(df, k=5):
    """Compute mean Precision@k, Recall@k, F-score@k and MAP over all queries.

    Args:
        df: DataFrame with one row per query and the columns
            'expected_results' (iterable of relevant document ids) and
            'retrieved_results' (ranked sequence of retrieved document ids).
        k: Cut-off rank for precision, recall and F-score. Must be positive.

    Returns:
        dict: Keys ``'Precision@{k}'``, ``'Recall@{k}'``, ``'F-score@{k}'`` and
        ``'MAP'``, each the mean over all rows of ``df``. Queries with no
        expected results contribute 0 to recall, F-score and MAP.

    Raises:
        ValueError: If ``k`` is not positive. Previously ``k=0`` surfaced as a
            ZeroDivisionError and a negative ``k`` silently produced negative
            precision values.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    precision_list, recall_list, f_score_list, avg_precision_list = [], [], [], []

    for expected_raw, retrieved in zip(df['expected_results'], df['retrieved_results']):
        expected = set(expected_raw)
        top_k = retrieved[:k]

        relevant_retrieved = len(expected.intersection(top_k))

        # Precision@k: (Relevant Retrieved) / k
        precision = relevant_retrieved / k

        # Recall@k: (Relevant Retrieved) / (Total Relevant)
        recall = relevant_retrieved / len(expected) if expected else 0

        # Harmonic mean of precision and recall; 0 when both are 0.
        denom = precision + recall
        f_score = (2 * precision * recall) / denom if denom > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f_score_list.append(f_score)

        # Average Precision for MAP: uses the full retrieved list (not only the
        # top k), summing precision at each rank where a relevant doc appears.
        precision_sum = 0
        hits = 0
        for position, doc_index in enumerate(retrieved, start=1):
            if doc_index in expected:
                hits += 1
                precision_sum += hits / position

        avg_precision_list.append(precision_sum / len(expected) if expected else 0)

    return {
        f'Precision@{k}': np.mean(precision_list),
        f'Recall@{k}': np.mean(recall_list),
        f'F-score@{k}': np.mean(f_score_list),
        'MAP': np.mean(avg_precision_list)
    }

# Score the top-5 retrieved documents of every validation query with the
# fine-tuned model. The figures are computed on df_validation_queries, so they
# are reported as validation results.
metrics = calculate_metrics(df_validation_queries, k=5)
print("\n--- Validation Results (Post-Training) ---")
print(metrics)

In [None]:
import umap
import matplotlib.pyplot as plt
import random

# Pick one validation query at random and collect its retrieved / expected documents.
query_index_to_plot = random.randint(0, len(query_texts) - 1)
query_text_to_plot = query_texts[query_index_to_plot]
top_10_indices = df_validation_queries.loc[query_index_to_plot, 'retrieved_results'][:10]
expected_indices = set(df_validation_queries.loc[query_index_to_plot, 'expected_results'])

# UMAP works on NumPy arrays: stack the corpus embeddings with the chosen query
# embedding as the final row so both are projected into the same 2-D space.
all_embeddings_np = corpus_embeddings.cpu().numpy()
query_embeddings_np = query_embeddings.cpu().numpy()
query_embedding_np = query_embeddings_np[query_index_to_plot].reshape(1, -1)
combined_embeddings = np.vstack([all_embeddings_np, query_embedding_np])

# Reduce to two dimensions (fixed random_state for a repeatable layout).
print(f"\nApplying UMAP for visualization on query: '{query_text_to_plot}'")
reducer = umap.UMAP(n_components=2, random_state=42)
projected_embeddings = reducer.fit_transform(combined_embeddings)

# The last projected row is the query; everything before it is the corpus.
corpus_projected = projected_embeddings[:-1]
query_projected = projected_embeddings[-1]

# Expected ids that fall outside the corpus range are ignored when plotting.
valid_corpus_positions = set(range(len(corpus_projected)))
expected_in_corpus = list(expected_indices.intersection(valid_corpus_positions))

top_10_projected = corpus_projected[top_10_indices]
expected_projected = corpus_projected[expected_in_corpus]

# Draw the layers back to front: corpus, retrieved, query, expected.
fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(corpus_projected[:, 0], corpus_projected[:, 1], c='gray', alpha=0.5, label='All Documents')
ax.scatter(top_10_projected[:, 0], top_10_projected[:, 1], c='blue', s=100, label='Top 10 Retrieved Results')
ax.scatter(query_projected[0], query_projected[1], c='red', marker='X', s=200, label='Search Query (X)')
ax.scatter(expected_projected[:, 0], expected_projected[:, 1], c='green', marker='o', s=150, alpha=0.8, label='Expected Results')

ax.set_title(f'UMAP Projection of Embeddings for Query: "{query_text_to_plot}"')
ax.set_xlabel('First Component')
ax.set_ylabel('Second Component')
ax.legend()
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()