In [1]:
# Cell 1: Install Required Libraries
!pip install -q scikit-learn matplotlib torch transformers sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Cell 2: Import Libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
import torch
from sentence_transformers import SentenceTransformer
import pandas as pd
from scipy.stats import spearmanr
import math

In [3]:
# Fetch the NLTK data packages: the 'punkt' tokenizer models and the
# 'stopwords' corpus.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Cell 3: Define Utility Functions
def simple_clean(text):
    """Normalize a string before it is passed to the encoder.

    The text is lower-cased, every run of whitespace (spaces, tabs,
    newlines, ...) is replaced by a single space, and leading/trailing
    whitespace is removed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Cell 4: Load Data
# Read the spreadsheet and pull the two columns used below into plain lists:
# "text" (later used as the corpus) and "summary" (later used for queries).
df = pd.read_excel('/content/synthetic_resume_summaries (1).xlsx')
all_texts, all_summaries = (df[column].tolist() for column in ("text", "summary"))
print(f"Number of samples: {len(all_texts)}")

Number of samples: 1000


In [5]:
# Cell 5: Split Data into Train and Test
# Hold out `test_size` sample positions for evaluation; the remaining
# positions form the training split. The fixed random_state makes the split
# reproducible across runs.
num_samples = len(all_texts)
test_size = 100
if num_samples > test_size:
    train_indices, test_indices = train_test_split(range(num_samples), test_size=test_size, random_state=42)
else:
    # Too few samples to hold out a separate test set: evaluate on everything.
    # train_indices must still be defined here, otherwise the summary print
    # below raises NameError (or reports a stale value left in the kernel).
    train_indices = []
    test_indices = list(range(num_samples))
    print(f"Warning: Number of samples {num_samples} is less than or equal to test_size {test_size}, using all as test.")
print(f"Train indices: {len(train_indices)}")
print(f"Test indices: {len(test_indices)}")

Train indices: 900
Test indices: 100


In [6]:
# Cell 6: Prepare Corpus and Queries
# Retrieval runs over every text in the dataset; only the summaries of the
# held-out samples are used as queries, in the same order as test_indices.
corpus = all_texts
queries = []
for sample_idx in test_indices:
    queries.append(all_summaries[sample_idx])
print(f"Corpus size: {len(corpus)}")
print(f"Query size: {len(queries)}")

Corpus size: 1000
Query size: 100


In [9]:
# Cell 7: Load Model
# trust_remote_code is deliberately left at its default (disabled): enabling
# it lets Python code shipped in the model repository run locally. The
# download for this checkpoint consists only of config, tokenizer and weight
# files, so no custom code appears to be required -- re-enable only for a
# model that is known to need it and whose repository is trusted.
model_name = "msmarco-distilbert-cos-v5"
model = SentenceTransformer(model_name)
print(f"Model loaded: {model_name}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded: msmarco-distilbert-cos-v5


In [10]:
# Cell 8: Encode Corpus and Queries
# Both sides go through the same normalization (simple_clean) before being
# embedded, so documents and queries are treated identically.
cleaned_corpus = [simple_clean(doc) for doc in corpus]
corpus_embeddings = model.encode(cleaned_corpus, convert_to_tensor=True)

cleaned_queries = [simple_clean(q) for q in queries]
query_embeddings = model.encode(cleaned_queries, convert_to_tensor=True)

# The tensors are copied to the CPU before conversion to NumPy arrays.
corpus_embeddings_np = corpus_embeddings.cpu().numpy()
query_embeddings_np = query_embeddings.cpu().numpy()

print(f"Corpus embeddings shape: {corpus_embeddings_np.shape}")
print(f"Query embeddings shape: {query_embeddings_np.shape}")

Corpus embeddings shape: (1000, 768)
Query embeddings shape: (100, 768)


In [11]:
# Cell 9: Compute Cosine Similarities
# Result has one row per query and one column per corpus document.
similarities = cosine_similarity(X=query_embeddings_np, Y=corpus_embeddings_np)
print(f"Similarities matrix shape: {similarities.shape}")

Similarities matrix shape: (100, 1000)


In [12]:
# Cell 10: Calculate Performance Metrics
# Each query k has exactly one relevant document: the corpus entry at
# position test_indices[k]. With a single relevant document every rank-based
# metric follows from the 1-based rank r of that document:
#   reciprocal rank = 1 / r
#   average precision = 1 / r          (identical to reciprocal rank)
#   NDCG = 1 / log2(r + 1)             (ideal DCG is 1 / log2(2) = 1)
#   top-1 hit = (r == 1)
# so the rank is located once per query and reused, instead of re-scanning
# the whole ranking for each metric.
num_queries = len(queries)
mrr_total = 0.0
ap_total = 0.0
ndcg_total = 0.0
top1_correct = 0
spearman_total = 0.0
spearman_valid = 0  # number of queries whose Spearman rho is defined

# Corpus positions, used to build the one-hot relevance vector per query.
corpus_positions = np.arange(len(corpus))

for k in range(num_queries):
    relevant_idx = test_indices[k]
    scores = similarities[k, :]

    # Corpus positions ordered from most to least similar.
    ranked_indices = np.argsort(scores)[::-1]

    # MRR / MAP / NDCG / Top-1 Accuracy
    hits = np.where(ranked_indices == relevant_idx)[0]
    if len(hits) > 0:
        rank = int(hits[0]) + 1
        reciprocal_rank = 1.0 / rank
        mrr_total += reciprocal_rank
        ap_total += reciprocal_rank
        ndcg_total += 1.0 / math.log2(rank + 1)
        if rank == 1:
            top1_correct += 1
    # If the relevant document is absent from the ranking, every one of the
    # metrics above contributes 0 for this query.

    # Spearman correlation between the similarity scores and the binary
    # relevance labels (1 for the relevant document, 0 elsewhere).
    true_labels = (corpus_positions == relevant_idx).astype(int)
    rho, _ = spearmanr(scores, true_labels)
    if not math.isnan(rho):
        spearman_total += rho
        spearman_valid += 1

# Guard the averages: an empty query set would otherwise raise
# ZeroDivisionError. Metrics default to 0.0 in that case.
if num_queries > 0:
    mrr = mrr_total / num_queries
    map_score = ap_total / num_queries
    average_ndcg = ndcg_total / num_queries
    accuracy = top1_correct / num_queries
else:
    mrr = 0.0
    map_score = 0.0
    average_ndcg = 0.0
    accuracy = 0.0

# Average only over queries with a defined correlation; dividing by all
# queries would count the skipped NaN results as zeros.
average_spearman = spearman_total / spearman_valid if spearman_valid > 0 else 0.0

In [13]:
# Report the averaged retrieval metrics, each rounded to four decimals.
summary_lines = [
    "Performance Summary:",
    f"- MRR: {mrr:.4f}",
    f"- MAP: {map_score:.4f}",
    f"- NDCG: {average_ndcg:.4f}",
    f"- Top-1 Accuracy: {accuracy:.4f}",
    f"- Average Spearman: {average_spearman:.4f}",
]
print("\n".join(summary_lines))

Performance Summary:
- MRR: 0.8230
- MAP: 0.8230
- NDCG: 0.8599
- Top-1 Accuracy: 0.7600
- Average Spearman: 0.0536
