In [11]:
# Cell 1: Install Required Libraries
!pip install -q scikit-learn matplotlib torch transformers sentence-transformers langchain-openai tiktoken

In [12]:
# Cell 2: Import Libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
import pandas as pd
from scipy.stats import spearmanr
import math
from langchain_openai import OpenAIEmbeddings
import tiktoken

In [13]:
import os
from google.colab import userdata
# Read the Colab secret stored under the name 'OPENAI_API_KEY3' and export it
# as the OPENAI_API_KEY environment variable instead of hardcoding the key.
# NOTE(review): presumably the OpenAI embeddings client created later reads
# the key from this environment variable — confirm.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY3')

In [14]:
# Download NLTK resources (tokenizer models and stopword lists)
# NOTE(review): none of the visible cells appear to use punkt or stopwords —
# confirm whether these downloads are still needed.
nltk.download('punkt')
nltk.download('stopwords')

# Cell 3: Define Utility Functions
def simple_clean(text):
    """Normalise a string for embedding.

    Lower-cases ``text``, collapses every run of whitespace into a single
    space, and removes leading/trailing whitespace.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Cell 4: Load Data
# Read the spreadsheet and pull out the two columns used downstream as
# plain Python lists; row i of `all_texts` pairs with row i of `all_summaries`.
df = pd.read_excel('/content/synthetic_resume_summaries (1).xlsx')
all_texts, all_summaries = df["text"].tolist(), df["summary"].tolist()
print(f"Number of samples: {len(all_texts)}")

Number of samples: 1000


In [16]:
# Cell 5: Split Data into Train and Test
# Hold out `test_size` row indices (fixed seed, so the split is reproducible);
# the held-out rows supply the evaluation queries later on.
num_samples = len(all_texts)
test_size = 100
if num_samples > test_size:
    train_indices, test_indices = train_test_split(range(num_samples), test_size=test_size, random_state=42)
else:
    # Not enough rows to hold anything out: evaluate on every row and make the
    # training split explicitly empty. Without this assignment the summary
    # print below would raise NameError on a fresh kernel (or silently report
    # a stale `train_indices` left over from an earlier run).
    train_indices = []
    test_indices = list(range(num_samples))
    print(f"Warning: Number of samples {num_samples} is less than or equal to test_size {test_size}, using all as test.")
print(f"Train indices: {len(train_indices)}")
print(f"Test indices: {len(test_indices)}")

Train indices: 900
Test indices: 100


In [17]:
# Cell 6: Prepare Corpus and Queries
# The corpus is every text in the dataset; each query is the summary of one
# held-out row, so queries[k] was written for corpus[test_indices[k]].
corpus = all_texts
queries = []
for i in test_indices:
    queries.append(all_summaries[i])
print(f"Corpus size: {len(corpus)}")
print(f"Query size: {len(queries)}")

Corpus size: 1000
Query size: 100


In [18]:
# Cell 8: Encode Corpus and Queries
# Initialize the tiktoken encoding; it is used only to count tokens so that
# embedding requests can be sized against a token budget.
# NOTE(review): cl100k_base is presumably the tokenizer that matches the
# embedding model configured in this notebook — confirm.
encoding = tiktoken.get_encoding("cl100k_base")

In [19]:
# Cell 7: Load Model
# Build the embeddings client for the chosen model, requesting
# 3072-dimensional vectors.
model_name = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(
    model=model_name,
    dimensions=3072,
)
print(f"Model loaded: {model_name}")

Model loaded: text-embedding-3-large


In [20]:
# Process corpus: normalise every document, then record how many tokens each
# cleaned document occupies (needed to size the embedding batches).
cleaned_corpus = list(map(simple_clean, corpus))
token_counts = [len(token_ids) for token_ids in map(encoding.encode, cleaned_corpus)]

In [21]:
# Batch corpus so that each request carries at most `max_tokens` tokens in total
max_tokens = 300000


def batch_by_token_budget(texts, counts, budget):
    """Greedily pack texts, in order, into batches bounded by a token budget.

    Texts are appended to the current batch until adding the next one would
    push the batch's token total above ``budget``; at that point the current
    batch is closed and the text starts a new one. A single text whose own
    count exceeds ``budget`` cannot be split here, so it ends up alone in its
    own batch.

    Args:
        texts: Sequence of strings to pack.
        counts: Token count for each entry of ``texts`` (same order and length).
        budget: Maximum total token count allowed per batch (inclusive).

    Returns:
        A list of batches (each a list of strings). Concatenating the batches
        yields ``texts`` in the original order. Empty input gives ``[]``.
    """
    packed = []
    pending = []
    pending_total = 0
    for text, n_tokens in zip(texts, counts):
        # Close the open batch only if it already holds something; an
        # oversized text arriving at an empty batch is simply placed in it
        # and flushed when the next text arrives.
        if pending and pending_total + n_tokens > budget:
            packed.append(pending)
            pending, pending_total = [], 0
        pending.append(text)
        pending_total += n_tokens
    if pending:
        packed.append(pending)
    return packed


batches = batch_by_token_budget(cleaned_corpus, token_counts, max_tokens)

In [22]:
# Embed the corpus one batch at a time and accumulate the vectors in a flat list
corpus_embeddings = []
for i, batch in enumerate(batches):
    # Tokens are re-counted here only to report the size of the request.
    batch_tokens = 0
    for doc in batch:
        batch_tokens += len(encoding.encode(doc))
    print(f"Embedding batch {i+1}/{len(batches)} with {len(batch)} documents, {batch_tokens} tokens")
    batch_embeddings = embeddings.embed_documents(batch)
    corpus_embeddings += batch_embeddings

# Queries get the same treatment as the corpus: clean, count tokens per query,
# and total them so the next step can decide whether batching is required.
cleaned_queries = list(map(simple_clean, queries))
query_token_counts = [len(token_ids) for token_ids in map(encoding.encode, cleaned_queries)]
total_query_tokens = sum(query_token_counts)

Embedding batch 1/4 with 334 documents, 298917 tokens
Embedding batch 2/4 with 330 documents, 298809 tokens
Embedding batch 3/4 with 312 documents, 299198 tokens
Embedding batch 4/4 with 24 documents, 19919 tokens


In [23]:
# Embed the queries: a single request when they fit within the token budget,
# otherwise greedy token-bounded batches built the same way as for the corpus.
if total_query_tokens <= max_tokens:
    print(f"Embedding {len(cleaned_queries)} queries, {total_query_tokens} tokens")
    query_embeddings = embeddings.embed_documents(cleaned_queries)
else:
    query_batches = []
    current_batch, current_total = [], 0
    for q, count in zip(cleaned_queries, query_token_counts):
        if current_total + count <= max_tokens:
            # Still within budget: keep filling the open batch.
            current_batch.append(q)
            current_total += count
        elif current_batch:
            # Budget would be exceeded: close the open batch, start a new one.
            query_batches.append(current_batch)
            current_batch, current_total = [q], count
        else:
            # A lone query larger than the budget goes out on its own.
            query_batches.append([q])
            current_total = 0
    if current_batch:
        query_batches.append(current_batch)

    query_embeddings = []
    for i, batch in enumerate(query_batches):
        batch_tokens = sum(len(encoding.encode(q)) for q in batch)
        print(f"Embedding query batch {i+1}/{len(query_batches)} with {len(batch)} queries, {batch_tokens} tokens")
        batch_embeddings = embeddings.embed_documents(batch)
        query_embeddings.extend(batch_embeddings)
Embedding 100 queries, 15289 tokens


In [24]:
# Convert the lists of embedding vectors into 2-D NumPy arrays
# (one row per document / query) and report their shapes.
corpus_embeddings_np, query_embeddings_np = map(np.array, (corpus_embeddings, query_embeddings))

print(f"Corpus embeddings shape: {corpus_embeddings_np.shape}")
print(f"Query embeddings shape: {query_embeddings_np.shape}")

Corpus embeddings shape: (1000, 3072)
Query embeddings shape: (100, 3072)


In [25]:
# Cell 9: Compute Cosine Similarities
# Pairwise cosine similarity between every query and every corpus document:
# row k holds query k's similarity to each document, so the matrix has shape
# (number of queries, number of corpus documents).
similarities = cosine_similarity(query_embeddings_np, corpus_embeddings_np)
print(f"Similarities matrix shape: {similarities.shape}")

Similarities matrix shape: (100, 1000)


In [26]:
# Cell 10: Calculate Performance Metrics
# Each query k has exactly one relevant document: corpus[test_indices[k]].
# With a single relevant item every rank-based metric follows directly from
# that item's 1-based rank r in the similarity ordering:
#   reciprocal rank = 1/r, average precision = 1/r,
#   NDCG = (1/log2(r+1)) / IDCG with IDCG = 1/log2(2) = 1, top-1 hit = (r == 1).
# So the rank is located once per query instead of rescanning the whole
# ranking separately for AP, NDCG and top-1; the accumulated totals are
# numerically identical to the scan-based computation.
num_queries = len(queries)
mrr_total = 0.0
ap_total = 0.0
ndcg_total = 0.0
top1_correct = 0
spearman_total = 0.0

for k in range(num_queries):
    target = test_indices[k]
    scores = similarities[k, :]

    # Document indices ordered from most to least similar to query k.
    ranked_indices = np.argsort(scores)[::-1]

    # Position of the relevant document in that ordering (empty if absent,
    # in which case the query contributes 0 to every rank-based total).
    hit_positions = np.where(ranked_indices == target)[0]
    if len(hit_positions) > 0:
        rank = int(hit_positions[0]) + 1
        mrr_total += 1.0 / rank                 # MRR
        ap_total += 1.0 / rank                  # MAP (one relevant doc)
        ndcg_total += 1.0 / math.log2(rank + 1)  # NDCG (IDCG == 1)
        if rank == 1:                           # Top-1 Accuracy
            top1_correct += 1

    # Spearman correlation between the raw scores and a one-hot relevance vector.
    # NOTE(review): with a single positive among N documents, rho is bounded
    # by roughly sqrt(3/N) even for a perfect ranking, so this value is not on
    # the same 0..1 scale as the other metrics — consider dropping it or
    # documenting the ceiling next to the result.
    true_labels = [1 if j == target else 0 for j in range(len(corpus))]
    rho, _ = spearmanr(scores, true_labels)
    if not math.isnan(rho):
        spearman_total += rho

In [27]:
# Average each accumulated total over the number of queries, then report.
mrr, map_score, average_ndcg, accuracy, average_spearman = (
    total / num_queries
    for total in (mrr_total, ap_total, ndcg_total, top1_correct, spearman_total)
)

print("Performance Summary:")
print(f"- MRR: {mrr:.4f}")
print(f"- MAP: {map_score:.4f}")
print(f"- NDCG: {average_ndcg:.4f}")
print(f"- Top-1 Accuracy: {accuracy:.4f}")
print(f"- Average Spearman: {average_spearman:.4f}")

Performance Summary:
- MRR: 0.9574
- MAP: 0.9574
- NDCG: 0.9669
- Top-1 Accuracy: 0.9400
- Average Spearman: 0.0546
