In [None]:
# Install necessary libraries (run once per Colab session)
# NOTE(review): no versions are pinned here, so a fresh session may resolve different
# releases than the ones these benchmark numbers were produced with — consider pinning.
!uv pip install scikit-learn matplotlib torch transformers sentence-transformers mteb chromadb langchain  langchain-community psutil

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2mkubernetes[0m [32m----------------------[2m--------[0m[0m 1.34 MiB/1.90 MiB
[2mmteb      [0m [32m----------------[2m--------------[0m[0m 1.01 MiB/1.97 MiB
[2mlangchain-community[0m [32m-------------[2m-----------------[0m[0m 1006.90 KiB/2.41 MiB
[2mnvidia-cuda-cupti-cu12[0m [32m------------------[2m------------[0m[0m 7.89 MiB/13.17 MiB
[2monnxruntime[0m [32m----------------[2m--------------[0m[0m 7.94 MiB/15.26 MiB
[2mchromadb  [0m [32m--------------[2m----------------[0m[0m 8.05 MiB/17.44 MiB
[2mnvidia-nvjitlink-cu12[0m [32m------------[2m------------------[0m[0m 7.98 MiB/20.09 MiB
[2mnvidia-cuda-nvrtc-cu12[0m [32m-----------[2m-------------------[0m[0m 7.95 MiB/23.50 MiB
[2mnvidia-curand-cu12[0m [32m-----[2m-------------------------[0m[0m 8.01 MiB/53.70 MiB
[2mnvidia-cusolver-cu12[0m [32m--[2m----------------------------[0m[0m 7.96 MiB/122.01 MiB
[2mnvidia-

In [None]:
# Import libraries for data processing, embeddings, vector store, and performance measurement
import os
import re
import shutil
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import psutil
import torch
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from mteb import MTEB
from sentence_transformers import SentenceTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Download NLTK resources for text processing ('punkt' and 'stopwords').
# NOTE(review): none of the cells visible here call into NLTK after these downloads —
# confirm whether they are still needed before keeping the network round-trip.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Define a utility function to clean text (lowercase, remove extra spaces)
def simple_clean(text):
    """Normalize a string: lowercase it and squeeze its whitespace.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by a single space, then leading and trailing whitespace is
    removed.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased, whitespace-normalized string.
    """
    lowered = text.lower()
    return re.sub(r'\s+', ' ', lowered).strip()

# Load the Excel file containing resume texts and summaries.
# The location lives in one named constant so it can be changed in a single place
# when the notebook is run outside this Colab session.
RESUME_XLSX_PATH = '/content/synthetic_resume_summaries (1).xlsx'
df = pd.read_excel(RESUME_XLSX_PATH)

# Fail fast with a readable message if the columns the later cells rely on are absent.
missing_columns = {"text", "summary"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Expected columns missing from {RESUME_XLSX_PATH}: {sorted(missing_columns)}"
    )

# Keep the preview as the final expression so the cell still renders the table.
df.head()

In [None]:
# Extract texts (full resumes) and summaries; both lists share the DataFrame's row order,
# so position i in one list corresponds to position i in the other.
texts = df["text"].tolist()
summaries = df["summary"].tolist()
N = len(texts)

# Clean texts and summaries to standardize them.
# Both sides go through str() first: pandas can hand back non-string cells (for example a
# float NaN for a blank cell), and simple_clean() calls .lower(), which only strings have.
# Previously only the summaries were protected this way.
texts_clean = [simple_clean(str(text)) for text in texts]
summaries_clean = [simple_clean(str(summary)) for summary in summaries]

# Create documents with metadata (ID) for ChromaDB to track which text corresponds to which summary.
# The ID is the row position, which is what the evaluation loop compares against.
docs = [Document(page_content=text, metadata={"id": i}) for i, text in enumerate(texts_clean)]

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Build the LangChain embedding wrapper around the chosen SentenceTransformer checkpoint.
# The checkpoint name is passed directly through the model_name keyword.
model_name = "multi-qa-mpnet-base-dot-v1"

# `model` and `embedding_function` are two names for the same wrapper object;
# the vector store is handed `embedding_function`.
embedding_function = model = SentenceTransformerEmbeddings(model_name=model_name)

In [None]:
# Measure ingestion time: time taken to embed all documents and add them to ChromaDB.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
# NOTE(review): re-running this cell against an existing ./chroma_db presumably adds the same
# documents again instead of replacing them — confirm, and start from a clean directory
# when re-benchmarking.
# NOTE(review): the model name suggests it is tuned for dot-product similarity; confirm the
# distance metric Chroma uses for this collection matches that.
start_time = time.perf_counter()
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_function,
    persist_directory="./chroma_db"
)
ingestion_time = time.perf_counter() - start_time
print(f"Ingestion time: {ingestion_time} seconds")

Ingestion time: 40.14806079864502 seconds


In [None]:
# Expose the vector store as a retriever that returns only the single closest document
# per query; raise "k" here to evaluate recall@K for larger K.
retrieval_settings = {"k": 1}
retriever = vectorstore.as_retriever(search_kwargs=retrieval_settings)

In [None]:
# Per-query performance metrics, index-aligned with summaries_clean
latencies = []  # retrieval time for each query, in seconds
recalls = []    # 1 if the matching document was retrieved for the query, else 0

# Use each summary as a query; the expected hit is the document whose ID equals the
# summary's position, because document IDs were assigned by row position.
for i, query in enumerate(summaries_clean):
    # Measure latency with perf_counter(): a monotonic, high-resolution clock, which
    # matters for calls that complete in milliseconds.
    start_time = time.perf_counter()
    # invoke() is the replacement for the deprecated get_relevant_documents().
    retrieved_docs = retriever.invoke(query)
    latency = time.perf_counter() - start_time
    latencies.append(latency)

    # Check recall: was the correct document (ID == i) among the retrieved ones?
    retrieved_ids = [doc.metadata["id"] for doc in retrieved_docs]
    correct_id = i
    recalls.append(1 if correct_id in retrieved_ids else 0)

# Calculate recall rate: proportion of queries where the correct document was retrieved.
# With no queries at all, report 0.0 instead of raising ZeroDivisionError.
average_recall = sum(recalls) / len(recalls) if recalls else 0.0
print(f"Average recall@1: {average_recall}")

  retrieved_docs = retriever.get_relevant_documents(query)


Average recall@1: 0.698


In [None]:
# Calculate average latency: mean time taken per query, in seconds.
# Falls back to 0 when no queries were timed, the same way the QPS calculation guards
# its division, instead of raising ZeroDivisionError on an empty list.
average_latency = sum(latencies) / len(latencies) if latencies else 0
print(f"Average latency: {average_latency} seconds")

Average latency: 0.018303057670593263 seconds


In [None]:
# Throughput estimate: number of queries divided by the summed per-query latency.
# The queries were issued one after another, so this is a sequential approximation.
total_queries = len(summaries_clean)
total_time = sum(latencies)

# Avoid dividing by zero when no time was recorded.
if total_time > 0:
    QPS = total_queries / total_time
else:
    QPS = 0
print(f"QPS: {QPS}")

QPS: 54.63567989553227


In [None]:
# Measure memory usage: resident set size (RSS) of this Python process, in GB.
# psutil.virtual_memory().used reports memory in use across the whole machine, which
# includes unrelated processes; psutil.Process() with no arguments refers to the current
# process, so its RSS matches what this metric is labelled as.
memory_usage = psutil.Process().memory_info().rss / (1024 ** 3)
print(f"Memory usage: {memory_usage} GB")

Memory usage: 2.7359390258789062 GB


In [None]:
def directory_size_bytes(path):
    """Return the combined size in bytes of all regular files under ``path``.

    The directory tree is walked recursively. Symbolic links are skipped so a
    linked file is not counted in addition to its target and a dangling link
    cannot raise.

    Args:
        path: Directory to measure.

    Returns:
        Total size in bytes as an int. A path that does not exist (or is not a
        directory) yields 0, because os.walk() ignores directory-listing errors
        by default.
    """
    total_bytes = 0
    for root, _dirs, filenames in os.walk(path):
        for filename in filenames:
            file_path = os.path.join(root, filename)
            if not os.path.islink(file_path):
                total_bytes += os.path.getsize(file_path)
    return total_bytes


# Measure disk usage: size of the ChromaDB directory in GB.
# shutil.disk_usage(path).used reports how much of the *filesystem containing* the path is
# in use, not the size of the directory itself, so the files are summed explicitly instead.
disk_usage = directory_size_bytes('./chroma_db') / (1024 ** 3)
print(f"Disk usage for chroma_db: {disk_usage} GB")

Disk usage for chroma_db: 40.26665496826172 GB
