In [None]:
# Install necessary libraries (updated for FAISS)
!uv pip install scikit-learn matplotlib torch transformers sentence-transformers mteb faiss-cpu langchain langchain-community psutil

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2mmteb      [0m [32m-[2m-----------------------------[0m[0m 16.00 KiB/1.97 MiB
[2mlangchain-community[0m [32m-[2m-----------------------------[0m[0m 16.00 KiB/2.41 MiB
[2mnvidia-cuda-cupti-cu12[0m [32m-[2m-----------------------------[0m[0m 99.45 KiB/13.17 MiB
[2mnvidia-nvjitlink-cu12[0m [32m-[2m-----------------------------[0m[0m 526.72 KiB/20.09 MiB
[2mnvidia-cuda-nvrtc-cu12[0m [32m-[2m-----------------------------[0m[0m 462.06 KiB/23.50 MiB
[2mfaiss-cpu [0m [32m-[2m-----------------------------[0m[0m 456.56 KiB/29.25 MiB
[2mnvidia-curand-cu12[0m [32m-[2m-----------------------------[0m[0m 492.34 KiB/53.70 MiB
[2mnvidia-cusolver-cu12[0m [32m-[2m-----------------------------[0m[0m 576.53 KiB/122.01 MiB
[2mnvidia-cusparse-cu12[0m [32m-[2m-----------------------------[0m[0m 603.39 KiB/197.84 MiB
[2mnvidia-cufft-cu12[0m [32m-[2m-----------------------------[0m[0m 

In [None]:
# Import libraries (updated vector store import)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import nltk
import re
import torch
from sentence_transformers import SentenceTransformer
from mteb import MTEB
import pandas as pd
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS  # Changed import
from langchain.embeddings import HuggingFaceEmbeddings  # Updated embedding class
import time
import psutil
import shutil

# Download NLTK resources (unchanged)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Text cleaning function (unchanged)
def simple_clean(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
# Data loading (unchanged)
df = pd.read_excel('/content/synthetic_resume_summaries (1).xlsx')
df.head()

Unnamed: 0,text,summary
0,Medical Advisor - General Medicines (Diabetes)...,**Medical Advisor** with over 15 years of expe...
1,"linkedin.com/in/hafsah09/\n\nSUMMARY ,\n* Over...",**Digital Product Management Professional** wi...
2,"'o\n\n+\n\nSummary ,\nI have a very positive a...",**Accountant** with experience in financial ma...
3,BRIDGETTE\nWENG\n2007\n\nPROFILE\n\nExtremely ...,**Medical Assistant** with hands-on experience...
4,"Professional Summary ,\nA highly organised, mo...",**Aspiring Legal Professional** with a strong ...


In [None]:
texts = df["text"].tolist()
summaries = df["summary"].tolist()
N = len(texts)

# Data cleaning (unchanged)
texts_clean = [simple_clean(text) for text in texts]
summaries_clean = [simple_clean(str(summary)) for summary in summaries]

# Document creation (unchanged)
docs = [Document(page_content=text, metadata={"id": i}) for i, text in enumerate(texts_clean)]

In [None]:
# Embedding setup (updated to HuggingFaceEmbeddings)
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_function = HuggingFaceEmbeddings(model_name=model_name)

  embedding_function = HuggingFaceEmbeddings(model_name=model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# FAISS Ingestion with persistence (modified section)
start_time = time.time()
vectorstore = FAISS.from_documents(
    documents=docs,
    embedding=embedding_function
)
# Save FAISS index to disk
vectorstore.save_local("faiss_index")
ingestion_time = time.time() - start_time
print(f"Ingestion time: {ingestion_time} seconds")

Ingestion time: 35.73035407066345 seconds


In [None]:
# Retriever setup (unchanged interface)
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [None]:
# Performance metrics (unchanged collection logic)
latencies = []
recalls = []

for i, query in enumerate(summaries_clean):
    start_time = time.time()
    retrieved_docs = retriever.get_relevant_documents(query)
    latency = time.time() - start_time
    latencies.append(latency)

    retrieved_ids = [doc.metadata["id"] for doc in retrieved_docs]
    correct_id = i
    recalls.append(1 if correct_id in retrieved_ids else 0)

# Calculate metrics (unchanged)
average_recall = sum(recalls) / len(recalls)
print(f"Average recall@1: {average_recall}")

  retrieved_docs = retriever.get_relevant_documents(query)


Average recall@1: 0.703


In [None]:
average_latency = sum(latencies) / len(latencies)
print(f"Average latency: {average_latency} seconds")

Average latency: 0.015947448968887327 seconds


In [None]:
total_queries = len(summaries_clean)
total_time = sum(latencies)
QPS = total_queries / total_time if total_time > 0 else 0
print(f"QPS: {QPS}")

QPS: 62.705953908424455


In [None]:
# Memory usage (unchanged)
memory_usage = psutil.virtual_memory().used / (1024 ** 3)
print(f"Memory usage: {memory_usage} GB")

Memory usage: 2.261425018310547 GB


In [None]:
# Disk usage measurement updated for FAISS
disk_usage = shutil.disk_usage('./faiss_index').used / (1024 ** 3)
print(f"Disk usage for faiss_index: {disk_usage} GB")

Disk usage for faiss_index: 40.1653938293457 GB
