In [None]:
# Install necessary libraries (updated for Pinecone)
!pip install scikit-learn matplotlib torch transformers sentence-transformers mteb pinecone-client langchain-pinecone langchain langchain-community psutil

Collecting mteb
  Downloading mteb-1.38.2-py3-none-any.whl.metadata (28 kB)
Collecting pinecone-client
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.5-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
C

In [None]:
# Import libraries (updated vector store import)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import nltk
import re
import torch
from sentence_transformers import SentenceTransformer
from langchain.embeddings import SentenceTransformerEmbeddings
from mteb import MTEB
import pandas as pd
from langchain.docstore.document import Document
from langchain_pinecone.vectorstores import PineconeVectorStore  # Changed import
from langchain.embeddings import HuggingFaceEmbeddings  # Updated embedding class
import time
import psutil
import shutil
from pinecone import Pinecone, ServerlessSpec # Added import for Pinecone

In [None]:
# Fetch the NLTK data packages used by this notebook (unchanged)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Text cleaning function (unchanged)
def simple_clean(text):
    """Lower-case ``text`` and normalise its whitespace.

    Every run of whitespace characters is replaced by a single space, and
    leading/trailing whitespace is removed from the result.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Data loading (unchanged): read the spreadsheet and preview the first rows
resume_workbook = '/content/synthetic_resume_summaries (1).xlsx'
df = pd.read_excel(resume_workbook)
df.head()

Unnamed: 0,text,summary
0,Medical Advisor - General Medicines (Diabetes)...,**Medical Advisor** with over 15 years of expe...
1,"linkedin.com/in/hafsah09/\n\nSUMMARY ,\n* Over...",**Digital Product Management Professional** wi...
2,"'o\n\n+\n\nSummary ,\nI have a very positive a...",**Accountant** with experience in financial ma...
3,BRIDGETTE\nWENG\n2007\n\nPROFILE\n\nExtremely ...,**Medical Assistant** with hands-on experience...
4,"Professional Summary ,\nA highly organised, mo...",**Aspiring Legal Professional** with a strong ...


In [None]:
# Pull the resume texts and their summaries out of the frame as plain lists.
texts = df["text"].tolist()
summaries = df["summary"].tolist()
N = len(texts)  # number of rows in the dataset

# Data cleaning.
# Both columns go through str() before cleaning: a blank spreadsheet cell is
# read by pandas as a float NaN, which would crash simple_clean's .lower()
# call. Previously only the summaries were protected this way.
texts_clean = [simple_clean(str(text)) for text in texts]
summaries_clean = [simple_clean(str(summary)) for summary in summaries]

In [None]:
# Document creation (unchanged)
# Each cleaned resume becomes one Document; its position in texts_clean is
# stored as the "id" metadata so a retrieved hit can be traced back to its row.
docs = [Document(page_content=text, metadata={"id": i}) for i, text in enumerate(texts_clean)]

# Embedding setup
# Uses HuggingFaceEmbeddings, the class this notebook already imports as the
# "updated embedding class"; SentenceTransformerEmbeddings is its legacy alias.
model_name = "all-MiniLM-L6-v2"
embedding_function = HuggingFaceEmbeddings(model_name=model_name)

In [None]:
# Pinecone Initialization (new)
import os
from google.colab import userdata

# Publish the Colab secret as an environment variable, then read the key back
# from the environment.
os.environ["PINECONE_API_KEY"] = userdata.get('PINECONE_API')
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

# Build the client; any failure is re-raised with a descriptive prefix.
try:
    pc = Pinecone(api_key=pinecone_api_key)
except Exception as init_error:
    raise Exception(f"Failed to initialize Pinecone: {init_error}")

In [None]:
# langchain-resume-index
# Metric: cosine
# Dimensions: 768
# Host: https://langchain-resume-index-j20l4sa.svc.aped-4627-b74a.pinecone.io
# Cloud:  aws
# AWS Region: us-east-1
# Type: Dense
# Capacity mode:  Serverless
# Record Count: 1,000

In [None]:
# Define index name and parameters
index_name = "langchain-resume-index"  # Change if desired
dimension = 384  # Vector size of all-MiniLM-L6-v2, the embedding model this notebook uses
metric = "cosine"  # Similarity metric the index is created with
cloud = "aws"  # Can be changed to "gcp" or others based on your Pinecone setup
region = "us-east-1"  # Adjust based on your Pinecone environment

In [None]:
# Check for existing indexes and create if needed.
# The name, metric, cloud and region come from the configuration cell. The
# vector size is measured from the embedding model itself rather than written
# as a literal, so the index cannot drift out of sync with the model in use.
index_ready_timeout_s = 300  # upper bound on the readiness poll below

try:
    # Embed a throwaway string once just to learn the model's output size.
    embedding_dim = len(embedding_function.embed_query("dimension probe"))

    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=embedding_dim,
            metric=metric,
            spec=ServerlessSpec(cloud=cloud, region=region),
        )
        # Wait for index to be ready, but give up after the timeout instead of
        # polling forever if provisioning stalls.
        deadline = time.monotonic() + index_ready_timeout_s
        while not pc.describe_index(index_name).status["ready"]:
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Index '{index_name}' was not ready after {index_ready_timeout_s} seconds"
                )
            time.sleep(1)
    else:
        # An index left over from a different embedding model would otherwise
        # only be noticed when vectors are written; report the mismatch here.
        existing_dim = pc.describe_index(index_name).dimension
        if existing_dim != embedding_dim:
            raise ValueError(
                f"Index '{index_name}' has dimension {existing_dim}, but the "
                f"embedding model produces {embedding_dim}-dimensional vectors; "
                "delete the index or choose another index_name"
            )
    index = pc.Index(index_name)
except Exception as e:
    raise Exception(f"Failed to create or connect to Pinecone index: {e}") from e

In [None]:
# Delete the index (commented out; uncomment to run)
# try:
#     pc.delete_index(index_name)
#     print(f"Index '{index_name}' deleted successfully.")
# except Exception as e:
#     print(f"Error deleting index '{index_name}': {e}")

Index 'langchain-resume-index' deleted successfully.


In [None]:
# Pinecone Ingestion (improved)
# Documents are written in batches, and every record gets a deterministic id
# (the row id stored in its metadata). Re-running this cell therefore
# overwrites the same records instead of inserting a second copy of each.
start_time = time.perf_counter()
try:
    vectorstore = PineconeVectorStore(index=index, embedding=embedding_function)
    # Ingest documents in smaller batches
    batch_size = 100  # Adjust this value based on your data and Pinecone limits
    for i in range(0, len(docs), batch_size):
        batch = docs[i : i + batch_size]
        batch_ids = [str(doc.metadata["id"]) for doc in batch]
        vectorstore.add_documents(batch, ids=batch_ids)
except Exception as e:
    raise Exception(f"Failed to ingest documents to Pinecone: {e}") from e
# Elapsed time covers vector-store construction plus all batch uploads.
ingestion_time = time.perf_counter() - start_time
print(f"Ingestion time: {ingestion_time} seconds")

Ingestion time: 10.34293818473816 seconds


In [None]:
# Retriever setup (unchanged): return only the single best match per query
search_options = {"k": 1}
retriever = vectorstore.as_retriever(search_kwargs=search_options)

In [None]:
# Performance metrics
# Each cleaned summary is used as a query; a query counts as a hit when the
# retriever returns the document whose "id" equals the summary's own position.
latencies = []  # per-query retrieval time, in seconds
recalls = []    # 1 for a hit, 0 for a miss, one entry per query

for i, query in enumerate(summaries_clean):
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start_time = time.perf_counter()
    # invoke() replaces the deprecated get_relevant_documents(), which emitted
    # a deprecation warning on every run.
    retrieved_docs = retriever.invoke(query)
    latency = time.perf_counter() - start_time
    latencies.append(latency)

    retrieved_ids = [doc.metadata["id"] for doc in retrieved_docs]
    correct_id = i  # summaries_clean[i] and the Document with id i share a row
    recalls.append(1 if correct_id in retrieved_ids else 0)

# Calculate metrics (an empty query set yields 0.0 rather than a ZeroDivisionError)
average_recall = sum(recalls) / len(recalls) if recalls else 0.0
print(f"Average recall@1: {average_recall}")

  retrieved_docs = retriever.get_relevant_documents(query)


Average recall@1: 0.7


In [None]:
# Mean retrieval latency per query. Falls back to 0.0 when no queries were
# timed, mirroring the zero guard used for the QPS figure.
average_latency = sum(latencies) / len(latencies) if latencies else 0.0
print(f"Average latency: {average_latency} seconds")

Average latency: 0.08334371948242188 seconds


In [None]:
# Throughput: number of queries divided by the summed retrieval latency
total_time = sum(latencies)
total_queries = len(summaries_clean)
if total_time > 0:
    QPS = total_queries / total_time
else:
    QPS = 0
print(f"QPS: {QPS}")

QPS: 59.99252290455499


In [None]:
# Memory usage (unchanged): the "used" figure from psutil.virtual_memory(),
# converted from bytes by dividing by 1024**3
bytes_per_gib = 1024 ** 3
used_bytes = psutil.virtual_memory().used
memory_usage = used_bytes / bytes_per_gib
print(f"Memory usage: {memory_usage} GB")

Memory usage: 2.5468482971191406 GB
