In [None]:
# Install necessary libraries (updated for Weaviate)
!pip install sentence-transformers mteb weaviate-client langchain-weaviate langchain langchain-community psutil



In [None]:
# Import libraries (updated vector store import)
import numpy as np
import matplotlib.pyplot as plt
# from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import nltk
import re
import torch
from sentence_transformers import SentenceTransformer
from mteb import MTEB
import pandas as pd
from langchain.docstore.document import Document
from langchain_community.vectorstores import Weaviate
from langchain_weaviate.vectorstores import WeaviateVectorStore  # Changed import
from langchain.embeddings import HuggingFaceEmbeddings  # Updated embedding class
import time
import psutil
import weaviate
from weaviate.classes.init import Auth
from weaviate.exceptions import WeaviateStartUpError

In [None]:
# Fetch the NLTK tokenizer and stop-word data packages
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# Text cleaning function
def simple_clean(text):
    """Lower-case ``text`` and collapse every whitespace run into one space.

    Args:
        text: Raw text. ``None`` and float NaN (e.g. a missing spreadsheet
            cell loaded by pandas) are treated as an empty string; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'\s+', ' ', text.lower()).strip()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the resume/summary spreadsheet and preview its first rows
resume_xlsx_path = '/content/synthetic_resume_summaries (1).xlsx'
df = pd.read_excel(resume_xlsx_path)
df.head()

Unnamed: 0,text,summary
0,Medical Advisor - General Medicines (Diabetes)...,**Medical Advisor** with over 15 years of expe...
1,"linkedin.com/in/hafsah09/\n\nSUMMARY ,\n* Over...",**Digital Product Management Professional** wi...
2,"'o\n\n+\n\nSummary ,\nI have a very positive a...",**Accountant** with experience in financial ma...
3,BRIDGETTE\nWENG\n2007\n\nPROFILE\n\nExtremely ...,**Medical Assistant** with hands-on experience...
4,"Professional Summary ,\nA highly organised, mo...",**Aspiring Legal Professional** with a strong ...


In [None]:
# Pull the two columns out of the frame as plain Python lists
texts, summaries = df["text"].tolist(), df["summary"].tolist()
N = len(texts)

# Normalise both corpora; summaries are coerced to str before cleaning
texts_clean = list(map(simple_clean, texts))
summaries_clean = list(map(simple_clean, map(str, summaries)))

In [None]:
# Document creation: wrap each cleaned resume in a LangChain Document whose
# metadata records its row position, which the evaluation loop compares
# against the query index.
# NOTE(review): `id` may be a reserved property name in Weaviate — confirm the
# objects are actually stored, or rename this key together with its reader.
docs = [
    Document(page_content=cleaned_text, metadata={"id": row_idx})
    for row_idx, cleaned_text in enumerate(texts_clean)
]

In [None]:

# Embedding setup: the model name is handed to LangChain's HuggingFaceEmbeddings
# wrapper; the resulting object is passed as the `embedding` argument when the
# vector store is built.
model_name = "all-MiniLM-L6-v2"
embedding_function = HuggingFaceEmbeddings(model_name=model_name)

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import os
from google.colab import userdata

# Weaviate Cloud credentials: an environment variable that is already set wins;
# otherwise the value is pulled from the Colab secret of the same name.
for _secret_name in ("WEAVIATE_URL", "WEAVIATE_API_KEY"):
    if not os.getenv(_secret_name):
        os.environ[_secret_name] = userdata.get(_secret_name)

wcd_url = os.environ.get("WEAVIATE_URL")
wcd_api_key = os.environ.get("WEAVIATE_API_KEY")

In [None]:
print(f"Cluster URL: {wcd_url}")
# Never echo the secret itself: cell outputs are saved with the notebook and
# travel with it whenever the file is shared or committed.
print(f"API Key: {'[REDACTED]' if wcd_api_key else '<missing>'}")

Cluster URL: yo3xbjtjr5hxsmph1ibmw.c0.asia-southeast1.gcp.weaviate.cloud
API Key: [REDACTED]


In [None]:
# Connect to Weaviate Cloud
# Fail early with a clear message instead of an opaque client error when a
# credential was never loaded.
if not wcd_url or not wcd_api_key:
    raise RuntimeError("WEAVIATE_URL and WEAVIATE_API_KEY must both be set before connecting.")

try:
    weaviate_client = weaviate.connect_to_weaviate_cloud(
        cluster_url=wcd_url,
        auth_credentials=Auth.api_key(wcd_api_key)
    )
except WeaviateStartUpError as e:
    # `from e` keeps the original traceback attached as the explicit cause.
    raise RuntimeError(f"Failed to connect to Weaviate Cloud: {e}. Verify your cluster URL and API key.") from e
except Exception as e:
    raise RuntimeError(f"Unexpected error connecting to Weaviate Cloud: {e}") from e

In [None]:
# Weaviate ingestion: embed every document, upload it, and time the whole step.
# perf_counter is monotonic, so the measurement is immune to wall-clock jumps.
start_time = time.perf_counter()
try:
    # from_documents embeds `docs` with `embedding_function` and writes them
    # through `weaviate_client`. all-MiniLM-L6-v2 yields 384-dimensional vectors.
    vectorstore = WeaviateVectorStore.from_documents(
        documents=docs,
        embedding=embedding_function,
        client=weaviate_client
    )
except Exception as e:
    # Release the connection before propagating, keeping the original cause.
    weaviate_client.close()
    raise RuntimeError(f"Failed to ingest documents to Weaviate: {e}") from e
ingestion_time = time.perf_counter() - start_time
print(f"Ingestion time: {ingestion_time} seconds")

# NOTE(review): per-object batch errors appear to be logged rather than raised
# by the vector store — verify. Report them here so an empty index cannot go
# unnoticed and silently produce zero recall downstream.
failed_objects = getattr(weaviate_client.batch, "failed_objects", None) or []
if failed_objects:
    first_reason = getattr(failed_objects[0], "message", failed_objects[0])
    print(
        f"WARNING: {len(failed_objects)} of {len(docs)} documents failed to ingest. "
        f"First error: {first_reason}"
    )

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Retriever over the vector store; k=1 requests a single result per query
search_kwargs = {"k": 1}
retriever = vectorstore.as_retriever(search_kwargs=search_kwargs)

In [None]:
# Performance metrics: every cleaned summary is issued as a query, and it
# counts as a hit when the document whose metadata id equals the query's row
# index is among the retrieved results.
latencies = []
recalls = []

for i, query in enumerate(summaries_clean):
    # perf_counter is a monotonic high-resolution clock, better suited to
    # short per-query timings than time.time().
    start_time = time.perf_counter()
    # `invoke` is the supported replacement for the deprecated
    # `get_relevant_documents`.
    retrieved_docs = retriever.invoke(query)
    latency = time.perf_counter() - start_time
    latencies.append(latency)

    retrieved_ids = [doc.metadata["id"] for doc in retrieved_docs]
    correct_id = i
    recalls.append(1 if correct_id in retrieved_ids else 0)

  retrieved_docs = retriever.get_relevant_documents(query)


In [None]:
# Calculate metrics: share of queries whose matching document was retrieved.
# Falls back to 0.0 when no queries were evaluated instead of dividing by zero.
average_recall = sum(recalls) / len(recalls) if recalls else 0.0
print(f"Average recall@1: {average_recall}")

Average recall@1: 0.0


In [None]:
# Mean per-query retrieval latency; 0.0 when no queries were timed, which
# avoids a ZeroDivisionError on an empty run.
average_latency = sum(latencies) / len(latencies) if latencies else 0.0
print(f"Average latency: {average_latency} seconds")

Average latency: 0.18535755681991578 seconds


In [None]:
# Throughput: number of queries divided by the summed per-query latency
total_queries = len(summaries_clean)
total_time = sum(latencies)
if total_time > 0:
    QPS = total_queries / total_time
else:
    # No measurable elapsed time, so report zero rather than divide by zero
    QPS = 0
print(f"QPS: {QPS}")

QPS: 5.394978317347754


In [None]:
# Memory usage: system-wide memory in use as reported by
# psutil.virtual_memory(), not just this process, converted from bytes
bytes_per_gib = 1024 ** 3
memory_usage = psutil.virtual_memory().used / bytes_per_gib
print(f"Memory usage: {memory_usage} GB")

Memory usage: 2.5017623901367188 GB


In [None]:
# Clean up: close the Weaviate client opened in the connection cell. Cells that
# use `weaviate_client` or `vectorstore` need a fresh connection after this.
weaviate_client.close()