In [None]:
# Install the packages this notebook imports, using %pip so they land in the kernel's environment.
# NOTE(review): versions are unpinned — consider pinning them for reproducible runs.
%pip install llama_index
%pip install llama-index-embeddings-huggingface
%pip install chromadb
%pip install llama-index-vector-stores-chroma

In [None]:
import pandas as pd
import chromadb
from tqdm import tqdm
from llama_index.core import VectorStoreIndex,Document,StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SentenceSplitter
import ast

In [None]:
# Read the cleaned corpus; the 'text' and 'uuid' columns are used below
CORPUS_FILENAME = "corpus_clean.csv"
corpus_df = pd.read_csv(CORPUS_FILENAME)
# Build one Document per corpus row, using the row's uuid as the document id
documents = [
    Document(text=doc_text, doc_id=doc_uuid)
    for doc_text, doc_uuid in zip(corpus_df['text'], corpus_df['uuid'])
]

In [None]:
# Split every document into nodes of at most CHUNK_SIZE tokens, with
# CHUNK_OVERLAP tokens shared between consecutive nodes
CHUNK_SIZE = 256
CHUNK_OVERLAP = 20
node_parser = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
nodes = node_parser.get_nodes_from_documents(documents)

In [None]:
# Set up the vector store: a ChromaDB collection persisted on local disk
CHROMA_PATH = "./chroma_db"
COLLECTION_NAME = "quickstart"
db = chromadb.PersistentClient(path=CHROMA_PATH)
chroma_collection = db.get_or_create_collection(COLLECTION_NAME)
# Wrap the collection for llama_index and expose it through a storage context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Initialize the embedding model (https://huggingface.co/BAAI/bge-small-en-v1.5)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# The Chroma collection is persisted on disk, so re-running this cell (or the
# whole notebook) would embed and insert every chunk a second time, leaving
# duplicate vectors that distort top-k retrieval. Only embed when the collection
# is still empty; otherwise attach to the vectors that are already stored.
# To force a rebuild (e.g. after the corpus changed), delete ./chroma_db first.
if chroma_collection.count() == 0:
    # Create the vector store index from nodes (embeds and persists them)
    index = VectorStoreIndex(
        nodes,
        embed_model=embed_model,
        storage_context=storage_context,
        show_progress=True,
    )
else:
    # Reuse the previously persisted embeddings without re-inserting anything
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
    )

In [None]:
# Build a retriever with the index defaults and smoke-test it on a sample query
retriever = index.as_retriever()
sample_query = "what are the latest tennis news?"
retriever.retrieve(sample_query)

In [None]:
# Load the queries dataset used to evaluate the retriever
QUERIES_FILENAME = "queries.csv"
queries = pd.read_csv(QUERIES_FILENAME)

In [None]:
# Display (rows, columns) of the queries dataset as a quick sanity check
queries.shape

In [None]:
# Retrieve document ids for a reproducible sample of queries and score each
# query's recall against its expected results.
# Cap the sample size so a dataset with fewer than 500 queries does not make
# DataFrame.sample raise a ValueError.
sample_size = min(500, len(queries))
query_sample = queries.sample(sample_size, random_state=42).reset_index(drop=True)

match_scores = []
for text, raw_result in tqdm(
    zip(query_sample['query'], query_sample['result']),
    total=len(query_sample),
):
    # 'result' is stored as the string form of a Python collection of expected ids;
    # de-duplicate it so repeated ids cannot deflate the recall denominator
    expected = set(ast.literal_eval(raw_result))
    # retrieve nodes and map each one back to the id of its source node
    predictions = {res.node.source_node.node_id for res in retriever.retrieve(text)}
    # recall for this query: share of expected ids that were retrieved
    # (defined as 0 when the query has no expected results)
    match_scores.append(len(expected & predictions) / len(expected) if expected else 0.0)

# Assign the column in one step instead of enlarging the frame cell by cell
query_sample['match'] = match_scores

In [None]:
# Recall: mean over the sampled queries of the per-query 'match' score,
# i.e. the fraction of each query's expected results that the retriever returned
query_sample['match'].mean()

In [None]:
# Reload the index from the Chroma store persisted on disk (no re-embedding of the corpus)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")
chroma_vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# Attach an index to the existing vectors; the same embedding model is used for queries
chroma_index = VectorStoreIndex.from_vector_store(
    vector_store=chroma_vector_store,
    embed_model=embed_model,
)

In [None]:
# Smoke-test the index reloaded from disk with a sample query
retriever2 = chroma_index.as_retriever()
reload_check_query = "what are the latest tennis news?"
retriever2.retrieve(reload_check_query)

In [None]:
#zip to save in local
!zip -r /content/chroma_db.zip /content/chroma_db
