In [None]:
%pip install llama_index
%pip install llama-index-embeddings-huggingface
%pip install chromadb
%pip install llama-index-vector-stores-chroma



In [None]:
import ast
import shutil

import chromadb
import pandas as pd
from tqdm import tqdm
from llama_index.core import VectorStoreIndex,Document,StorageContext, load_index_from_storage
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

In [None]:
# Read the cleaned corpus; the 'text' column holds each document body and
# the 'uuid' column holds its identifier.
CORPUS_FILENAME = "corpus_clean.csv"
corpus_df = pd.read_csv(CORPUS_FILENAME)
# Wrap every corpus row in a Document, reusing the corpus uuid as the document id
# so retrieved chunks can later be traced back to their source document.
documents = [
    Document(text=body, doc_id=uuid)
    for body, uuid in zip(corpus_df['text'], corpus_df['uuid'])
]

In [None]:
# Split the documents into nodes: chunks of up to 256 tokens, with 20 tokens
# of overlap between consecutive chunks.
splitter_settings = {"chunk_size": 256, "chunk_overlap": 20}
node_parser = SentenceSplitter(**splitter_settings)
nodes = node_parser.get_nodes_from_documents(documents)

In [None]:
# Vector store setup: a ChromaDB collection persisted under ./chroma_db.
chroma_path = "./chroma_db"
collection_name = "quickstart"
db = chromadb.PersistentClient(path=chroma_path)
# Opens the collection if it already exists on disk, otherwise creates it.
chroma_collection = db.get_or_create_collection(collection_name)
# Expose the Chroma collection to llama_index and make it the storage backend
# that the index will write its nodes into.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Initialize the embedding model (https://huggingface.co/BAAI/bge-small-en-v1.5)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# The Chroma collection is persisted on disk and opened with get_or_create, so
# building the index unconditionally would re-embed the whole corpus and add the
# nodes a second time whenever this notebook is re-run. Only embed when the
# collection is still empty; otherwise re-attach to the vectors already stored.
# NOTE: to force a rebuild (e.g. after changing the corpus or chunking), delete
# the ./chroma_db directory first.
if chroma_collection.count() == 0:
    # Create the vector store index from nodes (embeds every node and persists it)
    index = VectorStoreIndex(
        nodes,
        embed_model=embed_model,
        storage_context=storage_context,
        show_progress=True,
    )
else:
    # Reuse the embeddings already persisted in the collection
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/427 [00:00<?, ?it/s]

In [None]:
# Create a retriever from the index with its default settings and smoke-test it
# on a single free-text query; the cell displays the retrieved nodes with scores.
sample_query = "what are the latest tennis news?"
retriever = index.as_retriever()
retriever.retrieve(sample_query)

[NodeWithScore(node=TextNode(id_='61e6d09d-0e54-462c-bf6b-29f2ebc9731c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='c95251a0282e42439f8685f94cadac69', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='ba419c9cce372f33f85b14d4cfd13145d76c4ef566b97288f5e23984453dd3c6'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='a8a0cd51-efd5-4184-9d8f-1f9ba170bb2e', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='9b5d674da9d47d514350dfcf603e64daa117c5413b258cc23de6712af4f1af31'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='023e56fa-6691-4f6d-afc1-2bd957cb291e', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='0a84fceeebdcf4d38e3828c725516f41fa859f81d06d8c1d83f0c36d7ff8b94b')}, text='netflix. dont remember seeing boris on there. guy is ridiculous. ive made my career off the court without the help of the others.  not once have i taken slams aw

In [None]:
# Load the queries dataset used to evaluate the retriever
QUERIES_FILENAME = "queries.csv"
queries = pd.read_csv(QUERIES_FILENAME)

In [None]:
# Sanity check: size of the queries table as (rows, columns)
queries.shape

(2330, 2)

In [None]:
# Evaluate the retriever on a reproducible random sample of 500 queries.
def _matched_fraction(expected, retrieved):
    """Return the share of `expected` ids that also appear in `retrieved`.

    Yields 0 when `expected` is empty, so such queries count as a miss.
    """
    if len(expected) == 0:
        return 0
    return len(set(expected) & set(retrieved)) / len(expected)


query_sample = queries.sample(500, random_state=42).copy().reset_index(drop=True)
match_scores = []
for query_text, raw_expected in tqdm(
    zip(query_sample['query'], query_sample['result']),
    total=query_sample.shape[0],
):
    # The 'result' column stores the expected uuids as a Python literal in a string
    expected_ids = ast.literal_eval(raw_expected)
    # Map every retrieved chunk back to the id of the document it was cut from
    retrieved_ids = [hit.node.source_node.node_id for hit in retriever.retrieve(query_text)]
    match_scores.append(_matched_fraction(expected_ids, retrieved_ids))
# Per-query fraction of expected documents that were retrieved
query_sample['match'] = pd.Series(match_scores, index=query_sample.index, dtype=float)

100%|██████████| 500/500 [00:56<00:00,  8.88it/s]


In [None]:
# Recall: mean over the sampled queries of the per-query 'match' value, i.e. the
# fraction of each query's expected uuids found among the retrieved source documents
query_sample['match'].mean()

0.27283333333333337

In [None]:
# Load the Chroma-backed index from disk instead of rebuilding it from the corpus.
# Use the same embedding model as at indexing time so that query vectors are
# comparable with the stored ones.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Re-open the persisted database and the collection the nodes were written to
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")
chroma_vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# Wrap the existing vector store in an index
chroma_index = VectorStoreIndex.from_vector_store(
    vector_store=chroma_vector_store,
    embed_model=embed_model,
)

In [None]:
# Smoke-test the index reloaded from disk with the same query used on the
# freshly built index, so the two outputs can be compared by eye.
reload_check_query = "what are the latest tennis news?"
retriever2 = chroma_index.as_retriever()
retriever2.retrieve(reload_check_query)

[NodeWithScore(node=TextNode(id_='61e6d09d-0e54-462c-bf6b-29f2ebc9731c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='c95251a0282e42439f8685f94cadac69', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='ba419c9cce372f33f85b14d4cfd13145d76c4ef566b97288f5e23984453dd3c6'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='a8a0cd51-efd5-4184-9d8f-1f9ba170bb2e', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='9b5d674da9d47d514350dfcf603e64daa117c5413b258cc23de6712af4f1af31'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='023e56fa-6691-4f6d-afc1-2bd957cb291e', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='0a84fceeebdcf4d38e3828c725516f41fa859f81d06d8c1d83f0c36d7ff8b94b')}, text='netflix. dont remember seeing boris on there. guy is ridiculous. ive made my career off the court without the help of the others.  not once have i taken slams aw

In [None]:
# Archive the persisted Chroma directory so it can be downloaded and reused locally.
# Paths are relative to the working directory, matching the "./chroma_db" location
# the database was created at, rather than assuming the Colab-only /content directory;
# shutil is used so the cell does not depend on an external `zip` binary.
# Writes ./chroma_db.zip (entries are stored under "chroma_db/") and displays its path.
shutil.make_archive("chroma_db", "zip", root_dir=".", base_dir="chroma_db")


  adding: content/chroma_db/ (stored 0%)
  adding: content/chroma_db/chroma.sqlite3 (deflated 73%)
  adding: content/chroma_db/3d79670a-5af3-41c0-a442-63304861bc53/ (stored 0%)
  adding: content/chroma_db/3d79670a-5af3-41c0-a442-63304861bc53/header.bin (deflated 55%)
  adding: content/chroma_db/3d79670a-5af3-41c0-a442-63304861bc53/data_level0.bin (deflated 12%)
  adding: content/chroma_db/3d79670a-5af3-41c0-a442-63304861bc53/link_lists.bin (deflated 80%)
  adding: content/chroma_db/3d79670a-5af3-41c0-a442-63304861bc53/index_metadata.pickle (deflated 43%)
  adding: content/chroma_db/3d79670a-5af3-41c0-a442-63304861bc53/length.bin (deflated 70%)
