This is the code used in yt-history-api.py to get the results from ChromaDB

In [None]:
import ipywidgets
from tqdm.autonotebook import tqdm, trange
from sentence_transformers import SentenceTransformer
import pandas as pd

import chromadb

### Select relevant documents from ChromaDB

In [None]:
# Sentence-embedding model used to encode the search query.
# NOTE(review): presumably the same model that produced the vectors stored in the
# ChromaDB collection -- confirm against the indexing code.
# No explicit device is passed: sentence-transformers then uses a GPU when one is
# available and falls back to the CPU otherwise, instead of failing on machines without CUDA.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [3]:
# Number of distinct videos shown in the result tables; the ChromaDB query
# fetches ten times this many chunks so they can be re-ranked and de-duplicated.
number_of_results = 15

In [4]:
# Open the on-disk ChromaDB store located in the "chroma" directory
# (a relative path, so it is resolved against the current working directory).
chroma_client = chromadb.PersistentClient(path="chroma")
# NOTE(review): get_or_create_collection presumably creates an empty 'yt-history'
# collection when none exists, so a wrong path would surface as empty query
# results rather than as an error -- confirm this is intended.
collection = chroma_client.get_or_create_collection(name='yt-history')

In [None]:
# Free-text search query; it is embedded here and reused later as the first
# element of every [query, paragraph] pair given to the re-ranker.
query = "LLM with RAG"

# Encode the query, convert the vector to a list of Python floats and wrap it
# in a one-element list (one embedding per query).
query_vector = model.encode(query)
qembeddings = [query_vector.astype('float').tolist()]


In [6]:
# Over-fetch on purpose: ask for 10x more chunks than we want to show, so that
# there is enough material left after re-ranking and per-video de-duplication.
candidate_count = number_of_results * 10
results = collection.query(query_embeddings=qembeddings, n_results=candidate_count)

In [7]:
# Metadata dicts of the hits for our single query (hence the [0]).
res_dicts = results['metadatas'][0]
hits_frame = pd.DataFrame(res_dicts)
# Several chunks can belong to the same video: keep only the first row per video_id.
df = hits_frame.drop_duplicates(subset='video_id')
df.head(number_of_results)
# We can see that we have some relevant results, and some totally irrelevant ones

Unnamed: 0,datetime,paragraph_number,title,titleUrl,video_id
0,2024-05-11 14:35:54,20,Vector Search RAG Tutorial – Combine Your Data...,https://www.youtube.com/watch?v=JEBDfGqrAUA,JEBDfGqrAUA
1,2024-05-02 19:04:59,0,"Reliable, fully local RAG agents with LLaMA3",https://www.youtube.com/watch?v=-ROS6gfYIts,-ROS6gfYIts
2,2024-05-09 05:21:22,0,Learn RAG From Scratch – Python AI Tutorial fr...,https://www.youtube.com/watch?v=sVcwVQRHIc8,sVcwVQRHIc8
3,2024-05-05 21:46:49,0,GraphRAG: LLM-Derived Knowledge Graphs for RAG,https://www.youtube.com/watch?v=r09tJfON6kE,r09tJfON6kE
5,2024-04-30 20:00:34,13,Python RAG Tutorial (with Local LLMs): AI For ...,https://www.youtube.com/watch?v=2TJxpyO3ei4,2TJxpyO3ei4
6,2024-05-17 06:36:50,0,How to Improve LLMs with RAG (Overview + Pytho...,https://www.youtube.com/watch?v=Ylz779Op9Pw,Ylz779Op9Pw
12,2024-04-17 04:16:22,23,The ULTIMATE Toxic Deck! | Finally Affliction ...,https://www.youtube.com/watch?v=REMzzA4vamc,REMzzA4vamc
14,2024-05-12 19:25:58,3,Function Calling with Local Models & LangChain...,https://www.youtube.com/watch?v=Ss_GdU0KqE0,Ss_GdU0KqE0
15,2024-05-10 21:55:42,0,Convert Any Webpage Into LLM Dataset - Local a...,https://www.youtube.com/watch?v=gn42k6OuJSc,gn42k6OuJSc
17,2024-05-01 15:30:43,0,Unleash the power of Local LLM's with Ollama x...,https://www.youtube.com/watch?v=IJYC6zf86lU,IJYC6zf86lU


### Re-Ranking

pip install -U FlagEmbedding

We will apply re-ranking to sort our results from the most relevant to the least relevant.  
We will use code from here:  
https://huggingface.co/BAAI/bge-reranker-v2-m3 

In [None]:
from FlagEmbedding import FlagReranker

In [None]:
# Re-ranker model that scores [query, paragraph] pairs.
# use_fp16=True requests half-precision inference (faster, slightly less precise
# according to the model card).
# No explicit device is passed: FlagReranker then selects a GPU when one is
# available and falls back to the CPU otherwise, instead of requiring CUDA.
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)

In [None]:
# Build one [query, paragraph] pair per retrieved chunk, keeping the order in
# which ChromaDB returned the documents.
candidate_docs = results['documents'][0]
pairs = [[query, passage] for passage in candidate_docs]

In [None]:
# Score every [query, paragraph] pair with the re-ranker. normalize=False is
# passed, so the scores are used as returned (no normalization requested).
# The scores are matched position-by-position with results['ids'][0] when sorting.
scores = reranker.compute_score(pairs, normalize=False)

Compute Scores: 100%|██████████| 1/1 [00:01<00:00,  1.24s/it]


In [None]:
# Attach each chunk id to its re-ranker score, then order the (id, score)
# tuples from the highest score to the lowest.
sorted_results = list(zip(results['ids'][0], scores))
sorted_results.sort(key=lambda id_and_score: id_and_score[1], reverse=True)


In [None]:
# Keep only the ids, still ordered from most to least relevant.
sorted_ids = [chunk_id for chunk_id, _score in sorted_results]

In [None]:
# The three parallel lists ChromaDB returned for our single query.
res_ids = results['ids'][0]              # chunk ids
doc_list = results['documents'][0]       # chunk texts
res_meta_dicts = results['metadatas'][0]  # chunk metadata dicts

# One row per retrieved chunk, built as a single chain:
#   metadata columns -> without 'datetime' -> chunk text appended as the last
#   column -> indexed by chunk id -> rows put in re-ranked order.
df_meta = (
    pd.DataFrame(res_meta_dicts)
    .drop(columns='datetime')
    .assign(paragraph=doc_list)
    .set_index(pd.Index(res_ids, name='id'))
    .reindex(sorted_ids)
)


In [None]:
# Rows are already in re-ranked order, so the first row kept per video_id is
# that video's best-scoring chunk.
df_meta = (
    df_meta
    .drop_duplicates(subset='video_id')  # one row per video
    .reset_index(drop=True)              # replace the chunk-id index with 0..n-1
    .head(number_of_results)             # keep only the top `number_of_results` videos
)

In [None]:
# Display the re-ranked table (one row per video, most relevant first).
df_meta

Unnamed: 0,paragraph_number,title,titleUrl,video_id,paragraph
0,1,Python RAG Tutorial (with Local LLMs): AI For ...,https://www.youtube.com/watch?v=2TJxpyO3ei4,2TJxpyO3ei4,get up to speed with all of the basic concepts...
1,0,GraphRAG: LLM-Derived Knowledge Graphs for RAG,https://www.youtube.com/watch?v=r09tJfON6kE,r09tJfON6kE,GraphRAG: LLM-Derived Knowledge Graphs for RAG...
2,7,Unleash the power of Local LLM's with Ollama x...,https://www.youtube.com/watch?v=IJYC6zf86lU,IJYC6zf86lU,thing which is ask llama 2 running on oama wha...
3,0,Learn RAG From Scratch – Python AI Tutorial fr...,https://www.youtube.com/watch?v=sVcwVQRHIc8,sVcwVQRHIc8,Learn RAG From Scratch – Python AI Tutorial fr...
4,14,Run your own AI (but private),https://www.youtube.com/watch?v=WxYC9-hBM_g,WxYC9-hBM_g,it's accurate to make sure you got your stuff ...
5,0,How to Improve LLMs with RAG (Overview + Pytho...,https://www.youtube.com/watch?v=Ylz779Op9Pw,Ylz779Op9Pw,How to Improve LLMs with RAG (Overview + Pytho...
6,20,Vector Search RAG Tutorial – Combine Your Data...,https://www.youtube.com/watch?v=JEBDfGqrAUA,JEBDfGqrAUA,address these issues. RAG uses vector search t...
7,0,Advance RAG with Azure & LlamaIndex,https://www.youtube.com/watch?v=8Rbq6zHarZM,8Rbq6zHarZM,Advance RAG with Azure & LlamaIndex finally we...
8,6,LangGraph 101: it's better than LangChain,https://www.youtube.com/watch?v=qaWOwbFw3cs,qaWOwbFw3cs,Library can actually be use for but this is a ...
9,4,"Reliable, fully local RAG agents with LLaMA3",https://www.youtube.com/watch?v=-ROS6gfYIts,-ROS6gfYIts,constrained so it only ever traverses this pat...


In [None]:
# Export the final table as a list of dicts (one per video) restricted to the
# title, the video URL and the matching paragraph.
record_columns = ['title', 'titleUrl', 'paragraph']
df_meta[record_columns].to_dict(orient='records')