! pip install -U sentence-transformers  
! pip install ipywidgets  
! pip install trange  
! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121  
! pip install chromadb  

In [None]:
import ipywidgets
from tqdm.autonotebook import tqdm, trange
from sentence_transformers import SentenceTransformer
import pandas as pd
import torch


### Pre-processing

In [2]:
# Load the YouTube watch history (with transcripts) from its parquet file into `df`
df = pd.read_parquet(path='output_with_transcripts.parquet')

# Split text into chunks of at most `max_words` words (default 256, chosen for all-MiniLM-L6-v2).
# NOTE(review): the model card expresses the input limit in word pieces (sub-word tokens),
# not words, so a 256-word chunk may be longer than the model accepts — confirm whether
# the tail of long chunks is being truncated at encode time.
def split_text(text: str, max_words: int = 256) -> list:
    """Split *text* into consecutive chunks of at most *max_words* words.

    Words are the whitespace-separated tokens of ``text.split()``; each chunk
    is re-joined with single spaces, so the original spacing is not preserved.

    Args:
        text: The string to split.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in order. Empty if *text* contains no words.

    Raises:
        ValueError: If *max_words* is smaller than 1.
    """
    # A negative step would make range() yield nothing and silently drop all the text,
    # and a zero step makes range() fail with an unrelated message; reject both up front.
    if max_words < 1:
        raise ValueError(f"max_words must be a positive integer, got {max_words!r}")
    words = text.split()
    return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

# Replace missing values with empty strings so the string operations below never see NaN
df = df.fillna('')

# Derive 'video_id' from 'titleUrl': keep whatever follows the last '='
df['video_id'] = df['titleUrl'].str.split('=').str[-1]

# Build the text to embed: the title followed by the transcript
df['corpus'] = df['title'] + ' ' + df['transcript']

# Turn each corpus string into a list of chunks (split_text's default chunk size)
df['corpus'] = df['corpus'].apply(split_text)

# One row per chunk. A corpus without any words yields an empty list, which explode()
# turns into NaN; drop those rows since there is nothing to embed for them.
df = df.explode('corpus').dropna(subset=['corpus']).reset_index(drop=True)

# Running chunk counter within each 'video_id', stored as a string
df['paragraph_number'] = df.groupby('video_id').cumcount().astype(str)

# Number of digits in the longest counter (0 when the dataframe is empty)
digit_counts = df['paragraph_number'].str.len()
max_digits = int(digit_counts.max()) if len(digit_counts) else 0

# Left-pad every counter with zeros to that common width
df['paragraph_number'] = df['paragraph_number'].str.zfill(max_digits)

# Row id: 'video_id', an underscore, and the zero-padded 'paragraph_number'
df['id'] = df['video_id'] + '_' + df['paragraph_number']

df.head()



Unnamed: 0,title,titleUrl,datetime,transcript,video_id,corpus,paragraph_number,id
0,How to download your Youtube watch history,https://www.youtube.com/watch?v=dto8jGMxHxY,2024-06-03 08:23:47.724,,dto8jGMxHxY,How to download your Youtube watch history,0,dto8jGMxHxY_000
1,How to Build Data Pipelines for ML Projects (w...,https://www.youtube.com/watch?v=OnIQrDiTtRM,2024-06-03 07:44:20.899,when you think of machine learning fancy algor...,OnIQrDiTtRM,How to Build Data Pipelines for ML Projects (w...,0,OnIQrDiTtRM_000
2,How to Build Data Pipelines for ML Projects (w...,https://www.youtube.com/watch?v=OnIQrDiTtRM,2024-06-03 07:44:20.899,when you think of machine learning fancy algor...,OnIQrDiTtRM,is what connects these two things together NE ...,1,OnIQrDiTtRM_001
3,How to Build Data Pipelines for ML Projects (w...,https://www.youtube.com/watch?v=OnIQrDiTtRM,2024-06-03 07:44:20.899,when you think of machine learning fancy algor...,OnIQrDiTtRM,and columns for example if you're working with...,2,OnIQrDiTtRM_002
4,How to Build Data Pipelines for ML Projects (w...,https://www.youtube.com/watch?v=OnIQrDiTtRM,2024-06-03 07:44:20.899,when you think of machine learning fancy algor...,OnIQrDiTtRM,the extract process which is acquiring data fr...,3,OnIQrDiTtRM_003


### Calculate Embeddings

We compute the embeddings with the SentenceTransformer library rather than ChromaDB's built-in embedding functions, in case we need more control.

In [None]:
# Load the embedding model on the GPU when one is available, otherwise fall back to the CPU
# so the notebook still runs on machines without CUDA.
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

In [None]:
# Encode every chunk of the corpus with the model; `embeddings` holds the result
corpus = df['corpus'].tolist()
embeddings = model.encode(corpus)

In [5]:
# Display the dimensions of the embeddings array as a sanity check
embeddings.shape

(8252, 384)

### Storing in a vector DB (ChromaDB)

https://docs.trychroma.com/guides

In [6]:
import chromadb

In [7]:
# Prepare the ids and the per-chunk metadata in the list form passed to ChromaDB

# One id per chunk row
ids = df['id'].to_list()
# Store the timestamp as a plain string. pd.to_datetime() keeps this cell re-runnable:
# after the first run the column already holds strings, on which `.dt` alone would raise.
df['datetime'] = pd.to_datetime(df['datetime']).dt.strftime('%Y-%m-%d %H:%M:%S')
# One metadata dictionary per chunk row
metadata = df[['video_id', 'paragraph_number', 'title', 'titleUrl', 'datetime']].to_dict('records')


In [10]:
# Open the on-disk ChromaDB store under "chroma" and fetch the history collection,
# creating it if it does not exist yet
chroma_client = chromadb.PersistentClient(
    path="chroma",
)
collection = chroma_client.get_or_create_collection(
    name='yt-history',
)


In [11]:
# upsert rather than add: re-running this cell for a later update must not
# insert the same data a second time
collection.upsert(
    ids=ids,
    embeddings=embeddings,
    metadatas=metadata,
    documents=corpus,
)

In [12]:
# Display how many entries the collection holds after the upsert
collection.count()

8252

In [13]:
# Encode the query with the same SentenceTransformer model that produced the stored
# embeddings. Passing query_texts instead would leave the embedding to the collection's
# own embedding function, which is not the model this notebook chose to control.
results = collection.query(
    query_embeddings=model.encode(["This is a query document"]).tolist(),
    n_results=2,
)

In [None]:
# Display the query results
results