# Import Libraries


In [1]:
import numpy as np
import pandas as pd

# Step-I Data Ingestion

In [2]:
# Read the pre-processed subtitle table; the first CSV column supplies the row index.
df = pd.read_csv(
    '../data/0-10000eng_subtitles.csv',
    index_col=0,
).copy()
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0_level_0,name,pre-processed_content
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the.message.(1976).eng.1cd,name god gracious merci muhammad messeng god h...
1,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,ah princess dawn terri blooney looney soldier ...
2,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,yumi cell episod extrem polit yumi yumi get ma...
3,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,yumi cell episod laptop first place mine mine ...
4,broker.(2022).eng.1cd,go throw away give birth pleas take care anyth...
...,...,...
9995,recess.(1997).eng.1cd,bell ring children cheer wa umph ah burp right...
9996,recess.(1997).eng.1cd,bell ring children cheer pop ah crash ah burp ...
9997,recess.(1997).eng.1cd,bell ring cheer yell whimper org deprec pleas ...
9998,recess.(1997).eng.1cd,bell ring children cheer wha use free code joi...


# Step-II Perform Chunking

In [3]:
# Module-level accumulators that chunker() appends to: one id and one text per chunk.
new_id = []
new_content = []


def chunker(chunk_size, id, content, window_size=256):
    """Split ``content`` into overlapping token windows and record them globally.

    Every window is appended to the module-level ``new_content`` list and its
    identifier ``"<id>-<n>"`` (``n`` counts windows from 0) to ``new_id``.

    Parameters
    ----------
    chunk_size : int
        Despite the name, this is the *overlap*: how many trailing tokens of
        one window are repeated at the start of the next one.
    id : hashable
        Identifier of the source row; used as the prefix of every chunk id.
    content : str
        Whitespace-separated text. A non-string value (e.g. the NaN pandas
        yields for an empty cell) is treated as empty text.
    window_size : int, default 256
        Maximum number of tokens per window.

    Raises
    ------
    ValueError
        If the windows would not advance (``chunk_size >= window_size``) or
        ``window_size`` is not positive; the loop below would never terminate.

    Notes
    -----
    Empty text still yields a single empty chunk ``"<id>-0"``.
    """
    stride = window_size - chunk_size
    if window_size <= 0 or stride <= 0:
        raise ValueError(
            "chunk_size (overlap) must be smaller than window_size, "
            "and window_size must be positive"
        )

    tokens = content.split() if isinstance(content, str) else []
    total = len(tokens)

    start, count = 0, 0
    while True:
        end = start + window_size
        new_id.append(f"{id}-{count}")
        # Slicing past the end is safe: the last window is simply shorter.
        new_content.append(" ".join(tokens[start:end]))
        # Stop as soon as this window reaches the last token. Using >= (not >)
        # avoids a trailing chunk that only repeats the overlap.
        if end >= total:
            break
        start += stride
        count += 1
        

In [4]:
# Start from empty lists so that re-running this cell does not append every
# chunk a second time to the module-level accumulators.
new_id.clear()
new_content.clear()

# Iterate over (index label, text) pairs. The chunk id prefix is the row's
# index label, which is what a later df.loc[...] lookup expects, and this does
# not assume the index is exactly 0..len(df)-1.
for row_label, text in df['pre-processed_content'].items():
    chunker(25, row_label, text)

new_df = pd.DataFrame({'id': new_id, 'content': new_content})
new_df

Unnamed: 0,id,content
0,0-0,name god gracious merci muhammad messeng god h...
1,0-1,abu sofyan revel song begin abu sofyan invit p...
2,0-2,restrain nephew divid citi heart hous divid ge...
3,0-3,muhammad generous yes give share pass man with...
4,0-4,lash face teach mouth lesson whip whip cut whi...
...,...,...
116920,9999-2,candi spa guess got minut well thank ladi migh...
116921,9999-3,fallen trap door yeah gotten strap laboratori ...
116922,9999-4,tell futur hold besid rainbow happi man would ...
116923,9999-5,dinosaur earthquak gold mine believ clearanc t...


# Step-III Send to ChromaDB

In [26]:
#import chromadb
import chromadb

In [27]:
# Open the on-disk Chroma store located next to the data files.
client = chromadb.PersistentClient(
    path="../data/chromaDB",
)

In [28]:
# Fetch the 'demo' collection, creating it with cosine distance if it is missing.
collection = client.get_or_create_collection(
    name='demo',
    metadata={"hnsw:space": "cosine"},
)


In [29]:
# Smoke test: store only the first ten chunks.
collection.add(
    ids=new_df['id'].iloc[:10].to_list(),
    documents=new_df['content'].iloc[:10].to_list(),
)

In [18]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import inflect
from tqdm import tqdm

# Download NLTK resources if not already downloaded
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')

def text_preprocessing(corpus, flag):
    """Clean subtitle text (or a search query) and reduce it to stems or lemmas.

    Processing order: strip SubRip cue numbers with their timestamps, strip the
    OpenSubtitles header/footer adverts, collapse repeated line breaks, drop
    simple HTML tags, keep letters only, lowercase, tokenize, remove English
    stop words, then stem or lemmatize each remaining word.

    Parameters
    ----------
    corpus : str
        Raw text to clean.
    flag : str
        ``"stemming"`` selects the Snowball stemmer; any other value selects
        the WordNet lemmatizer.

    Returns
    -------
    str
        The processed words joined by single spaces.

    Notes
    -----
    Uses NLTK's tokenizer, stop-word list and WordNet data, which must have
    been downloaded beforehand.
    """
    # Cue index line followed by the "HH:MM:SS,mmm --> HH:MM:SS,mmm" line.
    pattern1 = re.compile(r'\d+\r\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}')
    # Header/footer adverts. The dots in the URLs are escaped so they match a
    # literal "." only, not any character.
    pattern2 = re.compile(r'\r\nWatch any video online with Open-SUBTITLES\r\nFree Browser extension: osdb\.link/ext\r\n\r\n\r\n')
    pattern3 = re.compile(r'Please rate this subtitle at www\.osdb\.link/agwma\r\nHelp other users to choose the best subtitles')
    pattern4 = re.compile(r'(\r\n)+')
    # Attribute-less tags such as <i> or </b>.
    pattern5 = re.compile(r'<[/]?\w+>')

    # Stopwords
    stop_words = set(stopwords.words('english'))

    # remove timestamps
    corpus = pattern1.sub('', corpus)

    # remove header and footer
    corpus = pattern2.sub('', corpus)
    corpus = pattern3.sub('', corpus)

    # remove extra line breaks
    corpus = pattern4.sub('\r\n', corpus)

    # remove html tags
    corpus = pattern5.sub('', corpus)

    # remove special characters (digits and punctuation become spaces)
    corpus = re.sub('[^a-zA-Z]', ' ', corpus)

    # convert to lower case
    corpus = corpus.lower()

    # removal of whitespaces
    corpus = ' '.join(corpus.split())

    # tokenize
    words = word_tokenize(corpus)

    # Stemming or Lemmatization
    if flag == "stemming":
        stemmer = SnowballStemmer(language='english')
        return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)





In [19]:
# Given a user query, return the names of the most relevant subtitle files.
def getResults(query, flag, top_k=10):
    """Search the Chroma collection and return the best-matching subtitle names.

    The query is normalised with ``text_preprocessing`` and the 20 nearest
    chunks are fetched from the global ``collection``. Chunk ids have the form
    ``"<parent>-<n>"``; they are collapsed to their parent row, keeping the
    order in which Chroma returned them (closest first). Each parent is then
    looked up in the global ``df`` by index label.

    Parameters
    ----------
    query : str
        Free-text user query.
    flag : str
        Passed through to ``text_preprocessing``.
    top_k : int, default 10
        Maximum number of distinct parent rows to return.

    Returns
    -------
    list
        Values of ``df['name']`` for the matched parents, best match first.
    """
    # Pre process the query
    text = text_preprocessing(query, flag)
    print(text)

    # query with Chroma DB
    results = collection.query(
        query_texts=[text],
        n_results=20
    )

    # Collapse chunk ids to distinct parents without losing the ranking.
    # rsplit keeps a parent id intact even if it contains '-' itself.
    ranked_parents = []
    seen = set()
    for chunk_id in results['ids'][0]:
        parent = chunk_id.rsplit('-', 1)[0]
        if parent not in seen:
            seen.add(parent)
            ranked_parents.append(parent)

    # Resolve each parent row to its subtitle name.
    return [df.loc[int(parent), 'name'] for parent in ranked_parents[:top_k]]





In [None]:
# Example search using the stemming variant of the pre-processing.
getResults(query='Oh! God pls help me', flag='stemming')

# Step-IV Create a pipeline
- Let's create a pipeline that takes a 10,000-row dataframe of pre-processed transcripts,
- chunks them, and uploads the chunks to the vector database in the same run.

In [31]:
import chromadb
import pandas as pd
# Open the on-disk Chroma store used by the upload pipeline.
client = chromadb.PersistentClient(
    path="../data/chromaDB",
)

In [61]:
from tqdm import tqdm

def new_chunker(chunk_size, id, content, collection, new_id, new_content, window_size=256):
    """Split ``content`` into overlapping token windows, appending to the given lists.

    Each window is appended to ``new_content`` and its identifier
    ``"<id>-<n>"`` (``n`` counts windows from 0) to ``new_id``.

    Parameters
    ----------
    chunk_size : int
        Despite the name, this is the *overlap*: how many trailing tokens of
        one window are repeated at the start of the next one.
    id : hashable
        Identifier of the source row; used as the prefix of every chunk id.
    content : str
        Whitespace-separated text. A non-string value (e.g. the NaN pandas
        yields for an empty cell) is treated as empty text.
    collection : object
        Not used by this function; kept so existing callers keep working.
    new_id, new_content : list
        Output lists, extended in place.
    window_size : int, default 256
        Maximum number of tokens per window.

    Raises
    ------
    ValueError
        If the windows would not advance (``chunk_size >= window_size``) or
        ``window_size`` is not positive; the loop below would never terminate.

    Notes
    -----
    Empty text still yields a single empty chunk ``"<id>-0"``.
    """
    stride = window_size - chunk_size
    if window_size <= 0 or stride <= 0:
        raise ValueError(
            "chunk_size (overlap) must be smaller than window_size, "
            "and window_size must be positive"
        )

    tokens = content.split() if isinstance(content, str) else []
    total = len(tokens)

    start, count = 0, 0
    while True:
        end = start + window_size
        new_id.append(f"{id}-{count}")
        # Slicing past the end is safe: the last window is simply shorter.
        new_content.append(" ".join(tokens[start:end]))
        # Stop as soon as this window reaches the last token. Using >= (not >)
        # avoids a trailing chunk that only repeats the overlap.
        if end >= total:
            break
        start += stride
        count += 1

In [33]:
def Chunker_Uploader(chunk_size, collection_name, file_path, pass_size=10000):
    """Chunk every transcript in a CSV file and store the chunks in Chroma.

    Parameters
    ----------
    chunk_size : int
        Forwarded to ``new_chunker`` (the token overlap between windows).
    collection_name : str
        Name of the Chroma collection to get or create (cosine distance).
    file_path : str
        CSV file whose first column is the index and which has a
        ``'pre-processed_content'`` column.
    pass_size : int, default 10000
        Upper bound on the number of chunks sent per ``collection.add`` call.
        It is lowered to the client's reported maximum batch size when the
        client exposes one.

    Notes
    -----
    Uses the module-level ``client``. Chunk ids are ``"<index label>-<n>"``,
    so running this twice on the same file submits the same ids again.
    """
    if pass_size <= 0:
        raise ValueError("pass_size must be a positive integer")

    # Data Ingestion
    df = pd.read_csv(file_path, index_col=0)
    # create or get a collection
    collection = client.get_or_create_collection(name=collection_name, metadata={"hnsw:space": "cosine"})

    # chunk ids and chunk texts, filled by new_chunker
    new_id = []
    new_content = []
    # .items() yields (index label, text) and copes with duplicate labels.
    for row_label, text in df['pre-processed_content'].items():
        new_chunker(chunk_size, row_label, text, collection, new_id, new_content)

    # Respect the client's own limit when it reports one (the attribute name
    # differs between chromadb releases, hence the two lookups).
    limit_getter = getattr(client, "get_max_batch_size", None)
    if callable(limit_getter):
        client_limit = limit_getter()
    else:
        client_limit = getattr(client, "max_batch_size", pass_size)
    batch_size = max(1, min(pass_size, client_limit))

    progress_bar = tqdm(total=len(new_id), desc="Adding vectors")
    try:
        # One add() per batch instead of one per chunk.
        for start in range(0, len(new_id), batch_size):
            batch_ids = new_id[start:start + batch_size]
            batch_docs = new_content[start:start + batch_size]
            collection.add(ids=batch_ids, documents=batch_docs)
            progress_bar.update(len(batch_ids))
    finally:
        progress_bar.close()

    print('Vectors are added!!!')
    
    
    

In [14]:
# Run the pipeline on the first subtitle file with a 25-token overlap.
Chunker_Uploader(
    chunk_size=25,
    collection_name='transcripts',
    file_path='../data/0-10000eng_subtitles.csv',
)

Adding vectors: 100%|██████████| 1125/1125 [00:45<00:00, 24.61it/s]

Vectors are added!!!





### Observation
- The `Chunker_Uploader` above uploads all 1 lakh+ (100,000+) vectors in a single pass.
- However, ChromaDB indicates that at most 1 lakh (100,000) vectors can be added in a single pass, so the uploader needs to be modified to upload in smaller passes.

# Step-V Use the pipeline to add vectors to Chroma DB

In [13]:
import chromadb
import pandas as pd
# Open the on-disk Chroma store that the uploaders below write to.
client = chromadb.PersistentClient(
    path="../data/chromaDB",
)

In [2]:
from tqdm import tqdm

def new_chunker(chunk_size, id, content, collection, new_id, new_content, window_size=256):
    """Split ``content`` into overlapping token windows, appending to the given lists.

    Each window is appended to ``new_content`` and its identifier
    ``"<id>-<n>"`` (``n`` counts windows from 0) to ``new_id``.

    Parameters
    ----------
    chunk_size : int
        Despite the name, this is the *overlap*: how many trailing tokens of
        one window are repeated at the start of the next one.
    id : hashable
        Identifier of the source row; used as the prefix of every chunk id.
    content : str
        Whitespace-separated text. A non-string value (e.g. the NaN pandas
        yields for an empty cell) is treated as empty text.
    collection : object
        Not used by this function; kept so existing callers keep working.
    new_id, new_content : list
        Output lists, extended in place.
    window_size : int, default 256
        Maximum number of tokens per window.

    Raises
    ------
    ValueError
        If the windows would not advance (``chunk_size >= window_size``) or
        ``window_size`` is not positive; the loop below would never terminate.

    Notes
    -----
    Empty text still yields a single empty chunk ``"<id>-0"``.
    """
    stride = window_size - chunk_size
    if window_size <= 0 or stride <= 0:
        raise ValueError(
            "chunk_size (overlap) must be smaller than window_size, "
            "and window_size must be positive"
        )

    tokens = content.split() if isinstance(content, str) else []
    total = len(tokens)

    start, count = 0, 0
    while True:
        end = start + window_size
        new_id.append(f"{id}-{count}")
        # Slicing past the end is safe: the last window is simply shorter.
        new_content.append(" ".join(tokens[start:end]))
        # Stop as soon as this window reaches the last token. Using >= (not >)
        # avoids a trailing chunk that only repeats the overlap.
        if end >= total:
            break
        start += stride
        count += 1

In [20]:
def Chunker_Uploader1(chunk_size, collection_name, file_path, pass_size=10000):
    """Chunk the first 11,000 transcripts of a CSV file and store them in Chroma.

    Parameters
    ----------
    chunk_size : int
        Forwarded to ``new_chunker`` (the token overlap between windows).
    collection_name : str
        Name of the Chroma collection to get or create (cosine distance).
    file_path : str
        CSV file whose first column is the index and which has a
        ``'pre-processed_content'`` column.
    pass_size : int, default 10000
        Upper bound on the number of chunks sent per ``collection.add`` call.
        It is lowered to the client's reported maximum batch size when the
        client exposes one.

    Notes
    -----
    Uses the module-level ``client``. Chunk ids are ``"<index label>-<n>"``,
    so running this twice on the same file submits the same ids again.
    """
    if pass_size <= 0:
        raise ValueError("pass_size must be a positive integer")

    # Data Ingestion
    df = pd.read_csv(file_path, index_col=0)
    df = df.iloc[:11000, :]
    # create or get a collection
    collection = client.get_or_create_collection(name=collection_name, metadata={"hnsw:space": "cosine"})

    # chunk ids and chunk texts, filled by new_chunker
    new_id = []
    new_content = []
    print('Chunking Began')
    # .items() yields (index label, text) and copes with duplicate labels.
    for row_label, text in df['pre-processed_content'].items():
        new_chunker(chunk_size, row_label, text, collection, new_id, new_content)

    print('Chunking Completed')

    # Respect the client's own limit when it reports one (the attribute name
    # differs between chromadb releases, hence the two lookups).
    limit_getter = getattr(client, "get_max_batch_size", None)
    if callable(limit_getter):
        client_limit = limit_getter()
    else:
        client_limit = getattr(client, "max_batch_size", pass_size)
    batch_size = max(1, min(pass_size, client_limit))

    # A single bar over all chunks, so it reaches 100% exactly once.
    progress_bar = tqdm(total=len(new_id), desc="Adding vectors")
    try:
        # One add() per pass instead of one per chunk.
        for start in range(0, len(new_id), batch_size):
            curr_id = new_id[start:start + batch_size]
            curr_content = new_content[start:start + batch_size]
            collection.add(ids=curr_id, documents=curr_content)
            progress_bar.update(len(curr_id))
    finally:
        progress_bar.close()

    print('Vectors are added!!!')
        
    
    

In [17]:
from tqdm import tqdm
import pandas as pd

def Chunker_Uploader(chunk_size, collection_name, file_path, pass_size=10000):
    """Chunk the first 11,000 transcripts of a CSV file and store them in Chroma.

    Parameters
    ----------
    chunk_size : int
        Forwarded to ``new_chunker`` (the token overlap between windows).
    collection_name : str
        Name of the Chroma collection to get or create (cosine distance).
    file_path : str
        CSV file whose first column is the index and which has a
        ``'pre-processed_content'`` column.
    pass_size : int, default 10000
        Upper bound on the number of chunks sent per ``collection.add`` call.
        It is lowered to the client's reported maximum batch size when the
        client exposes one.

    Notes
    -----
    Uses the module-level ``client``. Chunk ids are ``"<index label>-<n>"``,
    so running this twice on the same file submits the same ids again.
    """
    if pass_size <= 0:
        raise ValueError("pass_size must be a positive integer")

    # Data Ingestion
    df = pd.read_csv(file_path, index_col=0)
    df = df.iloc[:11000, :]  # Limiting the dataframe for demonstration

    # Create or get a collection
    collection = client.get_or_create_collection(name=collection_name, metadata={"hnsw:space": "cosine"})

    # Chunk ids and chunk texts, filled by new_chunker
    new_id = []
    new_content = []

    print('Chunking Began')
    # .items() yields (index label, text) and copes with duplicate labels.
    for row_label, text in df['pre-processed_content'].items():
        new_chunker(chunk_size, row_label, text, collection, new_id, new_content)

    print('Chunking Completed')

    # Respect the client's own limit when it reports one (the attribute name
    # differs between chromadb releases, hence the two lookups).
    limit_getter = getattr(client, "get_max_batch_size", None)
    if callable(limit_getter):
        client_limit = limit_getter()
    else:
        client_limit = getattr(client, "max_batch_size", pass_size)
    batch_size = max(1, min(pass_size, client_limit))

    progress_bar = tqdm(total=len(new_id), desc="Adding vectors", unit="doc")
    try:
        # One add() per pass instead of one per chunk.
        for start in range(0, len(new_id), batch_size):
            curr_id = new_id[start:start + batch_size]
            curr_content = new_content[start:start + batch_size]
            collection.add(ids=curr_id, documents=curr_content)
            progress_bar.update(len(curr_id))
    finally:
        progress_bar.close()

    print('Vectors are added!!!')


In [None]:
# Trial run of the batched uploader against the 'demo' collection.
Chunker_Uploader1(
    chunk_size=25,
    collection_name='demo',
    file_path='../data/Pre-processed_content/0-10000eng_subtitles.csv',
)

In [2]:
# Drop the throw-away 'demo' collection once the trial run is finished.
client.delete_collection(name='demo')

In [17]:
# Re-open the uploaded collection and report how many chunks it holds.
# The cosine metadata matches every other get_or_create_collection call in
# this notebook, so if the collection is missing it is not created with a
# different distance metric.
collection = client.get_or_create_collection(name='transcripts', metadata={"hnsw:space": "cosine"})
collection.count()

6239

### Observation
- Chunks are being uploaded to the Chroma DB
