This notebook aims to find the optimized process to 
- Chunking
- Vectorization or Embeddings

# Import Libraries

In [2]:
import pandas as pd
import numpy as np

# Step-1 Data Ingestion

In [11]:
df=pd.read_csv('../data/0-10000eng_subtitles.csv',index_col=0).copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   name                   10000 non-null  object
 1   pre-processed_content  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [12]:
df

Unnamed: 0_level_0,name,pre-processed_content
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the.message.(1976).eng.1cd,name god gracious merci muhammad messeng god h...
1,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,ah princess dawn terri blooney looney soldier ...
2,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,yumi cell episod extrem polit yumi yumi get ma...
3,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,yumi cell episod laptop first place mine mine ...
4,broker.(2022).eng.1cd,go throw away give birth pleas take care anyth...
...,...,...
9995,recess.(1997).eng.1cd,bell ring children cheer wa umph ah burp right...
9996,recess.(1997).eng.1cd,bell ring children cheer pop ah crash ah burp ...
9997,recess.(1997).eng.1cd,bell ring cheer yell whimper org deprec pleas ...
9998,recess.(1997).eng.1cd,bell ring children cheer wha use free code joi...


# Step- 2 Chunking

In [13]:
#create two lists
new_id=[]
new_content=[]

#loop through the dataframe and chunk the content
def chunker(chunk_size,id,content):
    
    tokens=[token for token in content.split()]
    #get length of the tokens
    n=len(tokens)
    
    #initialize index and count
    index,count=0,0
    #print(id,content,tokens)
    while True:
        if index==0:
            si,ei=index,256
            if ei>n:
                new_id.append(f"{id}-{count}")
                new_content.append(" ".join(tokens[si:]))
                break
            else:
                new_id.append(f"{id}-{count}")
                new_content.append(" ".join(tokens[si:ei]))
                index=ei
        else:
            si,ei=index-chunk_size,index+256-chunk_size #chunking buffer
            if ei>n:
                new_id.append(f"{id}-{count}")
                new_content.append(" ".join(tokens[si:]))
                break
            else:
                new_id.append(f"{id}-{count}")
                new_content.append(" ".join(tokens[si:ei]))
                index=ei
        count+=1
        

In [5]:
for i in range(len(df)):
    chunker(25,i,df['pre-processed content'][i])

In [6]:
new_df=pd.DataFrame({'id':new_id,'content':new_content})
new_df

Unnamed: 0,id,content
0,0-0,name god gracious merci muhammad messeng god h...
1,0-1,poet joy kit love kin wine cake abound skill a...
2,0-2,talk tell uncl protect child protect still say...
3,0-3,ammar see god kaaba everi day afraid listen pe...
4,0-4,pleasant idea slave beggar give pretens bilal ...
...,...,...
1261,99-9,exeunt flourish work think still reboot need l...
1262,99-10,hey spirit gon na chant name right lay lay rea...
1263,99-11,notic lay lay pleas hear real teenag girl avat...
1264,99-12,life best besti anyon could ask hiccup tri alw...


In [7]:
for i in range(3):
    print(new_df.loc[i,'content'])

name god gracious merci muhammad messeng god heraclius emperor byzantium greet follow righteous guidanc bid hear divin call messeng god peopl accept islam salvat speak new prophet arabia like john baptist came king herod desert cri salvat muqawqi patriarch alexandria kisra emperor persia muhammad call call god accept islam salvat embrac islam come desert smell camel goat tell persia kneel muhammad messeng god gave author god sent muhammad merci mankind scholar historian islam univers al azhar cairo high islam congress shiat lebanon maker film honour islam tradit hold imperson prophet offend spiritu messag therefor person mohammad shown six hundr year christ die europ sunk dark age everywher old civil fall muhammad born mecca arabia mecca rich trade citi rule merchant whose wealth multipli uniqu privileg hous god everi year time great fair desert priest brought idol imag god custodi kaaba holi shrine abraham kaaba becom hous idolotri host fewer three hundr sixti differ god mecca six hun

In [11]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1266 non-null   object
 1   content  1266 non-null   object
dtypes: object(2)
memory usage: 19.9+ KB


### Observation
- Chunking has been performed with buffer size of 25 tokens

# Step-2 Send thsee tokens to Chroma DB

In [8]:
import chromadb
chroma_client = chromadb.Client()

In [9]:
collection = chroma_client.create_collection(name="demo")

In [12]:
collection.add(
    documents=new_df['content'].to_list(),
    ids=new_df['id'].to_list(),
)

In [13]:
dialogue='may ask give lion judah know begin friendship '
results = collection.query(
    query_texts=[dialogue],
    n_results=2
)
results

{'ids': [['0-6', '0-15']],
 'distances': [[1.1682987213134766, 1.3359113931655884]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['may ask give lion judah know begin friendship begin certain runaway slave escap us kingdom slave go back doubt would return slave us howev free men among rebel rebel disturb arabia inform rebel religion one time anoth religion rebellion bodi slave world beaten dispos jesus christ shepherd soul men sheep arab betray religion father follow lunat call prophet put soul chain without hear good stiff next hang bow prophet muhammad man kneel god muhammad miracl jafar prophet light sky miracl inde true god given prophet sign miracl may recogn miracl muhammad holi quran book book written illiter attribut god think emperor enough mind petti cost god set tongu fire upon head christ apostl could speak mani languag world knew miracl happen time heard enough made poor case suffer persecut mecca muhammad told us go abyssinia land righteous king man w

In [17]:
#Just return id
ids=results['ids'][0]
distinct_ids=set()
for i in ids:
    distinct_ids.add(i.split('-')[0])
distinct_ids

{'0'}

In [20]:
# Return the name of the series
for ind in distinct_ids:
    print(df.loc[int(ind),'name'])

the.message.(1976).eng.1cd


In [25]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import inflect
from tqdm import tqdm

# Download NLTK resources if not already downloaded
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')

def text_preprocessing(corpus, flag):
    # Compiled regular expressions for patterns
    pattern1 = re.compile(r'\d+\r\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}')
    pattern2 = re.compile(r'\r\nWatch any video online with Open-SUBTITLES\r\nFree Browser extension: osdb.link/ext\r\n\r\n\r\n')
    pattern3 = re.compile(r'Please rate this subtitle at www.osdb.link/agwma\r\nHelp other users to choose the best subtitles')
    pattern4 = re.compile(r'(\r\n)+')
    pattern5 = re.compile(r'<[/]?\w+>')
    # Stopwords
    stop_words = set(stopwords.words('english'))

    # remove timestamps
    corpus = re.sub(pattern1, '', corpus)

    # remov header and footer
    corpus = re.sub(pattern2, '', corpus)
    corpus = re.sub(pattern3, '', corpus)

    # remove extra line breaks
    corpus = re.sub(pattern4, '\r\n', corpus)

    # remove html tags
    corpus = re.sub(pattern5, '', corpus)

    # change  of numbers
    #p = inflect.engine()
    #corpus = re.sub(r'\d+', lambda x: p.number_to_words(x.group(0)), corpus)

    # remove special characters
    corpus = re.sub('[^a-zA-Z]', ' ', corpus)

    # convert to lower case
    corpus = corpus.lower()

    # removal of whitespaces
    corpus = ' '.join(corpus.split())

    # tokenize
    words = word_tokenize(corpus)

    # Stemming or Lemmatization
    if flag == "stemming":
        # stemming
        stemmer = SnowballStemmer(language='english')
        return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)
    else:
        # lemmatization
        lemmatizer = WordNetLemmatizer()
        return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)





In [47]:
# create a function where user query returns top 10 relevant searches
def getResults(query,flag):
    
    # Pre process the query
    text=text_preprocessing(query, flag)
    print(text)
    
    #query with Chroma DB
    results = collection.query(
        query_texts=[text],
        n_results=30
    )
    
    #get distinct-parent ids
    ids=results['ids'][0]
    distinct_ids=set()
    for i in ids:
        distinct_ids.add(i.split('-')[0])
    
    #query these distinct ids
    count=0
    results=[]
    for ind in distinct_ids:
        if count>20: break
        results.append(df.loc[int(ind),'name'])
        count+=1
    
    return results





In [48]:
getResults("A former TV showgirl, now in full physical and mental decline",'stemmatization')

former tv showgirl full physical mental decline


['my.unfamiliar.family.s01.e09.episode.1.9.(2020).eng.1cd',
 'my.unfamiliar.family.s01.e02.episode.1.2.(2020).eng.1cd',
 'my.unfamiliar.family.(2020).eng.1cd',
 'my.unfamiliar.family.(2020).eng.1cd',
 'that.girl.lay.lay.s02.e01.aint.that.a.glitch.part.one.(2022).eng.1cd',
 'westworld.s04.e05.zhuangzi.(2022).eng.1cd',
 'that.girl.lay.lay.s02.e01.aint.that.a.glitch.part.one.(2022).eng.1cd',
 'my.unfamiliar.family.s01.e05.episode.1.5.(2020).eng.1cd',
 'price.check.(2012).eng.1cd',
 'my.unfamiliar.family.s01.e03.episode.1.3.(2020).eng.1cd',
 'my.unfamiliar.family.(2020).eng.1cd',
 'the.governor.s02.e04.episode.2.4.(1996).eng.1cd',
 'survivor.s03.e16.survivor.back.from.africa.(2002).eng.1cd',
 'trying.s02.e01.a.nice.boy.(2021).eng.1cd',
 'my.unfamiliar.family.s01.e12.episode.1.12.(2020).eng.1cd',
 'doctor.lawyer.s01.e14.episode.1.14.(2022).eng.1cd',
 'the.governor.s02.e06.episode.2.6.(1996).eng.1cd',
 'the.great.beauty.(2013).eng.1cd',
 'the.governor.s02.e02.episode.2.2.(1996).eng.1cd',
 'm

# Observation
- The performance is not that great

## Step -2.1 Let's send lemmatized tokens and check the performance

In [67]:
lem_df=df.copy()
lem_df['pre-processed content']=df['decoded_content'].apply(lambda x:text_preprocessing(x,'lemmatization'))
lem_df.head()

Unnamed: 0,name,decoded_content,pre-processed content
0,the.message.(1976).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...",name god gracious merciful muhammad messenger ...
1,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther...",ah princess dawn terry blooney looney soldier ...
2,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,"1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'...",yumi cell episode extremely polite yumi yumi g...
3,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...",yumi cell episode laptop first place mine mine...
4,broker.(2022).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...",going throw away give birth please take care a...


In [68]:
#create two lists
new_id=[]
new_content=[]
#chunker
for i in range(len(lem_df)):
    chunker(25,i,lem_df['pre-processed content'][i])

new_lem_df=pd.DataFrame({'id':new_id,'content':new_content})
new_lem_df
new_lem_df.head()

Unnamed: 0,id,content
0,0-0,name god gracious merciful muhammad messenger ...
1,0-1,abu sofyan revel song begin abu sofyan invite ...
2,0-2,restrain nephew dividing city heart house divi...
3,0-3,muhammad generous yes give share pas man witho...
4,0-4,lash face teach mouth lesson whip whip cut whi...


In [69]:
collection=chroma_client.get_or_create_collection(name='demo2')

In [70]:
collection.add(
    documents=new_lem_df['content'].to_list(),
    ids=new_lem_df['id'].to_list(),
)

In [74]:

# create a function where user query returns top 10 relevant searches
getResults("Hello there",'lemmatization')



hello


['animal.kingdom.s06.e08.revelation.(2022).eng.1cd',
 'yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd',
 'trying.s02.e03.big.heads.(2021).eng.1cd',
 'parallel.world.pharmacy.s01.e03.the.chief.royal.pharmacian.and.the.reincarnated.pharmacologist.(2022).eng.1cd',
 'my.unfamiliar.family.(2020).eng.1cd',
 'trying.s02.e02.the.sun.on.your.back.(2021).eng.1cd',
 'survivor.(2000).eng.1cd',
 'the.governor.s01.e02.episode.1.2.(1995).eng.1cd',
 'my.unfamiliar.family.s01.e07.episode.1.7.(2020).eng.1cd',
 'my.unfamiliar.family.s01.e05.episode.1.5.(2020).eng.1cd',
 'rudrabinar.obhishaap.s02.e05.saat.surer.mejaj.(2022).eng.1cd',
 'the.governor.s01.e02.episode.1.2.(1995).eng.1cd',
 'animal.kingdom.s06.e08.revelation.(2022).eng.1cd',
 'rudrabinar.obhishaap.s02.e01.swaralipir.kut.taan.(2022).eng.1cd',
 'reetur.s01.e01.disappointment.and.recruitment.(2019).eng.1cd',
 'my.unfamiliar.family.s01.e01.episode.1.1.(2020).eng.1cd',
 'rudrabinar.obhishaap.s02.e03.anandagarher.akhhyan.(2022).eng.1cd',
 'trying.s

In [52]:
df.head()

Unnamed: 0,name,decoded_content,pre-processed content
0,the.message.(1976).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...",name god gracious merci muhammad messeng god h...
1,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther...",ah princess dawn terri blooney looney soldier ...
2,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,"1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'...",yumi cell episod extrem polit yumi yumi get ma...
3,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...",yumi cell episod laptop first place mine mine ...
4,broker.(2022).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...",go throw away give birth pleas take care anyth...
