# Step-1 Connect to Chroma Client

In [1]:
import chromadb
chroma_client = chromadb.Client()

# Step-2 Create a Collection

In [2]:
collection = chroma_client.create_collection(name="demo")

# Step-3 Add some text documents to the Collection


- Chroma will store your text, and handle tokenization, embedding, and indexing automatically
- If you have already generated embeddings yourself, you can load them directly in:

In [13]:
collection.add(
    documents=["This is a document", "This is another document","This is not a document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"},{"source": "my_source"}],
    ids=["id1", "id2","id3"]
)

Insert of existing embedding ID: id1
Insert of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2


In [None]:
#collection.add(
    #embeddings=[[1.2, 2.3, 4.5], [6.7, 8.2, 9.2]],
    #documents=["This is a document", "This is another document","this is not a document"],
    #metadatas=[{"source": "my_source"}, {"source": "my_source"},{"source": "my_source"}],
    #ids=["id1", "id2","id3"]
#)

In [14]:
results = collection.query(
    query_texts=["This is a query document"],
    n_results=2
)
results

{'ids': [['id1', 'id3']],
 'distances': [[0.7111214399337769, 0.8618925213813782]],
 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],
 'embeddings': None,
 'documents': [['This is a document', 'This is not a document']],
 'uris': None,
 'data': None}

In [2]:
#create a persistent chroma db
import chromadb
import pandas as pd
#create a persistentClient
client=chromadb.PersistentClient(path=r"C:\Users\dsai9\Projects\Semantic Search Engine\data\ChromaDB_8000+_bkp")

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from tqdm import tqdm

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def text_preprocessing(corpus, flag):
    # Compiled regular expressions for patterns
    pattern1 = re.compile(r'\d+\r\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}')
    pattern2 = re.compile(r'\r\nWatch any video online with Open-SUBTITLES\r\nFree Browser extension: osdb.link/ext\r\n\r\n\r\n')
    pattern3 = re.compile(r'Please rate this subtitle at www.osdb.link/agwma\r\nHelp other users to choose the best subtitles')
    pattern4 = re.compile(r'(\r\n)+')
    pattern5 = re.compile(r'<[/]?\w+>')
    # Stopwords
    stop_words = set(stopwords.words('english'))

    # remove timestamps
    corpus = re.sub(pattern1, '', corpus)

    # remov header and footer
    corpus = re.sub(pattern2, '', corpus)
    corpus = re.sub(pattern3, '', corpus)

    # remove extra line breaks
    corpus = re.sub(pattern4, '\r\n', corpus)

    # remove html tags
    corpus = re.sub(pattern5, '', corpus)

    # change  of numbers
    #p = inflect.engine()
    #corpus = re.sub(r'\d+', lambda x: p.number_to_words(x.group(0)), corpus)

    # remove special characters
    corpus = re.sub('[^a-zA-Z]', ' ', corpus)

    # convert to lower case
    corpus = corpus.lower()

    # removal of whitespaces
    corpus = ' '.join(corpus.split())

    # tokenize
    words = word_tokenize(corpus)

    # Stemming or Lemmatization
    if flag == "stemming":
        # stemming
        stemmer = SnowballStemmer(language='english')
        return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)
    else:
        # lemmatization
        lemmatizer = WordNetLemmatizer()
        return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dsai9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dsai9\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dsai9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# create a function where user query returns top 10 relevant searches
def getResults(query,flag):
    
    # Pre process the query
    text=text_preprocessing(query, flag)
    print(text)
    
      # Create or get a collection
    collection = client.get_or_create_collection(name='transcripts', metadata={"hnsw:space": "cosine"})
    #query with Chroma DB
    results = collection.query(
        query_texts=[text],
        n_results=30
    )
    
    #get distinct-parent ids
    ids=results['ids'][0]
    distinct_ids=set()
    for i in ids:
        distinct_ids.add(i.split('-')[0])
    print(distinct_ids)
    
    #query these distinct ids
    count=0
    results=[]
    for ind in distinct_ids:
        if count>20: break
        #results.append(df.loc[int(ind),'name'])
        #count+=1
    
    return distinct_ids





In [6]:
getResults('Hey God!! Pls help me','stemming')

hey god pls help
{'80479', '81339', '81786', '80394', '81550', '80177', '80158', '81380', '80159', '80076', '80033', '82111', '81001', '80138', '81430', '81074', '80679', '82057', '81631', '81907', '80355', '81653', '81586', '81622', '80247', '80366', '80574', '81431', '80228', '80560'}


{'80033',
 '80076',
 '80138',
 '80158',
 '80159',
 '80177',
 '80228',
 '80247',
 '80355',
 '80366',
 '80394',
 '80479',
 '80560',
 '80574',
 '80679',
 '81001',
 '81074',
 '81339',
 '81380',
 '81430',
 '81431',
 '81550',
 '81586',
 '81622',
 '81631',
 '81653',
 '81786',
 '81907',
 '82057',
 '82111'}

In [2]:
import pandas as pd

In [14]:
pd.read_csv('../data/Pre-processed_content/50000-60000eng_subtitles.csv',index_col=0)

Unnamed: 0_level_0,name,pre-processed_content
index,Unnamed: 1_level_1,Unnamed: 2_level_1
60000,black.sunday.(1977).eng.1cd,use free code joinnow attent pleas passeng fli...
60001,hapritza.hagdola.(1970).eng.1cd,gentlemen get eas way right two still interrog...
60002,witness.in.the.war.zone.(1987).eng.1cd,org deprec pleas implement rest api hello hell...
60003,disengagement.(2007).eng.1cd,cigarett happen french dutch paper prefer mm f...
60004,the.romantics.s01.e01.the.boy.from.jalandhar.(...,katrina kaif hi nice meet lar hi interview hi ...
...,...,...
69995,sky.high.s01.e07.en.la.boca.del.lobo.(2023).en...,advertis product brand contact today sky high ...
69996,sky.high.s01.e01.los.vivos.y.los.muertos.(2023...,inspir real event charact fiction dramat purpo...
69997,sky.high.s01.e02.nuevas.amistades.viejos.enemi...,sky high seri million famili carri heart rip c...
69998,sky.high.s01.e03.cuentas.pendientes.(2023).eng...,sky high seri post offic offic manag deliveri ...
