### Importing the saved csv file

In [9]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.31.1-py

In [10]:
import sqlite3
import pandas as pd
import zipfile
import io
import re
import nltk
import torch
import random
import string
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import chromadb
from chromadb.utils import embedding_functions
from nltk.tokenize import word_tokenize

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
path='/content/drive/MyDrive/Enhancing-Search-Engine-Relavance-for-Video-Subtitles/Data/eng_subtitles.csv'

In [6]:
df = pd.read_csv(path)

In [7]:
df.head()

Unnamed: 0,num,name,content
0,9251120,maybe.this.time.(2014).eng.1cd,watch any video online with opensubtitles free...
1,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,oh i know that its getting late but i dont wan...
2,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,timing and subtitles by the uncontrollable lov...
3,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,apiopensubtitlesorg is deprecated please imple...
4,9408707,battlebots.(2015).eng.1cd,oh no not the minibots oh you leave those litt...


In [13]:
df["content"].fillna("", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["content"].fillna("", inplace=True)


### Chunking function

In [8]:
chunk_size  = 500
overlap =100

def chunk_text(text):
    chunks = []
    start =0
    while start < len(text):
        chunk = text[start:start + chunk_size]
        chunks.append(chunk.lower())
        start+=chunk_size -overlap
    return chunks

### Store the embeddings in chromadb


In [9]:
def store_embeddings_in_chromadb(subtitles_df):

    print("Initializing ChromaDB...")
    client = chromadb.PersistentClient(path="/content/drive/MyDrive/Enhancing-Search-Engine-Relavance-for-Video-Subtitles/Data/En_seach_engine_subtitles.db")
    collection = client.get_or_create_collection(name="chromadb_En_sub_embeddings")

    print("Loading embedding model...")
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda' if torch.cuda.is_available() else 'cpu')

    print("Cleaning text...")
    subtitles_df['cleaned_content'] = subtitles_df['content']

    batch_size = 100
    print(f"Processing {len(subtitles_df)} subtitles in batches of {batch_size}...")

    for start in range(0, len(subtitles_df), batch_size):
        print(f"Processing batch {start} to {start + batch_size}...")
        end = start + batch_size
        batch = subtitles_df.iloc[start:end]

        all_chunks, all_ids, all_metadatas = [], [], []

        for num, name, content in zip(batch['num'], batch['name'], batch['cleaned_content']):
            chunks = chunk_text(content)
            all_chunks.extend(chunks)
            all_ids.extend([f"{num}_{i}" for i in range(len(chunks))])
            all_metadatas.extend([{"name": name, "content": chunk} for chunk in chunks])

        print(f"Encoding {len(all_chunks)} chunks...")
        with torch.no_grad():
            embeddings = model.encode(all_chunks, batch_size=32, show_progress_bar=True).tolist()

        print("Adding embeddings to ChromaDB...")
        collection.add(
            ids=all_ids,
            embeddings=embeddings,
            metadatas=all_metadatas
        )

    print("Embedding storage complete.")
    return collection

In [10]:
collection = store_embeddings_in_chromadb(df)

Initializing ChromaDB...
Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Cleaning text...
Processing 20624 subtitles in batches of 100...
Processing batch 0 to 100...
Encoding 6839 chunks...


Batches:   0%|          | 0/214 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 100 to 200...
Encoding 6176 chunks...


Batches:   0%|          | 0/193 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 200 to 300...
Encoding 6083 chunks...


Batches:   0%|          | 0/191 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 300 to 400...
Encoding 6562 chunks...


Batches:   0%|          | 0/206 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 400 to 500...
Encoding 6746 chunks...


Batches:   0%|          | 0/211 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 500 to 600...
Encoding 7553 chunks...


Batches:   0%|          | 0/237 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 600 to 700...
Encoding 6451 chunks...


Batches:   0%|          | 0/202 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 700 to 800...
Encoding 6786 chunks...


Batches:   0%|          | 0/213 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 800 to 900...
Encoding 6751 chunks...


Batches:   0%|          | 0/211 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 900 to 1000...
Encoding 6371 chunks...


Batches:   0%|          | 0/200 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1000 to 1100...
Encoding 6657 chunks...


Batches:   0%|          | 0/209 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1100 to 1200...
Encoding 6494 chunks...


Batches:   0%|          | 0/203 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1200 to 1300...
Encoding 6565 chunks...


Batches:   0%|          | 0/206 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1300 to 1400...
Encoding 6549 chunks...


Batches:   0%|          | 0/205 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1400 to 1500...
Encoding 6738 chunks...


Batches:   0%|          | 0/211 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1500 to 1600...
Encoding 6707 chunks...


Batches:   0%|          | 0/210 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1600 to 1700...
Encoding 5849 chunks...


Batches:   0%|          | 0/183 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1700 to 1800...
Encoding 6370 chunks...


Batches:   0%|          | 0/200 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1800 to 1900...
Encoding 6568 chunks...


Batches:   0%|          | 0/206 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1900 to 2000...
Encoding 5904 chunks...


Batches:   0%|          | 0/185 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2000 to 2100...
Encoding 6153 chunks...


Batches:   0%|          | 0/193 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2100 to 2200...
Encoding 5936 chunks...


Batches:   0%|          | 0/186 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2200 to 2300...
Encoding 6935 chunks...


Batches:   0%|          | 0/217 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2300 to 2400...
Encoding 6375 chunks...


Batches:   0%|          | 0/200 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2400 to 2500...
Encoding 6542 chunks...


Batches:   0%|          | 0/205 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2500 to 2600...
Encoding 6148 chunks...


Batches:   0%|          | 0/193 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2600 to 2700...
Encoding 6551 chunks...


Batches:   0%|          | 0/205 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2700 to 2800...
Encoding 6600 chunks...


Batches:   0%|          | 0/207 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2800 to 2900...
Encoding 6719 chunks...


Batches:   0%|          | 0/210 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2900 to 3000...
Encoding 6374 chunks...


Batches:   0%|          | 0/200 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3000 to 3100...
Encoding 6336 chunks...


Batches:   0%|          | 0/198 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3100 to 3200...
Encoding 6168 chunks...


Batches:   0%|          | 0/193 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3200 to 3300...
Encoding 6450 chunks...


Batches:   0%|          | 0/202 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3300 to 3400...
Encoding 5960 chunks...


Batches:   0%|          | 0/187 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3400 to 3500...
Encoding 6740 chunks...


Batches:   0%|          | 0/211 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3500 to 3600...
Encoding 6412 chunks...


Batches:   0%|          | 0/201 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3600 to 3700...
Encoding 6118 chunks...


Batches:   0%|          | 0/192 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3700 to 3800...
Encoding 5945 chunks...


Batches:   0%|          | 0/186 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3800 to 3900...
Encoding 5973 chunks...


Batches:   0%|          | 0/187 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3900 to 4000...
Encoding 6479 chunks...


Batches:   0%|          | 0/203 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4000 to 4100...
Encoding 6493 chunks...


Batches:   0%|          | 0/203 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4100 to 4200...
Encoding 6348 chunks...


Batches:   0%|          | 0/199 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4200 to 4300...
Encoding 6523 chunks...


Batches:   0%|          | 0/204 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4300 to 4400...
Encoding 6558 chunks...


Batches:   0%|          | 0/205 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4400 to 4500...
Encoding 6922 chunks...


Batches:   0%|          | 0/217 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4500 to 4600...
Encoding 6913 chunks...


Batches:   0%|          | 0/217 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4600 to 4700...
Encoding 6162 chunks...


Batches:   0%|          | 0/193 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4700 to 4800...
Encoding 6968 chunks...


Batches:   0%|          | 0/218 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4800 to 4900...
Encoding 6244 chunks...


Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4900 to 5000...
Encoding 6064 chunks...


Batches:   0%|          | 0/190 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5000 to 5100...
Encoding 6842 chunks...


Batches:   0%|          | 0/214 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5100 to 5200...
Encoding 6165 chunks...


Batches:   0%|          | 0/193 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5200 to 5300...
Encoding 6503 chunks...


Batches:   0%|          | 0/204 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5300 to 5400...
Encoding 6327 chunks...


Batches:   0%|          | 0/198 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5400 to 5500...
Encoding 6657 chunks...


Batches:   0%|          | 0/209 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5500 to 5600...
Encoding 6201 chunks...


Batches:   0%|          | 0/194 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5600 to 5700...
Encoding 7040 chunks...


Batches:   0%|          | 0/220 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5700 to 5800...
Encoding 6639 chunks...


Batches:   0%|          | 0/208 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5800 to 5900...
Encoding 6449 chunks...


Batches:   0%|          | 0/202 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5900 to 6000...
Encoding 6509 chunks...


Batches:   0%|          | 0/204 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6000 to 6100...
Encoding 6380 chunks...


Batches:   0%|          | 0/200 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6100 to 6200...
Encoding 6448 chunks...


Batches:   0%|          | 0/202 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6200 to 6300...
Encoding 6492 chunks...


Batches:   0%|          | 0/203 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6300 to 6400...
Encoding 6075 chunks...


Batches:   0%|          | 0/190 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6400 to 6500...
Encoding 7194 chunks...


Batches:   0%|          | 0/225 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6500 to 6600...
Encoding 5903 chunks...


Batches:   0%|          | 0/185 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6600 to 6700...
Encoding 6161 chunks...


Batches:   0%|          | 0/193 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6700 to 6800...
Encoding 5858 chunks...


Batches:   0%|          | 0/184 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6800 to 6900...
Encoding 6221 chunks...


Batches:   0%|          | 0/195 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6900 to 7000...
Encoding 6591 chunks...


Batches:   0%|          | 0/206 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7000 to 7100...
Encoding 6561 chunks...


Batches:   0%|          | 0/206 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7100 to 7200...
Encoding 5957 chunks...


Batches:   0%|          | 0/187 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7200 to 7300...
Encoding 5892 chunks...


Batches:   0%|          | 0/185 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7300 to 7400...
Encoding 5899 chunks...


Batches:   0%|          | 0/185 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7400 to 7500...
Encoding 6336 chunks...


Batches:   0%|          | 0/198 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7500 to 7600...
Encoding 5901 chunks...


Batches:   0%|          | 0/185 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7600 to 7700...
Encoding 6414 chunks...


Batches:   0%|          | 0/201 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7700 to 7800...
Encoding 6630 chunks...


Batches:   0%|          | 0/208 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7800 to 7900...
Encoding 5997 chunks...


Batches:   0%|          | 0/188 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7900 to 8000...
Encoding 6995 chunks...


Batches:   0%|          | 0/219 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8000 to 8100...
Encoding 8157 chunks...


Batches:   0%|          | 0/255 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8100 to 8200...
Encoding 6094 chunks...


Batches:   0%|          | 0/191 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8200 to 8300...
Encoding 6513 chunks...


Batches:   0%|          | 0/204 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8300 to 8400...
Encoding 6242 chunks...


Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8400 to 8500...


TypeError: object of type 'float' has no len()

In [14]:
print(df.isnull().sum())


num                0
name               0
content            0
cleaned_content    1
dtype: int64


In [15]:
# Check how many documents exist in the collection
print(collection.count())


NameError: name 'collection' is not defined

In [6]:
model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda' if torch.cuda.is_available() else 'cpu')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
query_text="and about you were doing this interview so i can get to know you and make everything clear to avoid confusion we dont need another mu mu werent you misunderstanding misunderstanding oh mmhmm so why do you want to put up a bar so that i can make monica happy because you love her yeah you love her why what do you like about miss monica"

In [15]:
def search_subtitles(query_text):
    """Reload ChromaDB and search for query embeddings."""
    # Reconnect to ensure fresh data
    chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/Enhancing-Search-Engine-Relavance-for-Video-Subtitles/Data/En_seach_engine_subtitles.db")
    collection = chroma_client.get_or_create_collection(name="chromadb_En_sub_embeddings")

    # Encode query and perform search
    query_embedding = model.encode([query_text])[0].tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=5)

    return results["metadatas"][0] if "metadatas" in results else []


In [16]:
results=search_subtitles(query_text)

In [17]:
print(results)

[]
