In [None]:
import pandas as pd
import regex as re

In [None]:
# Load the raw subtitle table. The path sits under /content/drive/MyDrive, so
# presumably Google Drive must be mounted in Colab first — confirm.
df = pd.read_csv(r'/content/drive/MyDrive/subtitles.csv')

In [None]:
# Preview the first rows of the raw table.
df.head()

In [None]:
# Drop the 'num' column. Reassigning (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run after the column is gone.
df = df.drop(columns='num', errors='ignore')

In [None]:
# Show column dtypes and non-null counts after dropping 'num'.
df.info()

In [None]:

# Regex for links: "http://" / "https://" URLs, or tokens starting with "www.",
# each running up to the next whitespace character.
pattern_for_link = r'(https?://\S+|www\.\S+)'

In [None]:
def preprocess_links(data):
  """Return ``data`` with every match of ``pattern_for_link`` deleted."""
  return re.sub(pattern_for_link, '', data)

In [None]:
# Strip links from every subtitle text (overwrites 'filecontent').
df['filecontent'] = df['filecontent'].apply(preprocess_links)

In [None]:
# Regex for markup tags: '<', then as few characters as possible, then '>'.
# '.' does not match a newline unless DOTALL is set, so a tag that spans
# several lines is not matched.
pattern_for_tags = r'<.*?>'

In [None]:
def remove_tags(data):
  """Return ``data`` with every match of ``pattern_for_tags`` deleted."""
  return re.sub(pattern_for_tags, '', data)

In [None]:
# Strip <...> tags from every subtitle text (overwrites 'filecontent').
df['filecontent'] = df['filecontent'].apply(remove_tags)

In [None]:
# NOTE(review): despite the name this is one character class, not a
# "HH:MM:SS --> HH:MM:SS" sequence pattern. Under standard `re` character-class
# rules it matches any ONE of: a digit, '+', CR, LF, space, or a character in
# the range '-'..'>' (the "-->" inside the brackets is read as that range,
# which covers - . / : ; < = >).
# Every digit and full stop in the dialogue is therefore matched too, not only
# cue numbers and timestamps. The phrases in `sentences_to_remove` appear to be
# written for text that has already been through this — confirm before changing.
pattern_for_timestamp = r'[0-9:0-9-->0-9+\r\n\ ]'

In [None]:
def remove_timestamp(data):
  """Blank out ``pattern_for_timestamp`` matches and normalise whitespace.

  Each match is replaced by a single space; the result is then split on
  whitespace and re-joined with single spaces, so it carries no leading,
  trailing or repeated whitespace.
  """
  blanked = re.sub(pattern_for_timestamp, ' ', data)
  return ' '.join(blanked.split())

In [None]:
# Apply remove_timestamp to every subtitle text (overwrites 'filecontent').
df['filecontent'] = df['filecontent'].apply(remove_timestamp)

In [None]:
# Confirm both remaining columns are still fully populated after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         28000 non-null  object
 1   filecontent  28000 non-null  object
dtypes: object(2)
memory usage: 437.6+ KB


In [None]:
# Boilerplate/ad phrases to strip from the subtitle text. They appear to be
# written the way the text looks at this point in the notebook, i.e. after
# link removal and after `pattern_for_timestamp` has replaced digits and
# characters such as '.', '-', ':' and '/' with spaces.
sentences_to_remove = [
    "Watch any video online with Open SUBTITLES Free Browser extension osdb link ext",
    "Advertise your product or brand here contact www OpenSubtitles org today",
    # Same ad once `preprocess_links` has already deleted the "www..." token;
    # this is the form that still showed up in the search results.
    "Advertise your product or brand here contact today",
    "Subtitles by MemoryOnSmells Support us and become VIP member nto remove all ads from",
    "Subtitles by"
]

In [None]:
# Alternation of all phrases in `sentences_to_remove`, each escaped so it is
# matched literally.
pattern_sentence = '|'.join(map(re.escape, sentences_to_remove))

In [None]:
def remove_specific_sentences(text, sentences_to_remove):
    """Delete every literal occurrence of the given phrases from ``text``.

    Parameters
    ----------
    text : str
        Text to clean.
    sentences_to_remove : iterable of str
        Phrases to delete. Each one is matched literally (regex
        metacharacters are escaped); empty strings are ignored.

    Returns
    -------
    str
        ``text`` without the phrases and with leading/trailing whitespace
        stripped. Whitespace around a removed phrase is left untouched.
    """
    # Longest first, so a short phrase that is a prefix of a longer one
    # (e.g. "Subtitles by") cannot pre-empt the longer match.
    phrases = sorted((s for s in sentences_to_remove if s), key=len, reverse=True)
    if not phrases:
        return text.strip()
    # Build the pattern from the argument so the function honours whatever
    # list the caller passes instead of depending on a module-level global.
    pattern = '|'.join(re.escape(phrase) for phrase in phrases)
    return re.sub(pattern, '', text).strip()

In [None]:
# Remove the boilerplate phrases from every subtitle text (overwrites 'filecontent').
df['filecontent'] = df['filecontent'].apply(lambda x : remove_specific_sentences(x,sentences_to_remove))

In [None]:
# Preview the table after the text-cleaning steps above.
df.head()

Unnamed: 0,name,filecontent
0,the.message.(1976).eng.1cd,", , , , In the name of God, the most gracious..."
1,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,", , Ah! There's Princess Dawn and Terry with t..."
2,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,", , Yumi's Cells , , Episode Extremely Polite ..."
3,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,", , , , Yumi's Cells , , Episode Laptop , , F..."
4,broker.(2022).eng.1cd,"ï»¿ , , , , If you're going to throw it away,..."


In [None]:
# Regex used to clean file names: the literal suffix ".eng.1cd", or any single
# non-word character (\W).
pattern = r'\.eng\.1cd|\W'

In [None]:
def preprocess_name(data):
  """Replace each ``pattern`` match in ``data`` with a space, then trim the ends."""
  spaced = re.sub(pattern, ' ', data)
  return spaced.strip()

In [None]:
# Clean every file name (overwrites 'name').
df['name'] = df['name'].apply(preprocess_name)

In [None]:
# Preview the cleaned names.
df.head()

Unnamed: 0,name,filecontent
0,the message 1976,", , , , In the name of God, the most gracious..."
1,here comes the grump s01 e09 joltin jack in bo...,", , Ah! There's Princess Dawn and Terry with t..."
2,yumis cells s02 e13 episode 2 13 2022,", , Yumi's Cells , , Episode Extremely Polite ..."
3,yumis cells s02 e14 episode 2 14 2022,", , , , Yumi's Cells , , Episode Laptop , , F..."
4,broker 2022,"ï»¿ , , , , If you're going to throw it away,..."


In [None]:
# Matches "episode <digits> <digits>" (e.g. "episode 2 13") inside a cleaned name.
name_pattern = r'episode\s\d+\s\d+'

In [None]:
def remove_name(e):
  """Return ``e`` with every ``name_pattern`` match deleted."""
  return re.sub(name_pattern, '', e)

In [None]:
# Drop the "episode N N" fragment from every name (overwrites 'name').
df['name'] = df['name'].apply(remove_name)

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# Fetch the WordNet corpus (the data WordNetLemmatizer relies on).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Fetch the stop-word lists read by stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Fetch the Punkt tokenizer data; presumably needed by word_tokenize — confirm
# for the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# NOTE(review): `lem` is not referenced by any other cell visible in this notebook.
lem = WordNetLemmatizer()
# A set gives constant-time `in` checks; the stop words are consulted once per
# token when filtering text.
stopword = set(stopwords.words('english'))

In [None]:
# Character class of everything to delete: any character that is not an ASCII
# letter, a digit, whitespace or a full stop. Raw string so that `\s` reaches
# the regex engine without Python flagging it as an invalid string escape.
remove_spl_char = r'[^a-zA-Z0-9\s.]'

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
def preprocess_text(data):
  """Normalise a piece of text for embedding.

  Steps, in order: lower-case, delete characters matched by
  ``remove_spl_char``, tokenise with ``word_tokenize``, discard tokens found
  in ``stopword``, re-join with single spaces and collapse any whitespace runs.
  """
  lowered = data.lower()
  stripped = re.sub(remove_spl_char, '', lowered)
  kept = []
  for token in word_tokenize(stripped):
    if token not in stopword:
      kept.append(token)
  return re.sub(r'\s+', ' ', ' '.join(kept))

In [None]:
# Normalise every subtitle text (overwrites 'filecontent').
df['filecontent']  = df['filecontent'].apply(preprocess_text)

In [None]:
# Run the same normalisation over the names (overwrites 'name').
df['name'] = df['name'].apply(preprocess_text)

In [None]:
# Save the cleaned table as CSV in the current working directory (file name has
# no extension; the index column is omitted).
df.to_csv('cleaned_subtitles',index = False)

In [None]:
# Reload the cleaned table from Google Drive.
# NOTE(review): the preceding cell writes 'cleaned_subtitles' to the working
# directory, not to this Drive path, so this reads whatever copy already sits
# on Drive — confirm it was produced by the current cleaning code.
df = pd.read_csv(r'/content/drive/MyDrive/cleaned_subtitles')

In [None]:
# Preview the reloaded table.
df.head()

Unnamed: 0,name,filecontent
0,the message 1976,in the name of god the most gracious the most...
1,here comes the grump s01 e09 joltin jack in bo...,ah theres princess dawn and terry with the bl...
2,yumis cells s02 e13 2022,yumis cells episode extremely polite yumi yum...
3,yumis cells s02 e14 2022,yumis cells episode laptop first place is min...
4,broker 2022,if youre going to throw it away then dont giv...


In [None]:
# Check row count and null counts of the reloaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         28000 non-null  object
 1   filecontent  28000 non-null  object
dtypes: object(2)
memory usage: 437.6+ KB


In [None]:
# NOTE(review): repeats the preview already shown after the reload — likely removable.
df.head()

Unnamed: 0,name,filecontent
0,the message 1976,in the name of god the most gracious the most...
1,here comes the grump s01 e09 joltin jack in bo...,ah theres princess dawn and terry with the bl...
2,yumis cells s02 e13 2022,yumis cells episode extremely polite yumi yum...
3,yumis cells s02 e14 2022,yumis cells episode laptop first place is min...
4,broker 2022,if youre going to throw it away then dont giv...


In [None]:
def chunk_text(text, chunk_size=300):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Pieces are taken left to right without overlap; the final piece holds
    whatever is left over and may be shorter. An empty ``text`` gives ``[]``.
    """
    starts = range(0, len(text), chunk_size)
    return [text[begin:begin + chunk_size] for begin in starts]

In [None]:
# Split every document into pieces of at most 300 characters (chunk_text's default).
# NOTE(review): no later cell visible here reads the 'chunks' column.
df['chunks'] = df['filecontent'].apply(chunk_text)

In [None]:
# Preview the table with the new 'chunks' column.
df.head()

Unnamed: 0,name,filecontent,chunks
0,the message 1976,in the name of god the most gracious the most...,[in the name of god the most gracious the most...
1,here comes the grump s01 e09 joltin jack in bo...,ah theres princess dawn and terry with the bl...,[ah theres princess dawn and terry with the bl...
2,yumis cells s02 e13 2022,yumis cells episode extremely polite yumi yum...,[yumis cells episode extremely polite yumi yum...
3,yumis cells s02 e14 2022,yumis cells episode laptop first place is min...,[yumis cells episode laptop first place is min...
4,broker 2022,if youre going to throw it away then dont giv...,[if youre going to throw it away then dont giv...


In [None]:
! pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model (its files
# are downloaded from the Hugging Face Hub when not cached).
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def get_embeddings(text):
    """Encode ``text`` with the module-level sentence-transformer ``model``."""
    embedding = model.encode(text)
    return embedding

In [None]:
# Encode all documents in one batched call instead of one `encode` call per
# row: `model.encode` accepts a list of strings and returns one vector per
# input. `list(...)` splits the 2-D result into per-row vectors for the column.
# NOTE(review): each subtitle file is embedded as a single string; the model
# presumably truncates long inputs, so only the start of each file may be
# represented — confirm against `model.max_seq_length`.
df['file_embeddings'] = list(model.encode(df['filecontent'].tolist(), show_progress_bar=True))

In [None]:
# Save the table, including the embedding column, as CSV.
# NOTE(review): CSV stores every cell as text, so the embedding vectors will
# presumably come back as strings rather than numeric arrays when re-read —
# confirm this file is only meant for inspection.
df.to_csv('embeddings.csv')

In [None]:
# Check that every row received an embedding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             28000 non-null  object
 1   filecontent      28000 non-null  object
 2   file_embeddings  28000 non-null  object
dtypes: object(3)
memory usage: 656.4+ KB


In [None]:
# Second CSV save of the same table under another name ('embedding', no extension).
# NOTE(review): looks redundant with the earlier 'embeddings.csv' write —
# confirm which file is actually consumed.
df.to_csv('embedding')

In [None]:
# Preview the table with the embedding column.
df.head()

Unnamed: 0,name,filecontent,file_embeddings
0,the message 1976,in the name of god the most gracious the most...,"[-0.03536038, 0.13778429, -0.06654229, -0.0635..."
1,here comes the grump s01 e09 joltin jack in bo...,ah theres princess dawn and terry with the bl...,"[-0.07688845, -0.009821554, 0.052715614, -0.08..."
2,yumis cells s02 e13 2022,yumis cells episode extremely polite yumi yum...,"[-0.14252928, -0.14375976, 0.049089976, -0.038..."
3,yumis cells s02 e14 2022,yumis cells episode laptop first place is min...,"[-0.11169809, -0.04651466, 0.076606795, -0.033..."
4,broker 2022,if youre going to throw it away then dont giv...,"[-0.058077488, 0.0010472588, -0.008644974, -0...."


In [None]:
def query(text):
  """Embed a search string: clean it with ``preprocess_text``, then encode it
  with ``get_embeddings`` and return the resulting vector."""
  return get_embeddings(preprocess_text(text))


In [None]:
# Example search string.
text = 'the queens umbrella'

In [None]:
# Clean and embed the example search string.
query_embedding = query(text)

In [None]:
# Length of the query vector. Uses the variable defined in the previous cell,
# so the cell also runs on a fresh kernel.
len(query_embedding)

384

In [None]:
! pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=552451 sha256=0ab70c563ef336569c906dd812700ad47180122ce807745a9a03d38c5b205a32
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [None]:
from annoy import AnnoyIndex
import numpy as np

In [None]:
# Vector size, taken from the first row so it works for any non-empty table.
embedding_dim = len(df['file_embeddings'].iloc[0])
annoy_index = AnnoyIndex(embedding_dim, 'angular')

# Item ids are row positions, which is what the later `df.iloc[...]` lookup
# expects. Each embedding is stored unchanged: adding `[i]` to it would either
# shift every component by the row number (NumPy array) or append an extra
# element (list), and in both cases distort the neighbour search.
for position, embedding in enumerate(df['file_embeddings']):
    annoy_index.add_item(position, embedding)

# Build a forest of 10 trees, then persist the index to disk.
annoy_index.build(10)

annoy_index.save('chroma_index.ann')

True

In [None]:
# Re-open the saved index from disk and fetch the nearest neighbours of the query.
annoy_index = AnnoyIndex(embedding_dim, 'angular')
annoy_index.load('chroma_index.ann')
num_similar_items = 10  # how many neighbours to request
similar_indices = annoy_index.get_nns_by_vector(query_embedding, num_similar_items)

In [None]:
# Item ids returned by the index for the query.
similar_indices

[140, 744, 362, 1384, 245, 2546, 2543, 2192, 3079, 4044]

In [None]:
# Print name and text of every hit, one blank line between hits; the ids are
# used as row positions into `df`.
for index in similar_indices:
    similar_item = df.iloc[index]
    print(f"Name: {similar_item['name']}")
    print(f"File content: {similar_item['filecontent']}")
    print()

Name: a year of the quiet sun 1984

Name: halloween hero 2020
File content:  advertise your product or brand here contact today hey get out of here stop right there dont move what do you want you want to shoot me you wanna shoot me just shoot me pull the trigger forget you man forget you who do you think you are i have the gun forget you man yeah whatever you want just take it and get out you wanna kill me youre gonna do what i say partner shoot me then i keep it real and shoot you wanna kill me shoot me then good night turn around turn around now you move and im gonna shoot you in your head start moving to the house you know what i got news for you youre not going in my house shoot me right in the back of the head right now cause if youre not gonna do it im gonna take care of you yeah you wanna kill me huh you wanna kill me take the gun take it right here im gonna leave come on pull it pull it get the hell out of here words cant express how much i miss her if only i could hold her han

Unnamed: 0,file_embeddings
