# Import Libraries

In [1]:
import pandas as pd
import chromadb
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import collections
# Download NLTK resources if not already downloaded
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')

# Create Pipeline

In [2]:
# Regular expressions used by the cleaning pipeline. They are plain strings,
# handed to `re` at call time.

# SubRip-style cue: a counter line followed by "HH:MM:SS,mmm --> HH:MM:SS,mmm".
timestamp_pattern = r'\d+\r?\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}'

# Prefix of a script-style "Dialogue:" event line (layer, start, end, style,
# speaker, three margins, empty effect field) that precedes the spoken text.
dialogue_pattern = r"Dialogue: (Marked=)?\d+,\d{1,2}:\d{2}:\d{2}\.\d{2},\d{1,2}:\d{2}:\d{2}\.\d{2},\*?(\w+)?,(\w+)?,\d{1,4},\d{1,4},\d{1,4},,"

# OpenSubtitles boilerplate: opening banner, advert block and closing banner.
# These rely on CRLF line endings, so they must run before line breaks are collapsed.
header_pattern1 = r'Watch any video online with Open-SUBTITLES\r\nFree Browser extension: osdb.link/ext\r\n\r\n\r\n'
header_pattern2 = r'\r\nAdvertise your product or brand here\r\ncontact www.OpenSubtitles.org today'
footer_pattern = r'Please rate this subtitle at www.osdb.link/[0-9a-z]+\r\nHelp other users to choose the best subtitles'

# Everything from "[Script Info]" through the line after "[Events]";
# needs re.DOTALL at the call site so ".*?" can span lines.
header_pattern = r'\[Script Info\].*?\[Events\]\r\n.*?\r\n(Subtitle:)?'

# Link-like tokens: optional scheme, optional "www.", then word.word.
# NOTE(review): this also matches any "word.word" sequence, not only URLs.
link_pattern = r'(https?:\/\/)?(www\.)?(\w+)(\.\w+)'

# One or more consecutive line breaks (LF or CRLF).
linebreak_pattern = r'(\r?\n)+'

# Any single character that is not an ASCII letter (digits included).
punctuation_pattern = r'[^a-zA-Z]'

In [3]:

def text_preprocessing(corpus,flag):
    """Clean a subtitle transcript (or a free-text query) and normalise its words.

    Steps, in order:
      1. Strip format-specific markup. Text containing a timestamp cue is
         treated as a movie / TV-show subtitle: cues and the OpenSubtitles
         header, advert and footer are removed. Anything else (including
         plain queries) goes through the script-header and "Dialogue:" prefix
         removal, which is a no-op when those patterns are absent.
      2. Collapse line breaks to spaces, drop link-like tokens, replace every
         non-letter character with a space, and lower-case the text.
      3. Tokenise, drop English stop words, and stem or lemmatise the rest.

    Parameters
    ----------
    corpus : str
        Raw subtitle file content or a user query.
    flag : str
        "stemming" selects the Snowball stemmer; any other value selects the
        WordNet lemmatiser.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    if re.search(timestamp_pattern, corpus):  # movie or tv show
        corpus = re.sub(timestamp_pattern, '', corpus)
        # The boilerplate patterns contain literal CRLFs, so they must run
        # before line breaks are collapsed below. header_pattern2 (the advert
        # block) was defined but never applied, which left its words in the
        # cleaned transcript.
        for boilerplate in (header_pattern1, header_pattern2, footer_pattern):
            corpus = re.sub(boilerplate, '', corpus)
    else:  # anime
        # DOTALL lets the header pattern span the multi-line script preamble.
        corpus = re.sub(header_pattern, '', corpus, flags=re.DOTALL)
        corpus = re.sub(dialogue_pattern, '', corpus)

    corpus = re.sub(linebreak_pattern, ' ', corpus)
    corpus = re.sub(link_pattern, '', corpus)
    corpus = re.sub(punctuation_pattern, ' ', corpus)

    # Lower-case once; the stop-word list is lower-case, so this has to
    # happen before filtering.
    corpus = corpus.lower()

    if flag == "stemming":
        normalise = SnowballStemmer(language='english').stem
    else:
        normalise = WordNetLemmatizer().lemmatize

    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(corpus)

    return ' '.join(normalise(token) for token in tokens if token not in stop_words)

In [4]:

# create a function where user query returns top 10 relevant searches
def getResults(query,flag,n_candidates=20,top_k=10):
    """Return the names of the transcripts most similar to a query.

    The query is cleaned with `text_preprocessing`, then matched against the
    'transcripts' collection of the persistent Chroma store. Result ids are
    split on '-' and the leading part is used as the parent document id, so
    several hits from the same document count once. Each parent id is mapped
    to a name through the index of ``../data/names.csv``.

    Parameters
    ----------
    query : str
        Free-text search query.
    flag : str
        Forwarded to `text_preprocessing` ("stemming" or anything else for
        lemmatisation).
    n_candidates : int, optional
        Number of hits requested from Chroma before de-duplication
        (default 20, the previously hard-coded value).
    top_k : int, optional
        Maximum number of names returned (default 10, the previously
        hard-coded value).

    Returns
    -------
    list
        Up to `top_k` names, best match first, one per parent document.

    Raises
    ------
    KeyError
        If a parent id is missing from the names table.
    ValueError
        If the part of a result id before '-' is not an integer.
    """
    text = text_preprocessing(query, flag)
    print(text)

    client = chromadb.PersistentClient(path="../data/ChromaDB")
    collection = client.get_or_create_collection(
        name='transcripts',
        metadata={"hnsw:space": "cosine"},
    )

    # Names table, indexed by the parent document id.
    df = pd.read_csv('../data/names.csv', index_col=0)

    results = collection.query(query_texts=[text], n_results=n_candidates)

    # One query text was sent, so the hits are in the first (only) id list.
    names = []
    seen_parents = set()
    for chunk_id in results['ids'][0]:
        parent_id = chunk_id.split('-')[0]
        if parent_id in seen_parents:
            continue
        seen_parents.add(parent_id)
        names.append(df.loc[int(parent_id), 'name'])

    return names[:top_k]






In [5]:
# Run a sample query through the search function, using the stemming variant
# of the preprocessing, and print the returned names.
names=getResults('In the name of God, the most gracious, the most merciful','stemming')
print(names)

name god gracious merci


Add of existing embedding ID: 70000-0
Add of existing embedding ID: 70001-0
Add of existing embedding ID: 70002-0
Add of existing embedding ID: 70003-0
Add of existing embedding ID: 70003-1
Add of existing embedding ID: 70003-2
Add of existing embedding ID: 70003-3
Add of existing embedding ID: 70003-4
Add of existing embedding ID: 70003-5
Add of existing embedding ID: 70003-6
Add of existing embedding ID: 70003-7
Add of existing embedding ID: 70003-8
Add of existing embedding ID: 70003-9
Add of existing embedding ID: 70003-10
Add of existing embedding ID: 70003-11
Add of existing embedding ID: 70003-12
Add of existing embedding ID: 70003-13
Add of existing embedding ID: 70003-14
Add of existing embedding ID: 70003-15
Add of existing embedding ID: 70003-16
Add of existing embedding ID: 70003-17
Add of existing embedding ID: 70003-18
Add of existing embedding ID: 70003-19
Add of existing embedding ID: 70003-20
Add of existing embedding ID: 70003-21
Add of existing embedding ID: 71626-8


['queen.margot.(1994).eng.1cd', 'a.d.the.bible.continues.s01.e06.the.persecution.(2015).eng.1cd', 'noah.(2019).eng.1cd', 'a.d.the.bible.continues.s01.e06.the.persecution.(2015).eng.1cd', 'amor.(2016).eng.1cd', 'smyrna.(2021).eng.1cd', 'samson.and.delilah.(1949).eng.1cd', 'jonah.the.musical.(2017).eng.1cd', 'the.other.boleyn.girl.(2008).eng.1cd', 'studio.one.s04.e30.pontius.pilate.(1952).eng.1cd', 'magic.men.(2014).eng.1cd', 'david.and.bathsheba.(1951).eng.1cd', 'the.crusades.(1935).eng.1cd', 'whether.the.weather.is.fine.(2021).eng.1cd', 'fauda.s04.e12.episode.4.12.(2022).eng.1cd', 'medusa.(2021).eng.1cd', 'solomon.and.sheba.(1959).eng.1cd', 'mystery.campion.s01.e05.the.case.of.the.late.pig.part.1.(1989).eng.1cd', 'the.popes.exorcist.(2023).eng.1cd']


In [6]:
#creating a results dataFrame
result_df=pd.DataFrame({'Name':names},index=[i+1 for i in range(len(names))])

In [7]:
# Show the results table as the cell output.
result_df

Unnamed: 0,Name
1,queen.margot.(1994).eng.1cd
2,a.d.the.bible.continues.s01.e06.the.persecutio...
3,noah.(2019).eng.1cd
4,a.d.the.bible.continues.s01.e06.the.persecutio...
5,amor.(2016).eng.1cd
6,smyrna.(2021).eng.1cd
7,samson.and.delilah.(1949).eng.1cd
8,jonah.the.musical.(2017).eng.1cd
9,the.other.boleyn.girl.(2008).eng.1cd
10,studio.one.s04.e30.pontius.pilate.(1952).eng.1cd


# Name processing

I use the free Gemini API to extract more information about the search results.

### Installing Libraries

In [8]:

#!pip install -q -U google-generativeai

### Importing Libraries

In [9]:
# import libraries
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


### Load API Key

In [10]:
# Read the Gemini API key from a text file one directory above the working
# directory, so the key itself never appears in the notebook.
with open('../GEMINI_API_KEY.txt', 'r') as f:
    api_key = f.read().strip()

# Fail fast with a clear message instead of configuring the client with an
# empty key.
if not api_key:
    raise ValueError('../GEMINI_API_KEY.txt is empty; expected a Gemini API key')

genai.configure(api_key=api_key)

 ### Initializing model object with system instruction

In [11]:
# System prompt listing the fields the model should extract from each string
# it is given. The list previously numbered two items "5." and ended with an
# unfinished "Sometimes" fragment; both are corrected here, the rest of the
# wording is unchanged.
sys_instruction='''
        for the given list of strings. You should be able to parse following things
        
        1.name
        2.Type(movie or tv show or anime)
        3.year
        4.season(return blank if it's a movie)
        5. episode(return blank if it's a movie)
        6. language
        
        I always want response to be in json format. So crosscheck this instruction twice.
'''
# The instruction is attached to the model as its system instruction.
model = genai.GenerativeModel('gemini-1.5-pro-latest',system_instruction=sys_instruction)

### Creating a function to get response from the LLM

In [13]:
def getDetails(lst):
    """Send a list of strings to the configured Gemini model for parsing.

    Parameters
    ----------
    lst : list
        The strings to parse; the list is interpolated into the prompt with
        its default string representation.

    Returns
    -------
    The response object returned by ``model.generate_content``; callers read
    the generated text from its ``.text`` attribute.
    """
    # `model` is constructed with system_instruction=sys_instruction, so the
    # instruction is not repeated in the user prompt; only the payload is sent.
    prompt = f'Instruction: List of strings are:\n{lst}'

    return model.generate_content(prompt)
    

In [14]:
# Hand-written sample of names in the same dotted format as the search
# results; some entries are repeated.
names1 = [
    "teen.wolf.s02.e01.omega.(2012).eng.1cd",
    "teen.wolf.s02.e02.shape.shifted.(2012).eng.1cd",
    "animal.kingdom.s06.e08.revelation.(2022).eng.1cd",
    "mystery.campion.s01.e02.look.to.the.lady.part.2.(1989).eng.1cd",
    "animal.kingdom.s06.e08.revelation.(2022).eng.1cd",
    "the.collini.case.(2019).eng.1cd",
    "mind.over.murder.s01.e06.episode.1.6.(2022).eng.1cd",
    "animal.kingdom.s06.e08.revelation.(2022).eng.1cd",
    "teen.wolf.s03.e07.currents.(2013).eng.1cd",
    "animal.kingdom.s06.e08.revelation.(2022).eng.1cd",
]

In [15]:
# Send the search-result names (`names`, not the `names1` sample) to the model
# and show the raw text of its reply.
response=getDetails(names)
response.text

'```json\n[\n  {\n    "name": "queen margot",\n    "type": "movie",\n    "year": 1994,\n    "season": null,\n    "episode": null,\n    "language": "eng"\n  },\n  {\n    "name": "a.d.the.bible.continues",\n    "type": "tv show",\n    "year": 2015,\n    "season": "s01",\n    "episode": "e06",\n    "language": "eng"\n  },\n  {\n    "name": "noah",\n    "type": "movie",\n    "year": 2019,\n    "season": null,\n    "episode": null,\n    "language": "eng"\n  },\n  {\n    "name": "a.d.the.bible.continues",\n    "type": "tv show",\n    "year": 2015,\n    "season": "s01",\n    "episode": "e06",\n    "language": "eng"\n  },\n  {\n    "name": "amor",\n    "type": "movie",\n    "year": 2016,\n    "season": null,\n    "episode": null,\n    "language": "eng"\n  },\n  {\n    "name": "smyrna",\n    "type": "movie",\n    "year": 2021,\n    "season": null,\n    "episode": null,\n    "language": "eng"\n  },\n  {\n    "name": "samson.and.delilah",\n    "type": "movie",\n    "year": 1949,\n    "season": nu

### Transforming the Markdown response into JSON

In [16]:
import json

markdown_content=response.text

# The reply is expected to wrap its JSON in a ```json ... ``` fence, but that
# is not guaranteed. str.find returns -1 when the marker is absent, which
# previously produced a wrong start offset (no opening fence) or silently
# dropped the last character (no closing fence).
fence_open = '```json'
fence_pos = markdown_content.find(fence_open)
if fence_pos == -1:
    # No fence at all: treat the whole reply as the JSON payload.
    json_start = 0
    json_end = len(markdown_content)
else:
    json_start = fence_pos + len(fence_open)
    json_end = markdown_content.find('```', json_start)
    if json_end == -1:
        # Opening fence without a closing one: take everything to the end.
        json_end = len(markdown_content)
json_content = markdown_content[json_start:json_end]

# Parse the extracted JSON content; raises json.JSONDecodeError if the reply
# is not valid JSON.
data = json.loads(json_content)

# Access the parsed data
print(data)

[{'name': 'queen margot', 'type': 'movie', 'year': 1994, 'season': None, 'episode': None, 'language': 'eng'}, {'name': 'a.d.the.bible.continues', 'type': 'tv show', 'year': 2015, 'season': 's01', 'episode': 'e06', 'language': 'eng'}, {'name': 'noah', 'type': 'movie', 'year': 2019, 'season': None, 'episode': None, 'language': 'eng'}, {'name': 'a.d.the.bible.continues', 'type': 'tv show', 'year': 2015, 'season': 's01', 'episode': 'e06', 'language': 'eng'}, {'name': 'amor', 'type': 'movie', 'year': 2016, 'season': None, 'episode': None, 'language': 'eng'}, {'name': 'smyrna', 'type': 'movie', 'year': 2021, 'season': None, 'episode': None, 'language': 'eng'}, {'name': 'samson.and.delilah', 'type': 'movie', 'year': 1949, 'season': None, 'episode': None, 'language': 'eng'}, {'name': 'jonah.the.musical', 'type': 'movie', 'year': 2017, 'season': None, 'episode': None, 'language': 'eng'}, {'name': 'the.other.boleyn.girl', 'type': 'movie', 'year': 2008, 'season': None, 'episode': None, 'language'

### Converting to a dataframe

In [17]:
# Build a DataFrame from the parsed JSON records and display it.
# Note: this is `results_df` (parsed details), distinct from the earlier
# `result_df` (raw names).
results_df=pd.DataFrame(data)
results_df

Unnamed: 0,name,type,year,season,episode,language
0,queen margot,movie,1994,,,eng
1,a.d.the.bible.continues,tv show,2015,s01,e06,eng
2,noah,movie,2019,,,eng
3,a.d.the.bible.continues,tv show,2015,s01,e06,eng
4,amor,movie,2016,,,eng
5,smyrna,movie,2021,,,eng
6,samson.and.delilah,movie,1949,,,eng
7,jonah.the.musical,movie,2017,,,eng
8,the.other.boleyn.girl,movie,2008,,,eng
9,studio.one,tv show,1952,s04,e30,eng


# Querying subtitles

Displaying subtitles for the selected show

In [18]:
with open(r'C:\Users\dsai9\Projects\Semantic Search Engine\data\space.battleship.yamato.the.new.voyage.(1979).eng.1cd.srt','r',encoding='utf-8') as file:
    
    # Read the file and convert its content to a string
    content = file.read()

    # Decode the string using the unicode_escape encoding
    #decoded_content = content.encode('utf-8').decode('unicode_escape')