### Reading a database using sqlite 3

### In this part of the code we will be reading a database from sqlite3

In [2]:
import sqlite3
import pandas as pd #Importing the nescessary libraries
conn = sqlite3.connect("eng_subtitles_database.db") # connecting to sqlite 3

In [3]:
query = 'SELECT * FROM zipfiles' #Query writing i.e selecting all columns from zipfiles
df = pd.read_sql_query(query, conn) # Reading the sqlite query using pandas
conn.close() #closnig connection
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


### This part of the code visualizes the processing of decoding and cleaning

In [4]:
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

In [5]:
import zipfile
import io
def decomp_decode(data):
    with zipfile.ZipFile(io.BytesIO(data)) as zip_file:
        # Extract the first file in the ZIP archive
        file_list = zip_file.namelist()
        first_file = file_list[0]
        decompressed_data = zip_file.read(first_file)
    return decompressed_data.decode('latin-1')

In [6]:
df['content'] = df['content'].progress_apply(lambda x : decomp_decode(x))

100%|██████████| 82498/82498 [01:30<00:00, 915.70it/s] 


In [7]:
df['content'].head()

0    1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...
1    1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther...
2    1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'...
3    1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...
4    ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...
Name: content, dtype: object

In [8]:
import re

def clean_data(data): # data is the entire text file entry in the dataframe

    # removing timestamps
    data = re.sub("\d{2}:\d{2}:\d{2},\d{3}\s-->\s\d{2}:\d{2}:\d{2},\d{3}"," ",  data)

    # removing index no. of dialogues
    data = re.sub(r'\n?\d+\r', "", data)

    # removing escape sequences like \n \r
    data = re.sub('\r|\n', "", data)

    # removing <i> and </i>
    data = re.sub('<i>|</i>', "", data)
    # removing links
    data = re.sub("(?:www\.)osdb\.link\/[\w\d]+|www\.OpenSubtitles\.org|osdb\.link\/ext|api\.OpenSubtitles\.org|OpenSubtitles\.com", " ",data)

    # Converting to lower case
    data = data.lower()

    # return
    return data

In [9]:
df['content'] = df['content'].progress_apply(lambda x: clean_data(x))

100%|██████████| 82498/82498 [20:33<00:00, 66.86it/s]   


####

#####

In [2]:
import pandas as pd
df = pd.read_csv('Output_Result_V2.csv')
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,watch any video online with open-subtitlesfre...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,ah! there's princessdawn and terry with the b...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,yumi's cells 2 episode extremely polite yumi ...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,watch any video online with open-subtitlesfre...
4,9180600,broker.(2022).eng.1cd,ï»¿ watch any video online with open-subtitles...


In [3]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [5]:
CV = CountVectorizer() # COunt vectorizer
TF_M = CV.fit_transform(df['content']) #Tranformer Matrix

In [6]:
CV

In [8]:
TF_T = TfidfTransformer() #Tranformer 
TFIDF_M = TF_T.fit_transform(TF_M) #Matrix

In [9]:
TF_T

In [10]:
TFIDF_M

<40000x1754436 sparse matrix of type '<class 'numpy.float64'>'
	with 48750439 stored elements in Compressed Sparse Row format>

In [20]:
User_Input = input("Enter your line: ")
User_Vector = CV.transform([User_Input])
User_TFIDF = TF_T.transform(User_Vector)

Enter your line: broker


In [21]:
Score = cosine_similarity(User_TFIDF, TFIDF_M)

In [22]:
# Retrieve top similar documents

Top_Picks = Score.argsort()[0][::-1]
Top_n = 3
retrieved_documents = [df['content'][idx] for idx in Top_Picks[:Top_n]]

print("Top", Top_n, "documents similar to query:", User_Input)
for i, doc in enumerate(retrieved_documents, 1):
    print("Document \n", i, ":", doc)

Top 3 documents similar to query: broker
Document 
 1 :  but it's saturday. it is? then i'll askall the terrorists who threatento destroy our world to take the day off. i just mean i had plansto spend the day with aj. and aj is? my daughter.ah, children. the time goes so fast, but we're allmaking sacrifices. excuse me, waiter? more champagnein the next one. uh, yeah, remind me.what's your sacrifice? calling you on a saturday. it's a simple pickup.an hour max. - especially with...- no, no, no. please don't say it. agent archer as backup. archer on a saturday? as in the dayafter friday night? i don't see the problem. obviously.have you talked to him yet? no, actually. he hasn't picked up the phonefor some reason. oh, yeah, "for some reason."  hey, archer. no.  support us and become vip member to remove all ads from   yes,a shower was necessary, lana. it was a night of many fluids. first of all, gross.it really was. second of all, hurry up. i had a whole special dayplanned with aj. coney 

In [23]:
User_Input = input("Enter your line: ")
User_Vector = CV.transform([User_Input])
User_TFIDF = TF_T.transform(User_Vector)

Enter your line: broker


In [24]:
Score = cosine_similarity(User_TFIDF, TFIDF_M)

In [25]:
def generate_summarized_documents(User_Input, retrieved_documents):
    summarized_documents = {}
    for i, doc in enumerate(retrieved_documents, 1):
        summary = "Summary: " + doc[:] + "" if len(doc) > 150 else "Summary: " + doc
        summarized_documents["Document " + str(i)] = summary
    return summarized_documents

In [26]:
Top_Picks = Score.argsort()[0][::-1]
Top_n = 3
retrieved_documents = [df['content'][idx] for idx in Top_Picks[:Top_n]]

In [27]:
# summarized documents
summarized_docs = generate_summarized_documents(User_Input, retrieved_documents)

# Print the summarized documents
for doc, summary in summarized_docs.items():
    print(doc + ":")
    print(summary)
    print()

Document 1:
Summary:  but it's saturday. it is? then i'll askall the terrorists who threatento destroy our world to take the day off. i just mean i had plansto spend the day with aj. and aj is? my daughter.ah, children. the time goes so fast, but we're allmaking sacrifices. excuse me, waiter? more champagnein the next one. uh, yeah, remind me.what's your sacrifice? calling you on a saturday. it's a simple pickup.an hour max. - especially with...- no, no, no. please don't say it. agent archer as backup. archer on a saturday? as in the dayafter friday night? i don't see the problem. obviously.have you talked to him yet? no, actually. he hasn't picked up the phonefor some reason. oh, yeah, "for some reason."  hey, archer. no.  support us and become vip member to remove all ads from   yes,a shower was necessary, lana. it was a night of many fluids. first of all, gross.it really was. second of all, hurry up. i had a whole special dayplanned with aj. coney island,a fancy tea place. moist yet

In [28]:
import joblib

In [29]:
joblib.dump(CV, 'Count_Vectorizer.joblib')

['Count_Vectorizer.joblib']

In [31]:
joblib.dump(TF_T, 'TF_IDF_transformer.joblib')

['TF_IDF_transformer.joblib']

In [32]:
joblib.dump(TFIDF_M, 'TFIDF_Matrix.joblib')

['TFIDF_Matrix.joblib']

In [33]:
joblib.dump(Score, 'Cosine_Similarity_Scores.joblib')

['Cosine_Similarity_Scores.joblib']