In [2]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [34]:
!python -m pip install --upgrade faiss faiss-gpu
!pip install datasketch

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.1.post2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (89.7 MB)
[K     |████████████████████████████████| 89.7 MB 17 kB/s 
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.1.post2


In [4]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sn
import warnings # Current version of Seaborn generates a bunch of warnings that will be ignored.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
df = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Dataset/wiki_movie_plots_deduped.csv')
df.head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
5,1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderl...,"Alice follows a large white rabbit down a ""Rab..."
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
8,1905,The Little Train Robbery,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Little_Train...,The opening scene shows the interior of the ro...
9,1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before...,Scenes are introduced using lines of the poem....


In [55]:
df.drop(columns=["Origin/Ethnicity", "Cast", "Genre"], inplace= True)
df["text"] = df["Title"] + " " + df["Plot"]
df.head(10)

Unnamed: 0,Release Year,Title,Director,Wiki Page,Plot,text
0,1901,Kansas Saloon Smashers,Unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",Kansas Saloon Smashers A bartender is working ...
1,1901,Love by the Light of the Moon,Unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","Love by the Light of the Moon The moon, painte..."
2,1901,The Martyred Presidents,Unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","The Martyred Presidents The film, just over a ..."
3,1901,"Terrible Teddy, the Grizzly King",Unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,"Terrible Teddy, the Grizzly King Lasting just ..."
4,1902,Jack and the Beanstalk,"George S. Fleming, Edwin S. Porter",https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,Jack and the Beanstalk The earliest known adap...
5,1903,Alice in Wonderland,Cecil Hepworth,https://en.wikipedia.org/wiki/Alice_in_Wonderl...,"Alice follows a large white rabbit down a ""Rab...",Alice in Wonderland Alice follows a large whit...
6,1903,The Great Train Robbery,Edwin S. Porter,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...,The Great Train Robbery The film opens with tw...
7,1904,The Suburbanite,Wallace McCutcheon,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...,The Suburbanite The film is about a family who...
8,1905,The Little Train Robbery,Edwin Stanton Porter,https://en.wikipedia.org/wiki/The_Little_Train...,The opening scene shows the interior of the ro...,The Little Train Robbery The opening scene sho...
9,1905,The Night Before Christmas,Edwin Stanton Porter,https://en.wikipedia.org/wiki/The_Night_Before...,Scenes are introduced using lines of the poem....,The Night Before Christmas Scenes are introduc...


Get first 1000 rows for performance purposes

In [57]:
df = df[0:1000]
df.shape

(1000, 6)

# **LSH**

In [9]:
import re
import time
from datasketch import MinHash, MinHashLSHForest

In [10]:
def preprocess(text):
    text = re.sub(r'[^\w\s]','',text)
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [11]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data['text']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [12]:
def predict(text, database, perms, num_results, forest):
    start_time = time.time()
    
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]['Title']
    
    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result

In [13]:
permutations = 128

num_recommendations = 1

In [58]:
forest = get_forest(df, permutations)

It took 4.507079362869263 seconds to build forest.


In [59]:
num_recommendations = 5
title = 'The Godfather'
result = predict(title, df, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)

It took 0.00950765609741211 seconds to query forest.

 Top Recommendation(s) is(are) 
 25      The Lure of the Gown
193    Are Crooks Dishonest?
Name: Title, dtype: object


# **Exhaustive Search**

In [35]:
import faiss

In [61]:
df['Index'] = df.index

In [62]:
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(1, 3),min_df=0,stop_words='english')
X_tfidf = tfidf.fit_transform(df['text'])
X_tfidf

<1000x228586 sparse matrix of type '<class 'numpy.float64'>'
	with 307021 stored elements in Compressed Sparse Row format>

In [63]:
vector = X_tfidf.toarray()
vector[0].shape[0]

228586

In [37]:
class ExactIndex():
    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
   
    def build(self):
        self.index = faiss.IndexFlatL2(self.dimension,)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        # I expect only query on one vector thus the slice
        return [self.labels[i] for i in indices[0]]

In [64]:
%%time
index = ExactIndex(vector, df["Index"])
index.build()

CPU times: user 512 ms, sys: 1.56 s, total: 2.08 s
Wall time: 2.07 s


In [65]:
query_index = 5
print(f"The most similar items to item {df['Index'][query_index]} are:")

items_list = index.query(np.array([vector[query_index]]).astype('float32'))
df.loc[df['Index'].isin(items_list)]

The most similar items to item 5 are:


Unnamed: 0,Release Year,Title,Director,Wiki Page,Plot,text,Index
5,1903,Alice in Wonderland,Cecil Hepworth,https://en.wikipedia.org/wiki/Alice_in_Wonderl...,"Alice follows a large white rabbit down a ""Rab...",Alice in Wonderland Alice follows a large whit...,5
150,1916,Sherlock Holmes,Arthur Berthelet,https://en.wikipedia.org/wiki/Sherlock_Holmes_...,"A prince, the heir apparent to a large empire,...","Sherlock Holmes A prince, the heir apparent to...",150
170,1917,His Wedding Night,Roscoe Arbuckle,https://en.wikipedia.org/wiki/His_Wedding_Night,"Arbuckle plays a drug store clerk, soda jerk, ...",His Wedding Night Arbuckle plays a drug store ...,170
225,1918,Moonshine,Unknown,https://en.wikipedia.org/wiki/Moonshine_(1918_...,"Set in the Virginia Hills, Fatty and Buster pl...","Moonshine Set in the Virginia Hills, Fatty and...",225
495,1924,Helen's Babies,William A. Seiter,https://en.wikipedia.org/wiki/Helen%27s_Babies...,Toodie and Budge are identified as the two bes...,Helen's Babies Toodie and Budge are identified...,495
526,1925,Confessions of a Queen,Victor Sjostrom,https://en.wikipedia.org/wiki/Confessions_of_a...,The King of Illyris (Lewis Stone) marries a ne...,Confessions of a Queen The King of Illyris (Le...,526
895,1930,Borrowed Wives,Frank R. Strayer,https://en.wikipedia.org/wiki/Borrowed_Wives,Peter Foley (Rex Lease) is a beneficiary of hi...,Borrowed Wives Peter Foley (Rex Lease) is a be...,895
917,1930,Extravagance,Phil Rosen,https://en.wikipedia.org/wiki/Extravagance_(film),Alice Kendall is the darling of her social set...,Extravagance Alice Kendall is the darling of h...,917
935,1930,He Knew Women,Hugh Herbert,https://en.wikipedia.org/wiki/He_Knew_Women,"Geoffrey Clarke is a poor poet, who has his ey...","He Knew Women Geoffrey Clarke is a poor poet, ...",935
995,1930,Playing Around,Mervyn LeRoy,https://en.wikipedia.org/wiki/Playing_Around,Alice White plays the part of a working class ...,Playing Around Alice White plays the part of a...,995


# **Product Quantization**

# **References**



1.   https://www.learndatasci.com/tutorials/building-recommendation-engine-locality-sensitive-hashing-lsh-python/
2. https://towardsdatascience.com/comprehensive-guide-to-approximate-nearest-neighbors-algorithms-8b94f057d6b6

