#import the necessary libraries

In [47]:
!python -m spacy download en_core_web_md
!pip install spacy_universal_sentence_encoder
!pip install faiss-cpu
!pip install datasketch
!pip install annoy
!pip install nmslib

2022-11-21 01:16:15.502599: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[K     |████████████████████████████████| 42.8 MB 1.8 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/

# Importing required modules

In [48]:
import pandas as pd
import numpy as np
import time
from datasketch import MinHash, MinHashLSHForest
import re
import faiss
import annoy
import spacy_universal_sentence_encoder
import nmslib
from annoy import AnnoyIndex
import spacy


#import the input dataset

In [49]:
netflix = pd.read_csv("/content/netflix1.csv")

In [50]:
netflix.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


In [51]:
netflix.shape

(8790, 10)

In [52]:
netflix1 = netflix.drop(['show_id', 'type', 'director','country','date_added','rating','duration'], axis=1)

In [53]:
netflix1.head()

Unnamed: 0,title,release_year,listed_in
0,Dick Johnson Is Dead,2020,Documentaries
1,Ganglands,2021,"Crime TV Shows, International TV Shows, TV Act..."
2,Midnight Mass,2021,"TV Dramas, TV Horror, TV Mysteries"
3,Confessions of an Invisible Girl,2021,"Children & Family Movies, Comedies"
4,Sankofa,1993,"Dramas, Independent Movies, International Movies"


In [54]:
netflix['listed_in']

0                                           Documentaries
1       Crime TV Shows, International TV Shows, TV Act...
2                      TV Dramas, TV Horror, TV Mysteries
3                      Children & Family Movies, Comedies
4        Dramas, Independent Movies, International Movies
                              ...                        
8785                    International TV Shows, TV Dramas
8786                                             Kids' TV
8787    International TV Shows, Romantic TV Shows, TV ...
8788                                             Kids' TV
8789                                             Kids' TV
Name: listed_in, Length: 8790, dtype: object

#LSH

In [55]:

#load english language module of spacy
nlp = spacy.load('en_core_web_md')

# get stop words list
stopwords = nlp.Defaults.stop_words

# set permutations
permutations = 128


In [56]:
#lets create shringles based on whitespaces and then remove stop words 
def preprocess(text):
    text = re.sub(r'[^\w\s]','',text)
    #convert to lower case
    tokens = text.lower()
    #split into tokens
    tokens = tokens.split()
    cleaned=[]
    for x in tokens:
      if not x in (stopwords):      
          cleaned.append('{a}'.format(a=x))
    #return the canonical form
    return ' '.join(cleaned)

In [57]:
def get_forest(final_dataset, perms):
    minhash = []
    for text in netflix1['listed_in']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
    forest = MinHashLSHForest(num_perm=perms)
    for i,m in enumerate(minhash):
        forest.add(i,m)
    forest.index()    
    return forest

In [58]:
def predict(text, database, perms, num_results, forest):
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))    
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None 
    result = database.iloc[idx_array]['title']    
    return result

In [59]:
#get the forest
forest = get_forest(netflix1, permutations)


In [60]:
# choose the no.of recommendations
num_recommendations = 10

#select the title for which we need recommendations
title = 'Midnight Mass'

print("The Most Similar Movie To: '{title}' are listed below:".format(title=title))

#get the results by running predict
result = predict(title, netflix1, permutations, num_recommendations, forest)
print("\n")
print("#"*100)
for x,y in enumerate(result):
    print("{x}.) {y}".format(x=x+1,y=y))

The Most Similar Movie To: 'Midnight Mass' are listed below:


####################################################################################################
1.) Good Witch
2.) A Boy Called Po
3.) Elizabeth Harvest
4.) A Monster Calls
5.) At First Light
6.) Charmed
7.) Rememory
8.) IO
9.) The Magicians
10.) Trese After Dark


#Google Universal Sentence Encoder is used to create vectors for using in other ANN alogrithms

In [78]:
# going to use medium English language module of universal sentence encoder using Spacy
nlp = spacy_universal_sentence_encoder.load_model('en_use_md')

In [79]:
#picking the listed_in categories for all movies and converting it to vector
listed_in = netflix1.iloc[:,2].values
vector_for_netflix_title=[]
for x in listed_in:
  vector_for_netflix_title.append(nlp(x).vector)

In [80]:
#picking the listed_in for all movies and converting it to vector
listed_in = netflix1.iloc[:,2].values
vector_for_netflix_title=[]
for x in listed_in:
  vector_for_netflix_title.append(nlp(x).vector)
#creating dictionary with title and listed_in 
final=dict()
final['title'] = netflix1.iloc[:,0].values
final['listed_in']=np.array(vector_for_netflix_title)

In [81]:
final['title']

array(['Dick Johnson Is Dead', 'Ganglands', 'Midnight Mass', ...,
       'Zindagi Gulzar Hai', 'Yoko', 'YOM'], dtype=object)

In [82]:
#view the shape of vector created by universal sentence encoder
final['listed_in'].shape

(8790, 512)

#Exhaustive Search

In [83]:
#exaustive search class
class Exhaustive():
    def __init__(self, vectors, labels):
      #get the shape of the vector
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
   
    def build(self):
          #faiss.IndexFlatL2 slices the input vectors in chunks smaller than blocksize_add and calls add_core
        self.index = faiss.IndexFlatL2(self.dimension,)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=11):
        distances, indices = self.index.search(vectors, k)
        # I expect only query on one vector thus the slice
        return [self.labels[i] for i in indices[0]]

In [84]:
#create object for the class and call the build function
index = Exhaustive(final["listed_in"], final['title'])
index.build()

In [85]:
#extract the top 10 similar movies for a title
movie_listed_in = final['listed_in'][0:1]
print("The Most Similar movies To: '{movie_title}' are listed below:".format(movie_title=final['title'][0]))
print("\n")
print("#"*100)
for x,y in enumerate(index.query(movie_listed_in)):
  if x!=0:
    print("{x}.) {y}".format(x=x,y=y))

The Most Similar movies To: 'Dick Johnson Is Dead' are listed below:


####################################################################################################
1.) My Heroes Were Cowboys
2.) 9to5: The Story of a Movement
3.) Why Did You Kill Me?
4.) Headspace: Unwind Your Mind
5.) Final Account
6.) Ya no estoy aquí: Una conversación entre Guillermo del Toro y Alfonso Cuarón
7.) Bob Ross: Happy Accidents, Betrayal & Greed
8.) Lady Boss: The Jackie Collins Story
9.) Bill Hicks: Reflections
10.) Fantastic Fungi


#TREE
#Annoy

In [86]:
#annoy class
class AnnoyIndex():
    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
   
    def build(self, number_of_trees=5):
        self.index = annoy.AnnoyIndex(self.dimension)
        for i, vec in enumerate(self.vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(number_of_trees)
        
    def query(self, vector, k=11):
        indices = self.index.get_nns_by_vector(
              vector.tolist(), 
              k)                                           
        return [self.labels[i] for i in indices]

In [87]:
#build the annoy index
index = AnnoyIndex(final["listed_in"], final["title"])
index.build()

  if __name__ == '__main__':


In [88]:
#extract the top 10 similar movie for a title

movie_listed_in=index.query(final["listed_in"][0])

print("The Most Similar movies To: '{movie_title}' are listed below:".format(movie_title=movie_listed_in[0]))
print("\n")
print("#"*100)
for x,y in enumerate(movie_listed_in):
  if x!=0:
    print("{x}.) {y}".format(x=x,y=y))

The Most Similar movies To: 'Dick Johnson Is Dead' are listed below:


####################################################################################################
1.) My Heroes Were Cowboys
2.) 9to5: The Story of a Movement
3.) Why Did You Kill Me?
4.) Headspace: Unwind Your Mind
5.) Final Account
6.) Ya no estoy aquí: Una conversación entre Guillermo del Toro y Alfonso Cuarón
7.) Bob Ross: Happy Accidents, Betrayal & Greed
8.) Lady Boss: The Jackie Collins Story
9.) Bill Hicks: Reflections
10.) Fantastic Fungi


#HNSW

In [89]:
#hnsw class created using using nmslib
class NMSLIBIndex():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels
    def build(self):
        self.index = nmslib.init(method='hnsw', space='cosinesimil')
        self.index.addDataPointBatch(self.vectors)
        self.index.createIndex({'post': 2})
        
    def query(self, vector, k=11):
        indices = self.index.knnQuery(vector, k=k)
        return [self.labels[i] for i in indices[0]]

In [90]:
#built the grpah
index = NMSLIBIndex(final["listed_in"], final['title'])
index.build()

In [91]:
#extract the top 10 similar movies for a title

movie_listed_in=index.query(final["listed_in"][0])

print("The Most Similar movies To: '{movie_title}' are listed below:".format(movie_title=movie_listed_in[0]))
print("\n")
print("#"*100)
for x,y in enumerate(movie_listed_in):
  if x!=0:
    print("{x}.) {y}".format(x=x,y=y))

The Most Similar movies To: 'Get Me Roger Stone' are listed below:


####################################################################################################
1.) Life 2.0
2.) Print the Legend
3.) My Heroes Were Cowboys
4.) 7 Yards: The Chris Norton Story
5.) Maynard
6.) Reversing Roe
7.) Star Men
8.) Sustainable
9.) The Bad Kids
10.) Catching the Sun


#Product Quantization

In [92]:
#product quantization class using faiss
class IVPQIndex():
    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
    def build(self, 
              number_of_partition=8, 
              search_in_x_partitions=2, 
              subvector_size=8):
        quantizer = faiss.IndexFlatL2(self.dimension)
        self.index = faiss.IndexIVFPQ(quantizer, 
                                      self.dimension, 
                                      number_of_partition, 
                                      search_in_x_partitions, 
                                      subvector_size)
        self.index.train(self.vectors)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        # I expect only query on one vector thus the slice
        return [self.labels[i] for i in indices[0]]

In [93]:
#build the index
index = IVPQIndex(final["listed_in"], final['title'])
index.build()

In [94]:
#extract the top 10 similar movies for a title
movie_listed_in = final['listed_in'][0:1]
print("The Most Similar movies To: '{movie_title}' are listed below:".format(movie_title=final['title'][0]))
print("\n")
print("#"*100)
for x,y in enumerate(index.query(movie_listed_in)):
  if x!=0:
    print("{x}.) {y}".format(x=x,y=y))

The Most Similar movies To: 'Dick Johnson Is Dead' are listed below:


####################################################################################################
1.) My Heroes Were Cowboys
2.) 9to5: The Story of a Movement
3.) Why Did You Kill Me?
4.) Headspace: Unwind Your Mind
5.) Final Account
6.) Ya no estoy aquí: Una conversación entre Guillermo del Toro y Alfonso Cuarón
7.) Bob Ross: Happy Accidents, Betrayal & Greed
8.) Lady Boss: The Jackie Collins Story
9.) Bill Hicks: Reflections
