In [3]:
import sys
import os

# Get the current working directory of the Jupyter notebook
notebook_directory = os.getcwd()

# Assuming the notebook is in the 'bin/' folder, add the parent directory to sys.path
parent_directory = os.path.dirname(notebook_directory)
sys.path.append(parent_directory)

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
import random

In [1]:
import spacy

#Run the following commands on conda:
# conda install spacy
# python -m spacy download en_core_web_sm

In [7]:
print(sys.executable)

c:\Users\detab\anaconda3\python.exe


In [4]:
#I needed to download these files for word-edit functions like stopwords and lemmatization to work. 
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#This is needed for removing names from the text (#todo)
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\detab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\detab\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\detab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\detab\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
#Hello World code for TF-IDF:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Example documents
documents = ['the sky is blue', 'the sun is bright', 'the sun in the sky is bright', 'we can see the shining sun, the bright sun']

# Create the transform
vectorizer = TfidfVectorizer()

# Tokenize and build vocab
tfidf_matrix = vectorizer.fit_transform(documents)

# Compute cosine similarity between all pairs
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

#print(cosine_sim)

**Overall Recommender System:**

Context: The current group preferences (filters), and overall movie data set + properties

Input: All movies voted on by a user

Outputs: Next M = 10 movies to recommend to the user. (Say M = 5 or 10, so the user doesn't have to wait for loading times after every vote)

**Recommender Algorithm:**

Content-based filtering with TFIDF and Cosine Similarity

1. Preprocess data:
    - Get all movie overview strings
    - Tokenize the strings (break into words)
    - Clean up data not useful for comparison (stopwords, numbers, etc.)
    - Stemming/ Lemmatization (reduce words to root form)
    <p> <br> </p>
2. TF-IDF vector of words:
    - Convert all the descriptions into vectors using TF-IDF
    - Convert categorical features like genre into binary features using one-hot encoding
    - Normalize numerical features such as release year and user ratings to ensure they are on the same scale as other features (0-1)
    - Combine all 3 into one total vector describing the movie
    <p> <br> </p>
3. Calculate user profile as a weighted average vector of the feature vectors of all liked movies so far. Should be same size as the vector for each movie.
    - We could later introduce logic to use disliked movies in algorithm, though I don't think we should.
    <p> <br> </p>
4. Generate recommendations:
    - Whenever user makes a vote: (or N votes, to be more efficient), recalculate user profile vector.
    - Whenever client requests next M top movies: Calculate cosine similarity between current user profile and every candidate movie in database. Specifically, candidate movies = all movies matching group filters and not yet swiped by user.
    - Time complexity = O(No. of movies x no. of features per movie). i.e. Linear time wrt total matrix size.
    - Return the top M = 10 movies with highest cosine similarity.
     <p> <br> </p>
5. Handle new users who have not swiped yet:
    - Initial recommendation just filters by group filters and sorts by IMDB ratings.
    - Future versions can try to present a more diverse set of initial movies to get better user input, leading to better subsequent recommendations.
    <p> <br> </p>

In [6]:
#Load movie dataset
df = pd.read_csv("../amf.csv")

df['original_title'] = df['original_title'].fillna('')
df['overview'] = df['overview'].fillna('')

In [7]:
#Get string columns as lists. We won't use title for TF-IDF, just for verification purposes
id = df['id'].tolist()
titles = df['original_title'].tolist()
overviews = df['overview'].tolist()

print(overviews[:5])

["Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.", "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.", "A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is

In [8]:
#Lemmatization stuff

def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    
def lemmatize_sentence(sentence, lemmatizer):
    # Tokenize the sentence and find the POS tag for each token
    nltk_tagged = pos_tag(word_tokenize(sentence))  
    # Tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            # else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [12]:
#Function to delete people's names from descriptions (like Harry, Ron, etc.)

def remove_people_names(text):
    # Create a spaCy document
    doc = nlp(text)
    
    # Generate a list of entities that are NOT people
    entities = [ent.text for ent in doc.ents if ent.label_ != 'PERSON']
    # Generate a list of entities that are people to replace them from the original text
    people = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']

    # Replace people's names with an empty string
    for person in people:
        text = text.replace(person, '')

    # Rejoin entities that are not people to form the processed text
    # This step may or may not be necessary based on how you want to use the result
    #text = ' '.join(entities)
    
    return text

In [19]:
#Removes stops, punctuations, digits, and double spaces.
def remove_stops(text, stops):
    words = text.split()
    final = []
    for word in words:
        if word not in stops:
            final.append(word)
    final = " ".join(final)
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  ", " ")
    return (final)


#take in a list of strings and clean them up for use in TF-IDF
def clean_docs(docs):
    lemmatizer = WordNetLemmatizer()
    stops = stopwords.words("english")
    final = []
    for doc in docs:
        clean_doc = remove_people_names(doc)
        clean_doc = lemmatize_sentence(clean_doc, lemmatizer)
        clean_doc = remove_stops(clean_doc, stops)
        #Handling weird issue where apostrophe-s ('s) --> s as separate words in cleaned version
        clean_doc = clean_doc.replace(' s ', ' ')
        final.append(clean_doc)
    return (final)

In [17]:
#FYI - Stop words that will be deleted by the remove_stops function:
stops = stopwords.words("english")
print(stops)
print(len(stops))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [20]:
#[10 mins to run] Get the cleaned overviews that will be fed into the TF-IDF function
cleaned_overviews = clean_docs(overviews)
print(cleaned_overviews[:5])

['Led Woody toys live happily room birthday bring onto scene Afraid lose place heart Woody plots But circumstance separate Woody owner duo eventually learn put aside difference ', 'When sibling discover enchanted board game open door magical world unwittingly invite adult trap inside game year living room hope freedom finish game prove risky three find run giant rhinoceros evil monkey terrifying creature ', 'A family wedding reignite ancient feud nextdoor neighbor fishing buddy Meanwhile sultry Italian divorcée open restaurant local bait shop alarm local worry ll scare fish away But less interested seafood cook hot time ', 'Cheated mistreat step woman hold breath wait elusive good man break string lessthanstellar lover Friends confidant Vannah talk determine find good way breathe ', 'Just recover daughter wedding receive news pregnant wife expect He plan sell home plan like change arrival grandchild kid ']


In [21]:
#Generate vectorizer model. Takes about 11 seconds
vectorizer = TfidfVectorizer(
                                lowercase=True,
                                max_features= 5000,
                                max_df=0.8,
                                min_df=5,
                                ngram_range = (1,3),
                                stop_words = "english"

                            )

vectors = vectorizer.fit_transform(cleaned_overviews)

feature_names = vectorizer.get_feature_names_out()

In [22]:
dense_vectors = vectors.toarray()
df = pd.DataFrame(dense_vectors, columns=feature_names)
print(df)


       abandon  abandoned  abduct  ability  able  aboard  abortion  abroad  \
0          0.0        0.0     0.0      0.0   0.0     0.0       0.0     0.0   
1          0.0        0.0     0.0      0.0   0.0     0.0       0.0     0.0   
2          0.0        0.0     0.0      0.0   0.0     0.0       0.0     0.0   
3          0.0        0.0     0.0      0.0   0.0     0.0       0.0     0.0   
4          0.0        0.0     0.0      0.0   0.0     0.0       0.0     0.0   
...        ...        ...     ...      ...   ...     ...       ...     ...   
45461      0.0        0.0     0.0      0.0   0.0     0.0       0.0     0.0   
45462      0.0        0.0     0.0      0.0   0.0     0.0       0.0     0.0   
45463      0.0        0.0     0.0      0.0   0.0     0.0       0.0     0.0   
45464      0.0        0.0     0.0      0.0   0.0     0.0       0.0     0.0   
45465      0.0        0.0     0.0      0.0   0.0     0.0       0.0     0.0   

       abruptly  absence  ...  young woman  younger  youngster 

In [23]:
#Top values from TF-IDF tester

top_values = df.iloc[892].sort_values(ascending=False)[:10]
print(top_values)

thing          0.326274
man make       0.288789
yellow         0.282622
make friend    0.273409
wicked         0.269847
lion           0.266609
make way       0.247814
make           0.244373
witch          0.230557
magical        0.223757
Name: 892, dtype: float64


In [24]:
print(vectors[:10])

  (0, 1194)	0.2699305269195095
  (0, 240)	0.3086276239876073
  (0, 2481)	0.18587933355357103
  (0, 1492)	0.21911256371901605
  (0, 1332)	0.27882724925029095
  (0, 3150)	0.21142878669386758
  (0, 3918)	0.24550681415803086
  (0, 715)	0.2614000968288274
  (0, 2032)	0.20934670381420872
  (0, 3271)	0.1825649931923838
  (0, 2598)	0.18073770574407927
  (0, 86)	0.29548835408556207
  (0, 3840)	0.21961735653572254
  (0, 513)	0.17759551521439162
  (0, 428)	0.2592311502134675
  (0, 3766)	0.22932245261784862
  (0, 1994)	0.280089595775914
  (0, 2555)	0.15207718971230008
  (1, 986)	0.17053828436053783
  (1, 4423)	0.20975821986765586
  (1, 2857)	0.23210543873502254
  (1, 1497)	0.14654184637445242
  (1, 1882)	0.1833631053916531
  (1, 3782)	0.1293226747954075
  (1, 3733)	0.24166340432981923
  :	:
  (8, 1625)	0.09833464947821229
  (8, 3335)	0.1734037653669758
  (8, 1024)	0.1977971464318253
  (8, 4181)	0.21009246792756595
  (8, 312)	0.17530337472957144
  (8, 3935)	0.19987707280674943
  (8, 4472)	0.1538610

In [25]:
#This calculates Cosines similarity between 2 vectors (movies).

#Note: Cosine similarity expects 2D matrices. 
#To do cosine similarity on vectors, remember to reshape the vector in the  shape (1, N), where N is the vector length.
def get_cosine_similarity(movie_vector_1, movie_vector_2):

    cosine_sim = cosine_similarity(movie_vector_1, movie_vector_2)
    return cosine_sim

In [26]:
#Testing Cosine Similarity

movie_vector_1 = vectors[0] #Toy Story
movie_vector_2 = vectors[1] #Jumanji

print(get_cosine_similarity(movie_vector_1, movie_vector_2))

movie_vector_1 = vectors[4766] #Harry Potter 1 (TPS)
movie_vector_2 = vectors[5678] #Harry Potter 2 (TCoS)

print(get_cosine_similarity(movie_vector_1, movie_vector_2))

movie_vector_1 = vectors[4766] #Harry Potter 1 (TPS)
movie_vector_2 = vectors[892] #The Wizard of Oz
print(get_cosine_similarity(movie_vector_1, movie_vector_2))

[[0.03847733]]
[[0.0178446]]
[[0.00891219]]


In [27]:
def get_top_movies_cosine(tfidf_matrix, movie_index, movie_titles, top_n=5):
    
    # Compute cosine similarity between the movie at movie_index and all movies in the matrix
    cosine_similarities = cosine_similarity(tfidf_matrix[movie_index], tfidf_matrix).flatten()
    
    # Get the indices of the top_n movies with the highest cosine similarity scores
    # Use argsort and reverse it with [::-1] to get the indices in descending order of similarity
    # Skip the first one as it is the movie itself with a similarity of 1
    similar_indices = cosine_similarities.argsort()[::-1][1:top_n+1]
    
    # Get the scores for the top_n movies
    similar_scores = cosine_similarities[similar_indices]
    
    # Combine indices and scores into a list of tuples and return
    top_movies = [(movie_titles[index], index, score) for index, score in zip(similar_indices, similar_scores)]

    print(f"Top similar movies to {titles[movie_index]}:\n")
    for num, (title, index, score) in enumerate(top_movies, start = 1):
        print(f"{num}. \"{title}\" at ROW {index} with similarity score: {score}")

    return top_movies

In [33]:
get_top_movies_cosine(vectors, 162, titles, 10)

Top similar movies to Die Hard: With a Vengeance:

1. "Broadway Melody of 1940" at ROW 10175 with similarity score: 0.3140144503354083
2. "Loose Cannons" at ROW 6411 with similarity score: 0.2807426588650285
3. "If These Knishes Could Talk: The Story of the NY Accent" at ROW 37657 with similarity score: 0.26426619815728375
4. "The Transfiguration" at ROW 43275 with similarity score: 0.26344914917684265
5. "Khiladi 786" at ROW 40343 with similarity score: 0.2590249725437656
6. "Les Ripoux" at ROW 44766 with similarity score: 0.2573206898414055
7. "Texas Killing Fields" at ROW 19018 with similarity score: 0.23704803406060815
8. "Strictly Ballroom" at ROW 1147 with similarity score: 0.2257942291751761
9. "Shoot the Moon" at ROW 5989 with similarity score: 0.21235341819954417
10. "Strike Force" at ROW 31825 with similarity score: 0.21160490910240262


[('Broadway Melody of 1940', 10175, 0.3140144503354083),
 ('Loose Cannons', 6411, 0.2807426588650285),
 ('If These Knishes Could Talk: The Story of the NY Accent',
  37657,
  0.26426619815728375),
 ('The Transfiguration', 43275, 0.26344914917684265),
 ('Khiladi 786', 40343, 0.2590249725437656),
 ('Les Ripoux', 44766, 0.2573206898414055),
 ('Texas Killing Fields', 19018, 0.23704803406060815),
 ('Strictly Ballroom', 1147, 0.2257942291751761),
 ('Shoot the Moon', 5989, 0.21235341819954417),
 ('Strike Force', 31825, 0.21160490910240262)]

In [None]:
print(f"Top 5 similar movies to movie {movie_index}:\n")
for index, score in similar_movies:
    print(f"Movie {index} with similarity score: {score}\n")