In [None]:
import sys
import os

# Get the current working directory of the Jupyter notebook
notebook_directory = os.getcwd()

# Assuming the notebook is in the 'bin/' folder, add the parent directory to sys.path
parent_directory = os.path.dirname(notebook_directory)
sys.path.append(parent_directory)

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
import random

In [None]:
import spacy

#Run the following commands on terminal:
# conda install spacy
# python -m spacy download en_core_web_sm

In [None]:
# NLTK resources needed by the text-cleaning helpers in this notebook
# (stop-word list, WordNet lemmatizer, tokenizer and POS tagger).
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# spaCy English pipeline; used by remove_people_names to find person names in the text (#todo)
nlp = spacy.load("en_core_web_sm")

In [None]:
#Hello World code for TF-IDF:

from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: in the notebook as shown, this is the only import of cosine_similarity;
# get_cosine_similarity (defined in a later cell) relies on it, so this cell must run first.
from sklearn.metrics.pairwise import cosine_similarity

# Example documents
documents = ['the sky is blue', 'the sun is bright', 'the sun in the sky is bright', 'we can see the shining sun, the bright sun']

# Create the transform
# NOTE: `vectorizer` is a demo instance here; it is rebound later to the model configured for the movie overviews.
vectorizer = TfidfVectorizer()

# Tokenize and build vocab
tfidf_matrix = vectorizer.fit_transform(documents)

# Compute cosine similarity between all pairs
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

#print(cosine_sim)

**Overall Recommender System:**

Context: The current group preferences (filters), and overall movie data set + properties

Input: All movies voted on by a user

Output: The next M movies to recommend to the user (e.g. M = 5 or 10, so the user doesn't have to wait for loading times after every vote).

**Recommender Algorithm:**

Content-based filtering with TF-IDF and cosine similarity

1. Preprocess data:
    - Get all movie overview strings
    - Tokenize the strings (break into words)
    - Clean up data not useful for comparison (stopwords, numbers, etc.)
    - Stemming / lemmatization (reduce words to their root form)
    <p> <br> </p>
2. TF-IDF vector of words:
    - Convert all the descriptions into vectors using TF-IDF
    - Convert categorical features like genre into binary features using one-hot encoding
    - Normalize numerical features such as release year and user ratings to ensure they are on the same scale as other features (0-1)
    - Combine all 3 into one total vector describing the movie
    <p> <br> </p>
3. Calculate user profile as a weighted average vector of the feature vectors of all liked movies so far. Should be same size as the vector for each movie.
    - We could later introduce logic to use disliked movies in the algorithm, though I don't think we should.
    <p> <br> </p>
4. Generate recommendations:
    - Whenever the user votes (or after every N votes, to be more efficient), recalculate the user profile vector.
    - Whenever the client requests the next M top movies: calculate the cosine similarity between the current user profile and every candidate movie in the database. Specifically, candidate movies = all movies matching the group filters and not yet swiped by the user.
    - Time complexity = O(number of movies x number of features per movie), i.e. linear in the total matrix size.
    - Return the top M = 10 movies with highest cosine similarity.
     <p> <br> </p>
5. Handle new users who have not swiped yet:
    - The initial recommendation just applies the group filters and sorts by IMDb rating.
    - Future versions can try to present a more diverse set of initial movies to get better user input, leading to better subsequent recommendations.
    <p> <br> </p>

In [None]:
#Load movie dataset
# The path is relative to the notebook's working directory.
# NOTE(review): `df` is rebound to the dense TF-IDF table in a later cell, so this cell has to be
# re-run before re-running any cell that reads the movie columns from `df`.
df = pd.read_csv("../amf.csv")

# Replace missing titles/overviews with empty strings so the text-cleaning code always receives str values.
df['original_title'] = df['original_title'].fillna('')
df['overview'] = df['overview'].fillna('')

In [None]:
#Get string columns as lists. We won't use title for TF-IDF, just for verification purposes
# NOTE(review): `id` shadows the Python builtin id() for the rest of the kernel session;
# consider renaming it (e.g. movie_ids) once every user of this name is known.
id = df['id'].tolist()
titles = df['original_title'].tolist()
overviews = df['overview'].tolist()

# Sanity check: show the first five raw overviews
print(overviews[:5])

In [None]:
from imdb import Cinemagoer
import timeit

# Create an instance of the Cinemagoer class
cg = Cinemagoer()

# Function to get movie description by IMDb ID
def get_movie_description(imdb_id):
    """
    Look up a movie through the module-level Cinemagoer client and return its data as a plain dict.

    The time spent in the lookup is printed on every call.

    :param imdb_id: IMDb ID handed unchanged to cg.get_movie
    :return: dict mapping every entry of movie.current_info that is also a key of the movie to its value
    """
    started_at = timeit.default_timer()
    movie = cg.get_movie(imdb_id)
    elapsed = timeit.default_timer() - started_at
    print("get_movie_description took {} seconds to run".format(elapsed))

    # Copy only those current_info entries that the movie object actually exposes as keys
    return {info: movie[info] for info in movie.current_info if info in movie}


# The ID is given without the 'tt' prefix; get_cleaned_synopsis_for_movies strips that prefix
# before calling get_movie_description.
imdb_id = '0111161'  # Example IMDb ID 'tt0111161' for "The Shawshank Redemption"
print(get_movie_description(imdb_id))

In [None]:
#Lemmatization stuff

def nltk_tag_to_wordnet_tag(nltk_tag):
    """
    Translate a POS tag produced by nltk.pos_tag into the matching WordNet POS constant.

    Only the first letter of the tag decides the result.

    :param nltk_tag: str, POS tag of a token
    :return: wordnet.ADJ / wordnet.VERB / wordnet.NOUN / wordnet.ADV, or None when the tag
             starts with none of 'J', 'V', 'N', 'R'
    """
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    # No WordNet equivalent for this tag
    return None
    
def lemmatize_sentence(sentence, lemmatizer):
    """
    Lemmatize every token of a sentence, using its part of speech where one is available.

    :param sentence: str, text to tokenize and lemmatize
    :param lemmatizer: object with a lemmatize(word, pos) method (clean_docs passes a WordNetLemmatizer)
    :return: str, the processed tokens joined by single spaces
    """
    lemmas = []
    for token, treebank_tag in pos_tag(word_tokenize(sentence)):
        wordnet_pos = nltk_tag_to_wordnet_tag(treebank_tag)
        if wordnet_pos is not None:
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            # No usable POS for this token: keep it exactly as tokenized
            lemmas.append(token)
    return " ".join(lemmas)

In [None]:
#Function to delete people's names from descriptions (like Harry, Ron, etc.)

def remove_people_names(text):
    """
    Delete person names from a text, using the PERSON entities found by the module-level spaCy pipeline `nlp`.

    Every whole-word occurrence of a detected name is removed, not only the occurrence spaCy tagged.
    The whitespace around a removed name is left in place (remove_stops later collapses double spaces).

    :param text: str, the text to process
    :return: str, the text with person names removed
    """
    import re  # local import: only this helper needs regular expressions

    doc = nlp(text)

    # Unique names, longest first, so "Harry Potter" is removed before "Harry" and no
    # orphaned " Potter" is left behind. The name itself is the tie-breaker to keep the order deterministic.
    people = sorted(
        {ent.text for ent in doc.ents if ent.label_ == 'PERSON'},
        key=lambda name: (-len(name), name),
    )

    for person in people:
        # Whole-name match only: the lookarounds stop "Ron" from also deleting the start of "Ronald".
        text = re.sub(r'(?<!\w)' + re.escape(person) + r'(?!\w)', '', text)

    return text

In [None]:
#Removes stops, punctuations, digits, and double spaces.
def remove_stops(text, stops):
    """
    Remove stop words, punctuation and digits from a text and collapse repeated spaces.

    Stop words are matched case-insensitively, so a capitalized "The" is removed by the
    lowercase stop word "the". Words that are kept retain their original casing.

    :param text: str, text whose words are separated by whitespace
    :param stops: iterable of str, the stop words to drop
    :return: str, the cleaned text; a single leading/trailing space can remain where
             punctuation or digits were removed at the edge
    """
    # Build the lookup once: set membership is O(1) per word instead of scanning a list.
    stop_set = {stop.lower() for stop in stops}

    kept_words = [word for word in text.split() if word.lower() not in stop_set]
    final = " ".join(kept_words)

    # Strip punctuation, then every character Python considers a digit.
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join(ch for ch in final if not ch.isdigit())

    # Removing whole tokens (e.g. a lone "," or "3") leaves runs of spaces behind; collapse them.
    while "  " in final:
        final = final.replace("  ", " ")
    return final


#take in a list of strings and clean them up for use in TF-IDF
def clean_docs(docs):
    """
    Prepare raw texts for TF-IDF: lemmatize each one, then strip stop words, punctuation and digits.

    :param docs: iterable of str, the raw texts
    :return: list of str, one cleaned text per input, in input order
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    english_stops = stopwords.words("english")

    def _clean(raw_text):
        # Person-name removal is currently switched off: remove_people_names(raw_text)
        lemmatized = lemmatize_sentence(raw_text, wordnet_lemmatizer)
        without_stops = remove_stops(lemmatized, english_stops)
        # An apostrophe-s ('s) ends up as a stand-alone "s" word after cleaning; drop it
        return without_stops.replace(' s ', ' ')

    return [_clean(raw_text) for raw_text in docs]

In [None]:
#FYI - Stop words that will be deleted by the remove_stops function:
# (the same NLTK English list that clean_docs loads internally)
stops = stopwords.words("english")
print(stops)
print(len(stops))

In [None]:
#[10 mins to run] Get the cleaned overviews that will be fed into the TF-IDF function
# clean_docs returns one cleaned string per input, so the result lines up index-for-index with `overviews` and `titles`.
cleaned_overviews = clean_docs(overviews)
# Sanity check: show the first five cleaned overviews
print(cleaned_overviews[:5])

In [None]:
def get_cleaned_synopsis_for_movies(imdb_movie_id):
    """
    Fetch a movie's synopsis from IMDb and return it cleaned for TF-IDF.

    :param imdb_movie_id: str, IMDb ID with or without the leading "tt" (e.g. "tt0468569")
    :return: str, the first synopsis of the movie after clean_docs processing
    :raises KeyError: if the fetched movie data has no "synopsis" entry
    :raises IndexError: if the synopsis list is empty
    """
    # Strip only a leading "tt"; replace() would also delete any later "tt" in the string.
    if imdb_movie_id.startswith("tt"):
        imdb_movie_id = imdb_movie_id[2:]

    movie_synopsis = get_movie_description(imdb_movie_id)["synopsis"]  # List of strings

    # Only the first synopsis is returned, so clean just that one instead of the whole list.
    return clean_docs(movie_synopsis[:1])[0]

# Example IMDb IDs in full "tt..." form; get_cleaned_synopsis_for_movies removes the prefix itself.
imdb_movie_ids = ["tt1457767", "tt0468569"]
# One cleaned synopsis string per ID (each call goes through get_movie_description)
cleaned_synopsis_list = [get_cleaned_synopsis_for_movies(imdb_movie_id) for imdb_movie_id in imdb_movie_ids]
print(cleaned_synopsis_list)


In [None]:
#Generate vectorizer model. Takes about 11 seconds
# NOTE: this rebinds `vectorizer`, replacing the default-configured instance from the "Hello World" cell.
vectorizer = TfidfVectorizer(
                                lowercase=True,          # lower-case the text before tokenizing
                                max_features= 5000,      # cap the vocabulary at 5000 terms
                                max_df=0.8,              # ignore terms found in more than 80% of the documents
                                min_df=5,                # ignore terms found in fewer than 5 documents
                                ngram_range = (1,3),     # use unigrams, bigrams and trigrams
                                stop_words = "english"   # scikit-learn's built-in English stop-word list

                            )

# Fit on the cleaned overviews and build the sparse TF-IDF matrix: one row per overview, one column per term.
vectors = vectorizer.fit_transform(cleaned_overviews)

# Term belonging to each column of `vectors`
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Dense copy of the TF-IDF matrix.
# NOTE(review): toarray() materializes every zero entry, so memory grows with rows x features — confirm it fits for the full dataset.
dense_vectors = vectors.toarray()
# NOTE(review): this rebinds `df`, which held the movie dataset loaded from amf.csv. Cells that read
# df['id'] / df['original_title'] / df['overview'] fail if re-run after this one; consider a distinct
# name here (the "Top values" cell reads this table through `df` as well).
df = pd.DataFrame(dense_vectors, columns=feature_names)
print(df)


In [None]:
#Top values from TF-IDF tester
# `df` is the dense TF-IDF table here (one column per term), not the movie dataset.
# Row 892 is the row labelled "The Wizard of Oz" in the cosine-similarity test cell; show its ten highest-weighted terms.
top_values = df.iloc[892].sort_values(ascending=False)[:10]
print(top_values)

In [None]:
# Peek at the sparse TF-IDF rows of the first ten movies
print(vectors[:10])

In [None]:
# Project the synopses into the feature space already learned from the overviews.
# transform() is deliberate: fit_transform() would re-fit the shared vectorizer on just these
# few documents, which (a) fails because min_df=5 exceeds the number of documents and
# (b) would throw away the vocabulary that the columns of `vectors` are based on.
synopsis_vectors = vectorizer.transform(cleaned_synopsis_list)
synopsis_feature_names = vectorizer.get_feature_names_out()
dense_synopsis_vectors = synopsis_vectors.toarray()
# One row per synopsis, one column per term of the fitted vocabulary
synopsis_df = pd.DataFrame(dense_synopsis_vectors, columns=synopsis_feature_names)
print(synopsis_df)

In [None]:
#This calculates Cosines similarity between 2 vectors (movies).

#Note: Cosine similarity expects 2D matrices. 
#To perform cosine similarity on vectors, remember to reshape the vector in the 2D shape (1, N), where N is the vector length.
#to-do: Update this function to become a weighted cosine, using weights from a file.
def get_cosine_similarity(movie_vector_1, movie_vector_2):
    """
    Cosine similarity between two sets of movie vectors.

    Both arguments must be 2D; a single vector has to be shaped (1, N).

    :param movie_vector_1: 2D matrix of row vectors
    :param movie_vector_2: 2D matrix of row vectors
    :return: whatever sklearn's cosine_similarity returns for the two inputs
    """
    return cosine_similarity(movie_vector_1, movie_vector_2)

In [None]:
#Testing Cosine Similarity

# (row of first movie, row of second movie) pairs to compare
comparison_rows = [
    (0, 1),        # Toy Story vs Jumanji
    (4766, 5678),  # Harry Potter 1 (TPS) vs Harry Potter 2 (TCoS)
    (4766, 892),   # Harry Potter 1 (TPS) vs The Wizard of Oz
]

for first_row, second_row in comparison_rows:
    movie_vector_1 = vectors[first_row]
    movie_vector_2 = vectors[second_row]
    print(get_cosine_similarity(movie_vector_1, movie_vector_2))

In [None]:
#Get the top movies relating to a given movie vector using cosine similarity. 
#2 use cases for this:
# 1. given_movie_vector = a specific movie's TF-IDF vector. This will return top movies relating to that movie.
# 2. given_movie_vector = user_profile's vector. This will return top movies recommended for this user. 

def get_top_movies_cosine(tfidf_matrix, given_movie_vector, movie_titles, top_n=5, skip_self=None):
    """
    Rank every movie in tfidf_matrix by cosine similarity to given_movie_vector, then print and return the best top_n.

    :param tfidf_matrix: 2D matrix with one TF-IDF row per movie
    :param given_movie_vector: 2D (1, N) query vector: either one movie's row or a user profile
    :param movie_titles: sequence of titles indexed like the rows of tfidf_matrix
    :param top_n: int, number of movies to return
    :param skip_self: True always drops the best match (the query movie itself), False never does.
                      None (default) drops it only when its similarity is 1, i.e. when the query is
                      a row of the matrix. A user profile normally matches no row exactly, so its
                      best recommendation is kept.
    :return: list of (title, row_index, score) tuples, most similar first
    """
    # Similarity of the query to every row of the matrix, as a flat array
    cosine_similarities = get_cosine_similarity(given_movie_vector, tfidf_matrix).flatten()

    # Row indices ordered from most to least similar
    ranked_indices = cosine_similarities.argsort()[::-1]

    if skip_self is None:
        # A score of 1 (up to rounding) means the best match is the query vector itself.
        skip_self = (
            ranked_indices.size > 0
            and abs(cosine_similarities[ranked_indices[0]] - 1.0) < 1e-9
        )

    first = 1 if skip_self else 0
    similar_indices = ranked_indices[first:first + top_n]

    # Get the scores for the top_n movies
    similar_scores = cosine_similarities[similar_indices]

    # Combine titles, indices and scores into a list of tuples
    top_movies = [(movie_titles[index], index, score) for index, score in zip(similar_indices, similar_scores)]

    print(f"Top similar movies to the provided movie vector:\n")
    for num, (title, index, score) in enumerate(top_movies, start = 1):
        print(f"{num}. \"{title}\" at ROW {index} with similarity score: {score}")

    return top_movies

In [None]:
# Ten movies most similar to the movie in row 162; the trailing ';' suppresses the notebook's echo of the returned list.
get_top_movies_cosine(vectors, vectors[162], titles, 10);

In [None]:
#Calculate updated user profile after they have voted on M movies. 
# M = 1 means immediate feedback loop. But it may not be ideal. It might bias our recommendations towards our initial dataset (High exploit, low explore)
# I think M = 5 or 10 might be better. 
# An even better idea is a hybrid of the above. M = 10 inititally, and after some votes M --> 1. 

def update_user_profile_batch(user_profile, movie_vectors, ratings, M):
    """
    Fold a batch of votes into the user profile and return the re-normalized result.

    Liked movies are added to the profile; disliked movies are subtracted, scaled down by dislike_factor.

    :param user_profile: scipy.sparse matrix, the current user profile vector (1, N)
    :param movie_vectors: list of scipy.sparse matrices, the TF-IDF vectors of the rated movies [(1, N), (1, N), ...]
    :param ratings: list of str, one rating per movie vector ('like' or 'dislike')
    :param M: int, minimum batch size; only used to validate the input, every supplied rating is processed
    :return: scipy.sparse matrix, the updated, L2-normalized user profile vector (1, N)
    :raises ValueError: if the two lists differ in length, hold fewer than M entries,
                        or a rating is neither 'like' nor 'dislike'
    """
    dislike_factor = 1/3 #we can tweak this to see impact on recommendations.

    vector_count = len(movie_vectors)
    if vector_count != len(ratings):
        raise ValueError("The number of movie vectors and ratings must be the same")
    if vector_count < M:
        raise ValueError("The number of movie vectors must be at least M")

    # Accumulate the net effect of the whole batch before touching the profile
    batch_delta = csr_matrix((1, user_profile.shape[1]))
    for vector, vote in zip(movie_vectors, ratings):
        if vote not in ('like', 'dislike'):
            raise ValueError("Rating must be 'like' or 'dislike'")
        if vote == 'like':
            batch_delta += vector
        else:
            batch_delta -= (dislike_factor * vector)

    # Apply the batch, then scale the profile back to unit length
    return normalize(user_profile + batch_delta, norm='l2', axis=1)


In [None]:
#Example usage of User Profile Update:

# In our app, we should initialize user_profile as a 1-D sparse matrix of zeros when the User() is created.
# i.e. user_profile should be a property of the User() object.

# Number of TF-IDF features, i.e. the number of columns of `vectors`
VECTOR_LENGTH = vectors.shape[1] #This could be assigned as a global variable. Once we settle on an algorithm, this should not change. 

# Start from an all-zero (1, VECTOR_LENGTH) profile: a user who has not voted yet
user_profile = csr_matrix((1, VECTOR_LENGTH)) #Sparse matrix for quick maths. (e.g. 2 + 2 is 4. Minus 1 that's 3)
print(type(user_profile), user_profile.shape)

# Pretend the user rated the movies in rows 0-4; ratings[i] is the vote for movie_vectors[i]
movie_vectors = [vectors[i] for i in range(5)]  # Replace with actual indices of movies the user rated
ratings = ['like', 'dislike', 'like', 'like', 'dislike']  # Example ratings

#For display purposes:
print('Displaying rated movies:')
# titles[i] matches movie_vectors[i] only because the example uses rows 0-4 in order
for i, _ in enumerate(movie_vectors):
    print(f"{i}. {titles[i]} - {ratings[i]}")

# Update the profile based on user ratings of M movies
M = 5
user_profile = update_user_profile_batch(user_profile, movie_vectors, ratings, M)

In [None]:
#Now that the user profile has been updated, get the top 10 recommendations for this user:
# get_top_movies_cosine prints the ranking itself; the surrounding print() additionally shows
# the returned list of (title, row, score) tuples.
print(get_top_movies_cosine(vectors, user_profile, titles, 10))

In [None]:
# Placeholder cell for trying out git merges on this notebook; not part of the recommender.
print("testing git merge conflicts after removing cell outputs")