In [1]:
import pandas as pd 
import re
import numpy as np
import ipywidgets as widgets
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#Load the two CSV data sets required by the application.
#movies.csv provides movieId/title/genres; ratings.csv provides the per-user ratings.
movies = pd.read_csv(filepath_or_buffer="movies.csv")

ratings = pd.read_csv(filepath_or_buffer="ratings.csv")

In [3]:
#Analyze the data type of every column in the movies data set
movies.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [4]:
#Preview the first rows of the movies data
#Descriptive method
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
#Preview the first rows of the ratings data
#Descriptive method
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [6]:
#Function to clean a term so the end users can type titles without using special characters.
def clean_term(term):
    """Return ``term`` with every character other than ASCII letters, digits and spaces removed."""
    disallowed_chars = re.compile("[^a-zA-Z0-9 ]")
    return disallowed_chars.sub("", term)

In [7]:
#Store a cleaned copy of every title in an extra column for faster search and match.
movies["cleanTitle"] = movies["title"].map(clean_term)

#Store a cleaned copy of every genres string in an extra column for faster search and match.
#NOTE(review): clean_term also removes the "|" separator, so a value such as
#"Comedy|Romance" is stored as the single word "ComedyRomance" - confirm this is intended.
movies["cleanGenres"] = movies["genres"].map(clean_term)

In [8]:
#Both models use ngram_range=(1,2): they look at single words and at pairs of
#consecutive words instead of just one word, which makes the search better.

#Machine Learning - main model
#Trained on the cleanTitle column of the movies data set.
#tfidf_title holds one vector per clean title, for later comparison with the user's input.
vectorizer_title = TfidfVectorizer(ngram_range=(1,2))
tfidf_title = vectorizer_title.fit_transform(movies["cleanTitle"])

#Machine Learning - secondary model, used just in case the ratings data is incomplete
#Trained on the cleanGenres column of the movies data set.
#tfidf_genres holds one vector per clean genres string, for later comparison with the user's input.
vectorizer_genres = TfidfVectorizer(ngram_range=(1,2))
tfidf_genres = vectorizer_genres.fit_transform(movies["cleanGenres"])

In [9]:
#Function to search the 5 movie titles that best match the user's input
def search_title(title):
    """Return the rows of ``movies`` whose clean titles are most similar to ``title``.

    Parameters:
        title: text typed by the user; special characters are stripped with clean_term.

    Returns:
        A DataFrame with up to 5 rows of ``movies``, ordered from the most similar
        title to the least similar one, so ``iloc[0]`` is the best match.
    """

    #Remove special characters from the user's input
    title = clean_term(title)

    #Turn the user's input into a vector using the model trained on the clean titles
    query_vec = vectorizer_title.transform([title])

    #Compare the user's input to the clean title matrix: one similarity score per movie
    similarity = cosine_similarity(query_vec, tfidf_title).flatten()

    #Never ask for more rows than there are movies (argpartition would raise otherwise)
    top_n = min(5, similarity.size)
    if top_n == 0:
        return movies.iloc[0:0]

    #argpartition only guarantees that the last top_n positions hold the highest scores,
    #not that they are sorted, so the candidates are explicitly ranked best-first afterwards
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    #Returns the movies with the most similar titles, best match first
    return movies.iloc[ranked]

In [10]:
#Function to search 5 movies with similar genres when the movie lacks reliable ratings
def search_genres(genres):
    """Return the rows of ``movies`` whose clean genres are most similar to ``genres``.

    Parameters:
        genres: genres text to match; special characters are stripped with clean_term,
            the same cleaning that was applied to the cleanGenres column.

    Returns:
        A DataFrame with up to 5 rows of ``movies``, ordered from the most similar
        genres to the least similar ones.
    """

    #Remove special characters from the input and actually use the cleaned value
    #(previously the cleaned text was stored in an unused variable and the raw input was vectorized)
    genres = clean_term(genres)

    #Turn the input into a vector using the model trained on the clean genres
    query_vec = vectorizer_genres.transform([genres])

    #Compare the input to the clean genres matrix: one similarity score per movie
    similarity = cosine_similarity(query_vec, tfidf_genres).flatten()

    #Never ask for more rows than there are movies (argpartition would raise otherwise)
    top_n = min(5, similarity.size)
    if top_n == 0:
        return movies.iloc[0:0]

    #argpartition only guarantees that the last top_n positions hold the highest scores,
    #not that they are sorted, so the candidates are explicitly ranked best-first afterwards
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    #Returns the movies with the most similar genres, best match first
    return movies.iloc[ranked]

In [11]:
#Helper that draws the two descriptive charts for find_similar_movies
def _plot_similarity_charts(rec_percentages):
    """Plot the "similar" and "all" columns against "score" as a line chart and as a bar chart.

    Only the first 30 rows of ``rec_percentages`` are charted, rounded to 2 decimals.
    """
    chart_data = rec_percentages.head(30).round(2)

    #Options shared by both charts
    shared_options = dict(
        x="score",
        y=["similar", "all"],
        figsize=(20,10),
        color=["b", "r"],
        fontsize=12,
        xlabel="Similarity Score",
        ylabel="% of User Group \nthat liked the movie",
        title="Correlation of \n\"similar users\" \nand \n\"all the users\" \nthat liked the searched movie",
    )

    #Visual line chart (no explicit kind is passed, so pandas uses its default)
    #Descriptive method
    chart_data.plot(**shared_options)

    #Visual bar chart of the same relation
    #Descriptive method
    chart_data.plot(kind="bar", **shared_options)


#Function to find movies similar to the searched one, based on the scores that users gave to other movies
#Compares the ratings of the "similar users" with the ratings of all the users
def find_similar_movies(movie_id):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A rating counts as a "like" when it is 4 or higher. "Similar users" are the users
    who liked ``movie_id``. A movie is kept when more than 10% of the similar users
    liked it, and its score is the share of similar users who liked it divided by the
    share of all the users (among those who liked any kept movie) who liked it.

    Parameters:
        movie_id: value compared against the movieId column of ``ratings``.

    Returns:
        A DataFrame with the columns score, title and genres holding the 10 highest
        scored movies; it is empty when no movie qualifies. When it is not empty,
        a line chart and a bar chart are also created as a side effect.
    """
    #Rows of the ratings data set that count as a "like"
    liked = ratings["rating"] >= 4

    #Similar users: everyone who liked the searched movie
    fans = ratings.loc[liked & (ratings["movieId"] == movie_id), "userId"].unique()

    #Every movie liked by the similar users
    fan_likes = ratings.loc[liked & ratings["userId"].isin(fans), "movieId"]

    #Share of similar users who liked each movie, keeping only the ones above 10%
    fan_share = fan_likes.value_counts() / len(fans)
    fan_share = fan_share[fan_share > .1]

    #Likes given by any user to the kept movies
    crowd_likes = ratings[liked & ratings["movieId"].isin(fan_share.index)]
    #Share of all those users who liked each kept movie
    crowd_share = crowd_likes["movieId"].value_counts() / len(crowd_likes["userId"].unique())

    #Put both shares side by side, one row per movie
    rec_percentages = pd.concat([fan_share, crowd_share], axis=1)
    rec_percentages.columns = ["similar", "all"]

    #The score is how much more the similar users liked a movie than the users in general
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    #Highest score first, the higher the better
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    #Attach title and genres to the 10 best scored movies through the movieId
    top_scored = rec_percentages.head(10)
    results = top_scored.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

    #Only draw the charts when there is something to show
    if not results.empty:
        _plot_similarity_charts(rec_percentages)

    #Returns the top 10 scored movies
    return results

In [12]:
#Lists related to the Search Combobox Widget.
#titles_list collects every title already offered by the combobox (on_type appends to it).
#on_type assigns its own local new_titles/unique_titles, so these two module-level lists stay empty.
titles_list, new_titles, unique_titles = [], [], []

#Search combobox where the user types the movie title
movie_name_input = widgets.Combobox(
    description="Movie Title",
    placeholder="Type a Movie Title",
    options=titles_list,
    ensure_options=True,
    disabled=False,
)

#Output area that shows the recommendations once the user's input is processed
recommendation_list = widgets.Output()

#Function to listen to the user's input and display the recommended movies
def on_type(data):
    """Handle a change of the search box value and render the recommendations.

    Parameters:
        data: change notification passed by ``observe``; ``data["new"]`` is read as
            the text currently typed in the search box.

    Everything is rendered inside ``recommendation_list``. Inputs shorter than
    5 characters (after cleaning) only clear the previous output.
    """
    with recommendation_list:

        #Clean the recommendation list
        recommendation_list.clear_output()

        #Create a title based on the user's input
        title = clean_term(data["new"])

        #Process the user's input only when it has more than 4 characters
        if len(title) < 5:
            return

        #Close all the chart visuals before attempting to create new ones
        plt.close('all')

        #Select the chart style BEFORE the charts are created by find_similar_movies,
        #otherwise the charts of the first search are drawn with the previous style
        plt.style.use("Solarize_Light2")

        #Get the 5 movies more similar to the user's input
        results = search_title(title)

        #Populate the combobox only with the titles that were not offered before
        #Workaround to the issue found in Jupyter's Notebook:
        #updating the options of the combobox did not clean the list in the Javascript code,
        #causing duplicated results
        unique_titles = []
        for new_title in results["cleanTitle"].tolist():
            if new_title not in titles_list:
                titles_list.append(new_title)
                unique_titles.append(new_title)
        movie_name_input.options = unique_titles

        #Prefer the movie whose clean title matches the input exactly;
        #otherwise fall back to the first of the 5 similar titles
        exact_match = results[results["cleanTitle"] == title]
        selected_movie = results.iloc[0] if exact_match.empty else exact_match.iloc[0]
        movie_id = selected_movie["movieId"]

        #Find the top 10 recommended movies
        movies_found = find_similar_movies(movie_id)

        #Message displayed only when the ratings are not reliable enough
        message = ""

        #Check if the matching of movies failed due to lack of reliable scores
        if movies_found.empty:

            #Use the genres of the movie that was actually selected above
            #(not blindly the first search result, which may be a different movie)
            movie_genres = selected_movie["cleanGenres"]

            #If no genres are listed for the movie then the recommendation is the list of similar titled movies
            if movie_genres == "no genres listed":
                message = "No reliable scores found, this list of movies with similar titles"
                movies_found = results[["title", "genres"]]

            #If genres are listed for the movie, then the movies with more similar genres are recommended
            else:
                display("Movie Genres: " + movie_genres)
                message = "No reliable scores found, this list of movies with similar genres"
                movies_found = search_genres(movie_genres)[["title", "genres"]]

        #Displays the movie's title and the recommendations
        display("Searched movie: " + title)
        #Skip the message when there is nothing to say instead of rendering an empty string
        if message:
            display(message)
        display("Recommendations:")

        #Non-descriptive method
        #Prediction of what movies the user will like using the ratings dataset
        display(movies_found)

        #Display the charts
        #Descriptive methods
        plt.show()

#Call on_type every time the value of the search box changes
movie_name_input.observe(on_type, names="value")

#Interactive Dashboard
#User-friendly Interface
#Display the search box followed by the output area where the recommendations are rendered
#(the search box starts empty, so nothing is recommended until the user types a title)
display(movie_name_input, recommendation_list)

Combobox(value='', description='Movie Title', placeholder='Type a Movie Title')

Output()