In [101]:
import pandas as pd
#Load the movie metadata from movies.csv in the working directory
movies = pd.read_csv("movies.csv")
#Preview the first rows (movieId, title, genres) to sanity-check the load
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [102]:
#A regular expression (RegEx) is a sequence of characters that forms a search pattern
#It is used to check whether a string contains the specified pattern, and to replace the matches

import re

#Clean movie titles by removing every character that is not a letter, a digit or a space (parentheses, commas, apostrophes, ...)
def clean_title(title):
    """Return ``title`` with every character outside a-z, A-Z, 0-9 and space removed.

    Inside the character class the leading ``^`` negates the set, so
    ``re.sub`` replaces each character that is NOT a letter, digit or space
    with the empty string. Spaces are kept so the words stay separated,
    e.g. ``"Toy Story (1995)"`` becomes ``"Toy Story 1995"``.
    """
    stripped = re.sub("[^a-zA-Z0-9 ]", "", title)
    return stripped
#Add a clean_title column holding the cleaned version of every title
movies["clean_title"] = movies["title"].apply(clean_title)
#Series.apply() takes a function and calls it on each value of the column, returning a new Series
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [1]:
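#import the machine learning library (scikit-learn)
#used to turn the titles into vectors in a matrix (for the search engine)
#a search query is vectorized the same way and compared with these vectors to find similar titles
#NOTE: the saved output of this cell is "ModuleNotFoundError: No module named 'sklearn'"; install scikit-learn in the kernel environment and re-run the cell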
from sklearn.feature_extraction.text import TfidfVectorizer

#ngram_range=(1,2) makes the vectorizer look at pairs of consecutive words as well as single words
vectorizer = TfidfVectorizer(ngram_range=(1,2))

#fit the vectorizer on the cleaned titles and turn them into a TF-IDF matrix (one row per title)
tfidf = vectorizer.fit_transform(movies["clean_title"])


ModuleNotFoundError: No module named 'sklearn'

In [104]:
#Compute the similarity of the terms we enter
#Use cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#write a search to take in words
def search(title, top_n=5):
    """Return the rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_title`` and vectorized with the fitted
    module-level ``vectorizer``; its cosine similarity against every row of
    ``tfidf`` is then computed. Rows of ``tfidf`` line up positionally with
    rows of ``movies`` because ``tfidf`` was built from ``movies["clean_title"]``.

    Parameters
    ----------
    title : str
        Free-text search query.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least
        similar, so ``.iloc[0]`` is the best match. Empty when ``top_n`` is
        not positive or there are no titles to compare against.
    """
    #clean the query the same way the titles were cleaned
    title = clean_title(title)
    #turn the query into a TF-IDF vector using the already fitted vocabulary
    query_vec = vectorizer.transform([title])

    #similarity between the query and every title, flattened to a 1-D array
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    #never ask for more rows than exist (argpartition raises if kth is out of bounds)
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    #np.argpartition selects the k largest scores in linear time, but the
    #order inside that top-k slice is undefined ...
    top = np.argpartition(similarity, -k)[-k:]
    #... so sort just those k indices by similarity, highest first
    top = top[np.argsort(-similarity[top], kind="stable")]

    #positional lookup of the matching rows
    return movies.iloc[top]

In [105]:
#Building an interactive search box with Jupyter
#widgets are interactive controls that can be embedded in notebooks
import ipywidgets as widgets
#import display to show different things in output
from IPython.display import display

#Create a text input widget
#value is the default text shown in the box
#description is the label displayed next to the box
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False
)

#Make an output widget that will hold the search results
#Use widgets.Output()
movie_list = widgets.Output()

#Write a function called on_type that will be called whenever we type something into the box
def on_type(data):
    """Refresh the search results whenever the text box value changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry holds the text currently in the box. Output produced inside the
    ``with`` block is rendered in the ``movie_list`` output widget. Queries of
    five characters or fewer only clear the previous results.
    """
    with movie_list:
        #start from an empty output area on every change
        movie_list.clear_output()
        query = data["new"]
        #too short to search: leave the output area empty
        if len(query) <= 5:
            return
        matches = search(query)
        display(matches)

#Call on_type whenever the text in the box changes
#names='value' restricts the observer to changes of the widget's value
movie_input.observe(on_type, names='value')

#display the input and output widgets
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [106]:
#Reading in movie ratings data
#Goal: find movies similar to the ones we like, based on how other users rated them
#read ratings.csv from the working directory; later cells use its userId, movieId and rating columns
ratings = pd.read_csv("ratings.csv")
#to inspect the data, evaluate: ratings
#to inspect the column types, evaluate: ratings.dtypes

In [107]:
#Finding users who liked the same movie
#first find users who watched the same movie and rated it above 4
#the users should be unique => unique()
movie_id = 1
similar_user = ratings[(ratings["movieId"]==movie_id) & (ratings["rating"] > 4)]["userId"].unique()
#similar_user

#Find the other movies those users also rated above 4
similar_user_recs = ratings[(ratings["userId"].isin(similar_user)) & (ratings["rating"] > 4)]["movieId"]

#Keep only the movies liked by more than 10% of the users similar to us
#value_counts() counts how many times each movie appears
#dividing by the number of similar users turns the counts into a fraction (0-1)
similar_user_recs = similar_user_recs.value_counts() / len(similar_user)

#only take >10%
similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs


1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

In [108]:
#Measure how popular the candidate movies are in general, to compare with the similar users
#goal: find how much all users like these movies
#keep every rating above 4 given to a movie listed in similar_user_recs
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

#Fraction of users who recommended each of these movies
#the denominator is the number of distinct users in all_users, i.e. users who rated
#at least one candidate movie above 4, not every user in ratings
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
#=> this is the share of those users who recommended each movie in similar_user_recs
#=> we want movies whose share among similar users is much bigger than among everybody
#all_users_recs

In [109]:
#Creating a recommendation score
#Goal: Compare the % between similar_user_recs and all_users_recs
#Use pandas concat (joins the two Series side by side, aligned on their movieId index)
#Each series becomes a column
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis = 1)
#name our columns
rec_percentages.columns = ["similar", "all"]
#NOTE: a bare expression in the middle of a cell is not displayed; only the cell's last expression is
rec_percentages

#We want a big difference between the two percentages
#Set a score: the share among similar users divided by the share among all users
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
#Sort these recommendations
#use pandas sort (.sort_values) and rank in descending order
rec_percentages = rec_percentages.sort_values("score", ascending=False)
#rec_percentages

#take our top 10 recs and merge them with movies => get the title
rec_percentages.head(10).merge(movies,left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [110]:
#Building a recommendation function
def find_similar_movie(movie_id, min_rating=4, min_share=.1, top_n=10):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    Uses the module-level ``ratings`` (userId, movieId, rating) and ``movies``
    (movieId, title, genres) frames.

    Steps:
      1. "Similar users" are the users who rated ``movie_id`` above ``min_rating``.
      2. Candidate movies are those rated above ``min_rating`` by more than
         ``min_share`` of the similar users.
      3. For each candidate, the same share is computed over every user who
         rated at least one candidate above ``min_rating``.
      4. ``score`` = share among similar users / share among those users;
         a high score means the movie is specific to this audience rather
         than simply popular.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating counts as a "like" when it is strictly greater than this.
    min_share : float, default 0.1
        Minimum fraction of similar users that must like a candidate
        (strictly greater than).
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title``, ``genres``, highest score first.
        Empty when no user rated ``movie_id`` above ``min_rating``.
    """
    #filter the high ratings once; every lookup below reuses this frame
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked[liked["movieId"] == movie_id]["userId"].unique()
    if len(similar_users) == 0:
        #nobody liked this movie: nothing to base recommendations on
        return pd.DataFrame(columns=["score", "title", "genres"])

    #share of similar users who liked each movie
    similar_user_recs = liked[liked["userId"].isin(similar_users)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    #share of users (among those who liked any candidate) who liked each candidate
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    #both Series are indexed by movieId, so concat aligns them row by row
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    #attach title and genres by matching the movieId index to movies.movieId
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title","genres"]]


In [111]:
#Creating an interactive recommendation widget
#input widget: text box where the movie title is typed
movie_input_name = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled=False
)

#output widget: area where the recommendations are rendered
recommendation_list = widgets.Output()

def on_type(data):
    """Show recommendations for the movie that best matches the typed title.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry holds the text currently in the box. The first row returned by
    ``search`` is taken as the matched movie, and its ``movieId`` is passed to
    ``find_similar_movie``. Queries of five characters or fewer only clear
    the previous output.

    NOTE: this reuses the name ``on_type`` from the search-box cell. The
    earlier widget keeps the function object it was registered with, but the
    name now refers to this function.
    """
    with recommendation_list:
        #start from an empty output area on every change
        recommendation_list.clear_output()
        query = data["new"]
        #too short to search: leave the output area empty
        if len(query) <= 5:
            return
        matches = search(query)
        best_movie_id = matches.iloc[0]["movieId"]
        recommendations = find_similar_movie(best_movie_id)
        display(recommendations)
            
#Call on_type whenever the text in the recommendation box changes
movie_input_name.observe(on_type, names="value")

#display the input box with the recommendation output below it
display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()