In [1]:
import pandas as pd
import re

movies = pd.read_csv("./ml-25m/movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [2]:
#clean the movie title 
def clean_movietitle(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [3]:
movies["clean_movie_title"] = movies['title'].apply(clean_movietitle)
movies

Unnamed: 0,movieId,title,genres,clean_movie_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [4]:
#turning titles into numbers

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies["clean_movie_title"])

In [5]:
#display the most closest searches

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def movie_search(movie_title):
#     movie_title = "Toy Story 1995"
    # Clean the provided movie title using the function as defined above
    movie_title = clean_movietitle(movie_title)
    # Transform the cleaned title into a TF-IDF vector using the previously initialized vectorizer
    query_vector = vectorizer.transform([movie_title])
    # Calculate cosine similarity between the query vector and all movie vectors in the TF-IDF matrix
    movie_similarity  = cosine_similarity(query_vector, tfidf).flatten()
    # Find the indices of the top 5 most similar movies
    movie_indices = np.argpartition(movie_similarity, -5)[-5:]
    # Get the movie data for these indices and reverse the order
    results = movies.iloc[movie_indices][::-1]
    # Return the results
    return results


In [6]:
my_results = movie_search("Toy Story 1995")
my_results

Unnamed: 0,movieId,title,genres,clean_movie_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [7]:
#Building the Search Engine

import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value = "Toy Story"
    , description = "Movie Title:"
    , disabled = False
)

movie_output = widgets.Output()

def on_type(data):
    with movie_output:
        movie_output.clear_output()
        title = data['new']
        if len(title) > 5:
            display(movie_search(title))

movie_input.observe(on_type, names='value')
display(movie_input, movie_output)

Text(value='Toy Story', description='Movie Title:')

Output()

In [8]:
#Find movies that are similar to a movie we like

ratings = pd.read_csv("./ml-25m/ratings.csv")
ratings.head()
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [9]:
#sample movie id
movie_id = 1

In [10]:
#find users who also liked the movies that we typed in and gave it a 4star upwards and also the othere movies they liked
same_interest_movieuser = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()
same_interest_movieuser

array([     3,      5,      8, ..., 162530, 162533, 162534])

In [11]:
#now find the users who liked the same movie as us and find any movies they rated 4star above
same_userinterest_movierecs = ratings[(ratings["userId"].isin(same_interest_movieuser)) & (ratings["rating"]>4)]["movieId"]
same_userinterest_movierecs

255             29
256             32
257             50
261            214
263            293
             ...  
24999248    101962
24999269    109487
24999326    164179
24999329    165549
24999348    177593
Name: movieId, Length: 2321248, dtype: int64

In [12]:
same_userinterest_movierecs.value_counts()
#here i can see that about 13K+ users like movie id 1, 5599 users like movieID 318

1         18835
318       15884
260       13870
296       13324
356       12169
          ...  
59290         1
44317         1
188811        1
188685        1
88934         1
Name: movieId, Length: 22464, dtype: int64

In [13]:
#find only the movies that is greater than 10% liked 

same_userinterest_movierecs.value_counts()/len(same_interest_movieuser)


1         0.499483
318       0.421226
260       0.367817
296       0.353337
356       0.322708
            ...   
59290     0.000027
44317     0.000027
188811    0.000027
188685    0.000027
88934     0.000027
Name: movieId, Length: 22464, dtype: float64

In [14]:
#check only the top 10 percent of the above (thise movies that they also liked)
same_userinterest_movierecs_10percent = same_userinterest_movierecs[same_userinterest_movierecs > 0.1]
same_userinterest_movierecs_10percent

255             29
256             32
257             50
261            214
263            293
             ...  
24999248    101962
24999269    109487
24999326    164179
24999329    165549
24999348    177593
Name: movieId, Length: 2321248, dtype: int64

In [15]:
#only 92 Movies of users who are similar to me liked

same_userinterest_movierecs = same_userinterest_movierecs.value_counts()/len(same_interest_movieuser)
same_userinterest_movierecs = same_userinterest_movierecs[same_userinterest_movierecs > .1]
same_userinterest_movierecs

1       0.499483
318     0.421226
260     0.367817
296     0.353337
356     0.322708
          ...   
1148    0.103609
1527    0.102867
4995    0.102522
778     0.102495
34      0.100162
Name: movieId, Length: 90, dtype: float64

In [16]:
#Find how much the users like the movies
#Out of the same_userinterest_movierecs (92 users), how many of these users that is are in our set of the  same_userinterest_movierecs
#users who have watched movies that were recommended to us
all_users = ratings[(ratings["movieId"].isin(same_userinterest_movierecs.index)) & (ratings["rating"] > 4)]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000058,162541,4995,5.0,1240951903
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613


In [17]:
#what percentage of these users recommends this movies that are in similar movies recs
all_users_recs = all_users["movieId"].value_counts()/len(all_users["userId"].unique())
all_users_recs

318     0.345497
296     0.287399
2571    0.246370
356     0.237518
593     0.228071
          ...   
3114    0.054220
2716    0.053892
34      0.052729
1073    0.049232
1148    0.047922
Name: movieId, Length: 90, dtype: float64

In [18]:
#Creating a Recommendation Score
#Compare Percentages
combined_percentages = pd.concat([same_userinterest_movierecs, all_users_recs], axis=1)
combined_percentages.columns = ["similar", "all"]

#similar: how many users that are similar to us like the movie
#all: all user that liked the movies



In [19]:
combined_percentages

Unnamed: 0,similar,all
1,0.499483,0.125923
318,0.421226,0.345497
260,0.367817,0.224334
296,0.353337,0.287399
356,0.322708,0.237518
...,...,...
1148,0.103609,0.047922
1527,0.102867,0.066762
4995,0.102522,0.076403
778,0.102495,0.075473


In [20]:
#score
combined_percentages["score"] = combined_percentages["similar"]/combined_percentages["all"]
combined_percentages = combined_percentages.sort_values("score", ascending=False)

In [21]:
combined_percentages

Unnamed: 0,similar,all,score
1,0.499483,0.125923,3.966586
3114,0.170357,0.054220,3.141967
4886,0.166645,0.071489,2.331060
6377,0.166565,0.072960,2.282977
1073,0.111591,0.049232,2.266621
...,...,...,...
58559,0.180461,0.147871,1.220392
318,0.421226,0.345497,1.219189
4973,0.136148,0.113481,1.199744
2959,0.252380,0.218792,1.153517


In [22]:
#Take the top 10 recommendations and merge it to the movies on the right using the movieID
combined_percentages.head(10).merge(movies, left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,clean_movie_title
0,0.499483,0.125923,3.966586,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.170357,0.05422,3.141967,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
4780,0.166645,0.071489,2.33106,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
6258,0.166565,0.07296,2.282977,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
1047,0.111591,0.049232,2.266621,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
8246,0.154207,0.069109,2.231373,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
580,0.151449,0.068159,2.221989,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
1120,0.103609,0.047922,2.162033,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,Wallace Gromit The Wrong Trousers 1993
359,0.18473,0.086585,2.133522,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
587,0.12806,0.060551,2.1149,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991


In [23]:
#Building a Recommendation Function
def find_similar_movies(movie_id):
    #find movies recommendations of  users similar to us
    same_interest_movieuser = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    same_userinterest_movierecs = ratings[(ratings["userId"].isin(same_interest_movieuser)) & (ratings["rating"]>4)]["movieId"]
    
    #find movies recommendations of only 10% of these users similar to us
    same_userinterest_movierecs = same_userinterest_movierecs.value_counts()/len(same_interest_movieuser)
    same_userinterest_movierecs = same_userinterest_movierecs[same_userinterest_movierecs > .10]
    
    all_users = ratings[(ratings["movieId"].isin(same_userinterest_movierecs.index)) & (ratings["rating"] > 4)]
    all_users_recs = all_users["movieId"].value_counts()/len(all_users["userId"].unique())
    
    #now getting the score
    combined_percentages = pd.concat([same_userinterest_movierecs, all_users_recs], axis=1)
    combined_percentages.columns = ["similar", "all"]
    
    combined_percentages["score"] = combined_percentages["similar"]/combined_percentages["all"]
    combined_percentages = combined_percentages.sort_values("score", ascending=False)
    
    return combined_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]



In [24]:
#Building the widget to do it automatically

#Building the Search Engine

# import ipywidgets as widgets
# from IPython.display import display

movie_name_input = widgets.Text(
    value = "Toy Story"
    , description = "Movie Title:"
    , disabled = False
)

movie_recommendation_list = widgets.Output()

def on_type(data):
    with movie_recommendation_list:
        movie_recommendation_list.clear_output()
        title = data['new']
        if len(title) > 5:
            #search over the title that was typed in
            results = movie_search(title)
            #extract the movie ID
            movie_id = results.iloc[0]["movieId"]
            #Find the similar movies
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')
display(movie_name_input, movie_recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()