In [None]:
# Dataset retrieved from https://files.grouplens.org

In [109]:
# Read the movie metadata into a DataFrame (expects movies.csv in the working directory)
import pandas as pd

movies = pd.read_csv("movies.csv")

In [110]:
# Cleaning movies with regex
import re

def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Used to normalise both the stored titles and the user's query so that
    punctuation does not get in the way of matching.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [111]:
# Store a punctuation-free copy of every title next to the original
movies["clean_title"] = movies["title"].map(clean_title)

In [112]:
# Creating a tfidf matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Use single words and two-word phrases (ngram_range=(1,2)) as features
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit on the cleaned titles; row i of the resulting matrix describes row i of `movies`
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [113]:
# Creating a search function with sklearn using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Args:
        title: Free-text movie title to look for.
        top_n: Maximum number of matches to return (defaults to 5).

    Returns:
        The rows of ``movies`` for the ``top_n`` most similar titles, ordered
        from most to least similar, so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises when kth is out of bounds.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the top_n largest scores land in the last
    # top_n slots, not that those slots are sorted, so rank them explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]

In [114]:
# Building an interactive search box with Jupyter
import ipywidgets as widgets
from IPython.display import display

# Text box for the title to search for, pre-filled with an example query
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False
)

# Output widget that the search results are rendered into
movie_list = widgets.Output()

# When anything is typed
def on_type(data):
    """Refresh the search results when the text box value changes.

    ``data`` is the change notification from the widget; its ``"new"`` entry
    holds the current text. Queries of five characters or fewer only clear
    the previous results.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))
# Run on_type every time the text box's value changes
movie_input.observe(on_type,names='value')

# Show the text box together with its results area
display(movie_input,movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [115]:
# Load the user ratings. No earlier cell defines `ratings`, so without this read the
# notebook fails here on a fresh kernel (Restart & Run All).
# NOTE(review): file name assumed to be "ratings.csv", stored next to movies.csv — confirm.
ratings = pd.read_csv("ratings.csv")
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [116]:
# Example movie for the step-by-step walkthrough (movieId 1 is "Toy Story (1995)" in the merged table further down)
movie_id=1

In [117]:
# First step of building the recommendation system: find the users who liked the same movie
# (i.e. rated it above 4)
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

In [118]:
# IDs of the users who rated the example movie above 4
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533])

In [119]:
# Every movie those similar users rated above 4 (one entry per rating)
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [120]:
# Movie IDs liked by the similar users; a movie appears once per user who liked it
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [121]:
# Step 2: Find only the movies that more than 10% of the users similar to us liked.
# Preview the share of similar users who liked each movie. The result is deliberately
# not assigned: `similar_user_recs` must still hold the raw movie IDs for the
# value_counts() call in a later cell, and filtering the raw IDs by `> .1` would
# compare movie IDs (not shares) with the threshold and keep every row.
(similar_user_recs.value_counts() / len(similar_users)).head()

In [122]:
# Check what similar_user_recs holds at this point
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [123]:
# Turn the liked-movie IDs into "share of similar users who liked this movie"
# and keep only the movies liked by more than 10% of them.
similar_user_recs = (
    similar_user_recs.value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .1]
)

In [124]:
# Shortlisted movies with the share of similar users who liked each one
similar_user_recs

movieId
1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: count, Length: 113, dtype: float64

In [125]:
# Every above-4 rating, from any user, for the shortlisted movies
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [126]:
# Share of those users (anyone who liked at least one shortlisted movie) who liked each movie
all_users_recs = all_users["movieId"].value_counts().div(len(all_users["userId"].unique()))

In [127]:
# Step 3: Creating a recommendation score. Start by placing both shares side by side,
# aligned on movieId, with one column per audience.
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1).set_axis(
    ["similar", "all"], axis=1
)

In [128]:
# Score = share among similar users relative to the share among all users considered
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [129]:
# Put the highest-scoring movies first
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [130]:
# Score table, best-scoring movie first
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [131]:
# Attach title and genre information to the ten best-scoring movies
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [132]:
# Final Step: Building a recommendation function
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A user "likes" a movie when their rating is strictly above ``min_rating``.
    Each candidate movie is scored as the share of similar users who liked it
    divided by the share of all considered users who liked it.

    Args:
        movie_id: ``movieId`` of the movie to base the recommendations on.
        min_rating: Ratings must be strictly greater than this to count as a like.
        min_share: A candidate is kept only when more than this fraction of the
            similar users liked it.
        top_n: Number of recommendations to return.

    Returns:
        DataFrame with the columns ``score``, ``title`` and ``genres`` for the
        ``top_n`` best-scoring movies, highest score first. The requested movie
        is not excluded from the result.
    """
    # Filter the ratings table once; every step below only looks at liked ratings.
    liked = ratings[ratings["rating"] > min_rating]

    # Users who liked the requested movie.
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()

    # Share of those users who liked each other movie, trimmed to the popular ones.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of everyone who liked any shortlisted movie, per shortlisted movie.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A score above 1 means the movie is liked more often by similar users.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    top_recs = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top_recs[["score", "title", "genres"]]

In [133]:
# Creating an interactive recommendation widget

In [134]:
# Text box for the movie to base the recommendations on, pre-filled with an example
movie_name_input = widgets.Text(
    value = "Toy Story",
    description="Movie Title:",
    disabled=False
)

# Output widget that the recommendations are rendered into
reccomendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations when the text box value changes.

    ``data`` is the change notification from the widget; its ``"new"`` entry
    holds the current text. Queries of five characters or fewer only clear the
    previous output. Otherwise the first search result is taken as the movie
    to recommend from.

    Note: this reuses the name ``on_type`` from the search widget's callback
    defined earlier in the notebook.
    """
    with reccomendation_list:
        reccomendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        first_match = search(query).iloc[0]
        display(find_similar_movies(first_match["movieId"]))

# Run on_type every time the text box's value changes
movie_name_input.observe(on_type, names="value")

In [135]:
# Show the text box together with the recommendations area
display(movie_name_input, reccomendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()