## Imports

In [29]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ipywidgets as widgets
from IPython.display import display

## Read the data

In [4]:
movies_df = pd.read_csv("../data/movies.csv")

In [112]:
movies_df.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


## Preprocesing the data

### Cleaning movie titles with regex

In [7]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [8]:
movies_df["clean_title"] = movies_df["title"].apply(clean_title)

In [9]:
movies_df.head(5)

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


## Creating a TFIDF matrix

We need to convert the titles into sets of numbers so the computer can understand them

In [13]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

tfidf = vectorizer.fit_transform(movies_df["clean_title"])

## Creating a search function

We need to compute the similary between a term we enter and all of the movies in our list. For that we'll use the cosine similarity

In [25]:
def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten() # Comparing our query term to each of the titles in the list
    # We need to find the titles with the greatest similarity to our search term
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies_df.iloc[indices][::-1]
    
    return results

In [26]:
result = search("Toy Story 1995")

In [27]:
result

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


## Building an interactive search box with Jupyter

In [38]:
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False
)

movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names="value")

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

## Reading in movie ratings data

Let's find movies that are similar to the movie we like

In [45]:
ratings = pd.read_csv("../data/ratings.csv")

In [46]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [47]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

## Finding users who liked the same movie

In [48]:
movie_id = 1

In [51]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()

In [52]:
similar_users

array([     3,      5,      8, ..., 162530, 162533, 162534])

In [84]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [85]:
similar_user_recs

255             29
256             32
257             50
261            214
263            293
             ...  
24999248    101962
24999269    109487
24999326    164179
24999329    165549
24999348    177593
Name: movieId, Length: 2321248, dtype: int64

In [86]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [87]:
similar_user_recs

1       0.499483
318     0.421226
260     0.367817
296     0.353337
356     0.322708
          ...   
1148    0.103609
1527    0.102867
4995    0.102522
778     0.102495
34      0.100162
Name: movieId, Length: 90, dtype: float64

## Finding how much all users like movies

In [88]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [89]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [90]:
all_user_recs

318     0.345497
296     0.287399
2571    0.246370
356     0.237518
593     0.228071
          ...   
3114    0.054220
2716    0.053892
34      0.052729
1073    0.049232
1148    0.047922
Name: movieId, Length: 90, dtype: float64

## Creating a recommendation score

In [91]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [92]:
rec_percentages

Unnamed: 0,similar,all
1,0.499483,0.125923
318,0.421226,0.345497
260,0.367817,0.224334
296,0.353337,0.287399
356,0.322708,0.237518
...,...,...
1148,0.103609,0.047922
1527,0.102867,0.066762
4995,0.102522,0.076403
778,0.102495,0.075473


In [93]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [94]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [95]:
rec_percentages

Unnamed: 0,similar,all,score
1,0.499483,0.125923,3.966586
3114,0.170357,0.054220,3.141967
4886,0.166645,0.071489,2.331060
6377,0.166565,0.072960,2.282977
1073,0.111591,0.049232,2.266621
...,...,...,...
58559,0.180461,0.147871,1.220392
318,0.421226,0.345497,1.219189
4973,0.136148,0.113481,1.199744
2959,0.252380,0.218792,1.153517


In [96]:
rec_percentages.head(10).merge(movies_df, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,0.499483,0.125923,3.966586,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.170357,0.05422,3.141967,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
4780,0.166645,0.071489,2.33106,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
6258,0.166565,0.07296,2.282977,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
1047,0.111591,0.049232,2.266621,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
8246,0.154207,0.069109,2.231373,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
580,0.151449,0.068159,2.221989,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
1120,0.103609,0.047922,2.162033,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,Wallace Gromit The Wrong Trousers 1993
359,0.18473,0.086585,2.133522,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
587,0.12806,0.060551,2.1149,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991


## Building a recommendation function

In [100]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    
    return rec_percentages.head(10).merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

## Creating an interactive recommendation widget

In [83]:
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [101]:
find_similar_movies(1) # Toy Story (1995)

Unnamed: 0,score,title,genres
0,8.017414,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3021,5.225654,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2264,4.405452,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
14813,4.354038,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
4780,3.320783,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
580,3.208539,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
6258,3.156862,Finding Nemo (2003),Adventure|Animation|Children|Comedy
587,2.99115,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
8246,2.972889,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy
359,2.954762,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX


In [102]:
find_similar_movies(2) # Jumanji (1995)

Unnamed: 0,score,title,genres
1,57.008249,Jumanji (1995),Adventure|Children|Fantasy
156,18.757121,Casper (1995),Adventure|Children
313,14.88039,"Santa Clause, The (1994)",Comedy|Drama|Fantasy
578,9.382034,Home Alone (1990),Children|Comedy
495,8.71198,Mrs. Doubtfire (1993),Comedy|Drama
362,8.666058,"Mask, The (1994)",Action|Comedy|Crime|Fantasy
2526,7.959267,"Mummy, The (1999)",Action|Adventure|Comedy|Fantasy|Horror|Thriller
721,7.539414,Twister (1996),Action|Adventure|Romance|Thriller
579,6.375923,Ghost (1990),Comedy|Drama|Fantasy|Romance|Thriller
312,6.231007,Stargate (1994),Action|Adventure|Sci-Fi


In [111]:
find_similar_movies(181599) # The Devil's Woods (2015)

Unnamed: 0,score,title,genres
25778,117765.0,The Fabulous Dorseys (1947),Drama
57244,117765.0,Maximum Impact (2017),Action|Comedy
50768,117765.0,Mama Dracula (1980),Comedy|Horror|Thriller
50732,117765.0,Inoperable,Horror
44991,117765.0,The Strange Woman (1946),Drama
45149,117765.0,Zombie Fight Club (2014),Action|Horror
45148,117765.0,Bigfoot vs. Zombies (2016),Horror
45145,117765.0,The Zombie King (2013),Comedy|Horror
45116,117765.0,Don't Hang Up (2017),Horror|Thriller
45011,117765.0,Mercury Man (2006),Action|Adventure
