In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ipywidgets as widgets
from IPython.display import display

In [2]:
# Dataset from https://grouplens.org/datasets/movielens/   University of Minnesota
# Load the MovieLens movie metadata and the user ratings from local CSV files
# (both paths are relative to the notebook's working directory).
movies_df = pd.read_csv('./data/movies.csv')
ratings_df = pd.read_csv('./data/ratings.csv')

In [3]:
# Preview the first three rows of the movie metadata.
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [4]:
# Inspect the timestamp column as loaded from the CSV.
ratings_df['timestamp']

0          964982703
1          964981247
2          964982224
3          964983815
4          964982931
             ...    
100831    1493848402
100832    1493850091
100833    1494273047
100834    1493846352
100835    1493846415
Name: timestamp, Length: 100836, dtype: int64

In [5]:
# ratings_df.info()
# Interpret the integer timestamps as a number of seconds (unit='s') and store
# them back in the same column as pandas datetimes, then preview the result.
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04


In [12]:
def clean_title(title):
    """Return `title` stripped of everything except ASCII letters, digits and spaces."""
    disallowed_chars = re.compile("[^a-zA-Z0-9 ]")
    return disallowed_chars.sub("", title)

# Store a punctuation-free copy of every title next to the original one.
movies_df['clean_title'] = movies_df['title'].map(clean_title)

In [13]:
# Preview the first three rows again, now including the clean_title column.
movies_df.head(3)

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995


In [14]:
# TF-IDF vectorizer for the cleaned titles: ngram_range=(1, 2) extracts single
# words as well as two-word sequences, and stop_words='english' applies
# scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

In [15]:
# Learn the vocabulary from the cleaned titles and encode every title as one TF-IDF row.
tfidf = vectorizer.fit_transform(movies_df['clean_title'])

In [16]:
# Inspect the TF-IDF result: its container type and its (rows, columns) shape.
print(type(tfidf))
print(tfidf.shape)  # Number of movie titles x number of vocabulary terms (single words and word pairs, per ngram_range=(1, 2))

<class 'scipy.sparse._csr.csr_matrix'>
(9742, 29930)


In [17]:
# How a movie title is represented in the TF-IDF matrix.
# The commented-out lines below sketch a small CSR (sparse) matrix example.
# from scipy.sparse import csr_matrix

# # Create a 2D array
# arr = [[0, 0, 5, 0, 0, 0], [0, 0, 0, 0, 11, 0], [0, 0, 0, 0, 0, 20]]

# The non-zero cells of `arr` written as (row, column, value) triples.
(0, 2, 5), (1, 4, 11), (2, 5, 20)

# # Convert the 2D array to a CSR matrix
# matrix = csr_matrix(arr)

# print(matrix)

((0, 2, 5), (1, 4, 11), (2, 5, 20))

In [18]:
def search(title, top_n=5):
    """Return the movies whose titles best match `title`, best match first.

    The query is normalised with `clean_title`, encoded with the fitted
    module-level `vectorizer`, and compared against every row of `tfidf`
    using cosine similarity. Row i of `tfidf` corresponds to row i of
    `movies_df`, so the winning positions are used to index `movies_df`.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `movies_df`, ordered by decreasing similarity.
        Empty when `top_n` is not positive or there are no movies.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()  # cosine_similarity returns a 2D array

    # Never ask for more rows than exist; argpartition raises on an out-of-range kth.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies_df.iloc[0:0]

    # argpartition only guarantees WHICH positions hold the top_n largest
    # scores, not their relative order, so the candidates are ranked explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies_df.iloc[ranked]

In [19]:
# Interactive title search: a text box plus an output area that shows the matches.
movie_list = widgets.Output()
movie_input = widgets.Text(
    description='Movie Title:',
    placeholder='Enter a movie title',
    disabled=False,
)

def on_type(data):
    """Refresh `movie_list` with the search results for the newly typed text."""
    with movie_list:
        movie_list.clear_output()
        typed_text = data['new']
        # Inputs of five characters or fewer are ignored; the area just stays cleared.
        if len(typed_text) <= 5:
            return
        display(search(typed_text))

# Register on_type so it is called each time the text box's `value` changes.
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='', description='Movie Title:', placeholder='Enter a movie title')

Output()

In [20]:
movie_id = 1
# IDs of the users who rated the chosen movie above 4.0, each listed once.
similar_users = ratings_df.loc[
    (ratings_df['movieId'] == movie_id) & (ratings_df['rating'] > 4.0),
    'userId',
].unique()
similar_users

array([  7,  17,  31,  40,  43,  46,  57,  63,  71,  73,  96,  98, 145,
       151, 159, 166, 169, 171, 177, 201, 206, 220, 229, 234, 240, 247,
       252, 254, 269, 270, 273, 275, 280, 282, 288, 304, 328, 341, 347,
       353, 357, 364, 367, 378, 380, 382, 389, 396, 411, 438, 448, 451,
       453, 456, 460, 471, 484, 488, 533, 559, 562, 573, 584, 587, 610])

In [21]:
# Look up the metadata row of the selected movie.
movies_df[movies_df['movieId'] == movie_id]

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995


In [22]:
# Movie IDs of every rating of 4.0 or higher given by one of the similar users.
similar_user_recs = ratings_df.loc[
    ratings_df['userId'].isin(similar_users) & (ratings_df['rating'] >= 4.0),
    'movieId',
]
# Counting the IDs and dividing by the number of similar users yields, per
# movie, the proportion of similar users who rated it that highly.
similar_user_recs = similar_user_recs.value_counts().div(len(similar_users))
# Keep only the movies where that proportion is above 20%.
similar_user_recs = similar_user_recs[similar_user_recs.gt(.2)]

In [23]:
# Show, per movie ID, the proportion of similar users who rated it highly.
similar_user_recs

1        1.000000
318      0.523077
356      0.523077
593      0.492308
296      0.476923
           ...   
4995     0.215385
2918     0.215385
2716     0.215385
33794    0.215385
49272    0.215385
Name: movieId, Length: 79, dtype: float64

In [24]:
# Every rating above 4.0, from any user, for a movie whose ID appears in the
# index of similar_user_recs (the movies recommended by the similar users).
all_users = ratings_df.loc[
    ratings_df['movieId'].isin(similar_user_recs.index) & ratings_df['rating'].gt(4.0)
]

In [25]:
# Per-movie count of those high ratings divided by the number of distinct
# users who gave them: how popular each recommended movie is in general.
all_users_recs = all_users['movieId'].value_counts().div(len(all_users['userId'].drop_duplicates()))
all_users_recs

318      0.365280
296      0.301989
356      0.280289
2571     0.271248
2959     0.235081
           ...   
1580     0.039783
733      0.037975
78499    0.037975
736      0.034358
500      0.030741
Name: movieId, Length: 79, dtype: float64

In [26]:
# Put both proportions side by side, one row per movie ID (the index of both Series).
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ['similar_users', 'all_users']
# rec_percentages = rec_percentages.fillna(0)   ### TODO: check the data for NaN values
# Drop the movies that have a missing value in either column.
rec_percentages.dropna(inplace=True)
rec_percentages

Unnamed: 0,similar_users,all_users
1,1.000000,0.117541
318,0.523077,0.365280
356,0.523077,0.280289
593,0.492308,0.231465
296,0.476923,0.301989
...,...,...
4995,0.215385,0.077758
2918,0.215385,0.057866
2716,0.215385,0.054250
33794,0.215385,0.068716


In [27]:
# score = proportion among similar users relative to the proportion among all
# users; the full table is then ordered from the highest score downwards.
rec_percentages['score'] = rec_percentages['similar_users'].div(rec_percentages['all_users'])
rec_final = rec_percentages.sort_values(by='score', ascending=False)

In [28]:
# Attach the movie metadata (matched on movieId) to the ten highest-scoring rows.
rec_final.head(10).merge(movies_df, left_index=True, right_on='movieId') ## [['score', 'title', 'genres']]

Unnamed: 0,similar_users,all_users,score,movieId,title,genres,clean_title
0,1.0,0.117541,8.507692,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
436,0.246154,0.030741,8.00724,500,Mrs. Doubtfire (1993),Comedy|Drama,Mrs Doubtfire 1993
2355,0.430769,0.059675,7.218648,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
7355,0.261538,0.037975,6.887179,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
594,0.215385,0.034358,6.268826,736,Twister (1996),Action|Adventure|Romance|Thriller,Twister 1996
1183,0.246154,0.039783,6.187413,1580,Men in Black (a.k.a. MIB) (1997),Action|Comedy|Sci-Fi,Men in Black aka MIB 1997
3873,0.246154,0.039783,6.187413,5445,Minority Report (2002),Action|Crime|Mystery|Sci-Fi|Thriller,Minority Report 2002
3568,0.369231,0.063291,5.833846,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
785,0.230769,0.039783,5.800699,1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,Mary Poppins 1964
546,0.261538,0.045208,5.785231,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,Mission Impossible 1996


In [29]:
def find_similar_movies(movie_id, top_n=10):
    """Recommend movies that fans of `movie_id` like more than users in general.

    Steps, all computed from the module-level `ratings_df`:

    1. "Similar users" are those who rated `movie_id` above 4.0.
    2. For every movie, compute the share of similar users who rated it
       4.0 or higher, and keep the movies above a 20% share.
    3. For those movies, compute the share of all users (who rated any of
       them above 4.0) that rated each one above 4.0.
    4. score = similar-user share / all-user share; higher means the movie
       is specifically popular among the similar users.

    Parameters
    ----------
    movie_id : int
        The `movieId` of the seed movie.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows with the columns `similar_users`, `all_users`,
        `score` plus the columns of `movies_df`, ordered by decreasing score.
    """
    similar_users = ratings_df[(ratings_df['movieId'] == movie_id) & (ratings_df['rating'] > 4.0)]['userId'].unique()

    # NOTE(review): this filter uses >= 4.0 while the other two use > 4.0 —
    # confirm the asymmetry is intended before changing it.
    similar_user_recs = ratings_df[ratings_df['userId'].isin(similar_users) & (ratings_df['rating'] >= 4.0)]['movieId']
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .2]

    all_users = ratings_df[(ratings_df['movieId'].isin(similar_user_recs.index)) & (ratings_df['rating'] > 4.0)]
    all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ['similar_users', 'all_users']
    # A movie that nobody rated above 4.0 has no all_users share (NaN); drop it
    # so it cannot appear in the result with a NaN score.
    rec_percentages = rec_percentages.dropna()

    rec_percentages = rec_percentages.assign(
        score=rec_percentages['similar_users'] / rec_percentages['all_users']
    )

    rec_final = rec_percentages.sort_values('score', ascending=False)
    return rec_final.head(top_n).merge(movies_df, left_index=True, right_on='movieId')


In [30]:
# Interactive recommender: a text box plus an output area for the recommendations.
movie_name_input = widgets.Text(
    placeholder='Enter a movie title',
    description='Movie Title:',
    disabled=False)

recommendation_list = widgets.Output()

def on_recommendation_type(change):
    """Refresh `recommendation_list` for the newly typed title.

    The first row returned by `search` is used as the seed movie for
    `find_similar_movies`. The callback has its own name so that it does not
    replace the `on_type` callback defined for the title-search widget.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = change['new']
        if len(title) > 5:
            results = search(title)
            # Nothing to recommend from when the search yields no rows.
            if results.empty:
                return
            movie_id = results.iloc[0]['movieId']
            display(find_similar_movies(movie_id))

# Call the handler each time the text box's `value` changes.
movie_name_input.observe(on_recommendation_type, names='value')

display(movie_name_input, recommendation_list)


Text(value='', description='Movie Title:', placeholder='Enter a movie title')

Output()