In [69]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ipywidgets as widgets
from IPython.display import display

In [70]:
movies_df = pd.read_csv('./data/movies.csv')
ratings_df = pd.read_csv('./data/ratings.csv')

In [71]:
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [72]:
# ratings_df.info()
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04


In [73]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

movies_df['clean_title'] = movies_df['title'].apply(clean_title)

In [74]:
movies_df.head(3)

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995


In [75]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

In [76]:
tfidf = vectorizer.fit_transform(movies_df['clean_title'])

In [77]:
print(type(tfidf))
print(tfidf.shape)  # Number of movies x Number of unique words

<class 'scipy.sparse._csr.csr_matrix'>
(9742, 29930)


In [78]:
# How a movie title is represented in the TF-IDF matrix
# from scipy.sparse import csr_matrix

# # Create a 2D array
# arr = [[0, 0, 5, 0, 0, 0], [0, 0, 0, 0, 11, 0, ], [0, 0, 0, 0, 0, 20]]

# # Convert the 2D array to a CSR matrix
# matrix = csr_matrix(arr)

# print(matrix)

In [79]:
def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    similar_movies = movies_df.iloc[indices][::-1]
    return similar_movies

In [80]:
# type(similarity)
# similarity

movie_input = widgets.Text(
    placeholder='Enter a movie title',
    description='Movie Title:', 
    disabled=False)

movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        # print(data)
        title = data['new']
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='', description='Movie Title:', placeholder='Enter a movie title')

Output()

In [81]:
movie_id = 1
# ratings_df[(ratings_df['movieId'] == movie_id) & (ratings_df['rating'] >= 5.0)]['userId'].count()
similar_users = ratings_df[(ratings_df['movieId'] == movie_id) & (ratings_df['rating'] >= 5.0)]['userId'].unique()
similar_users

array([ 31,  40,  43,  46,  57,  63,  71,  96, 145, 151, 166, 171, 177,
       201, 206, 220, 229, 234, 240, 247, 269, 270, 273, 275, 304, 328,
       341, 347, 353, 357, 364, 367, 380, 389, 396, 411, 448, 451, 453,
       456, 471, 533, 559, 573, 584, 587, 610])

In [83]:
similar_user_recs = ratings_df[ratings_df['userId'].isin(similar_users) & (ratings_df['rating'] >= 4.0)]['movieId']
movie_recs = similar_user_recs.value_counts() / len(similar_users)
movie_recs = movie_recs[movie_recs >= 0.4]
movie_recs 

1       1.000000
296     0.531915
260     0.489362
593     0.468085
588     0.468085
318     0.446809
364     0.425532
480     0.425532
356     0.425532
50      0.404255
1198    0.404255
Name: movieId, dtype: float64

In [84]:
all_users = ratings_df[(ratings_df['movieId'].isin(similar_user_recs.index)) & (ratings_df['rating'] > 4.0)]
# all_users

In [85]:
all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())
all_users_recs 

6377      0.226852
5618      0.194444
68954     0.180556
4886      0.162037
4896      0.134259
            ...   
4902      0.004630
6335      0.004630
4888      0.004630
4920      0.004630
100159    0.004630
Name: movieId, Length: 131, dtype: float64

In [87]:
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ['similar_users', 'all_users']
# rec_percentages = rec_percentages.fillna(0)
rec_percentages['score'] = rec_percentages['similar_users'] / rec_percentages['all_users']
rec_percentages.sort_values('score', ascending=False)[:10]

Unnamed: 0,similar_users,all_users,score
52458,135887,0.00463,29351592.0
100611,101525,0.00463,21929400.0
25769,99007,0.00463,21385512.0
100507,86898,0.00463,18769968.0
88932,66097,0.00463,14276952.0
100553,93840,0.009259,10134720.0
100163,40278,0.00463,8700048.0
100159,39381,0.00463,8506296.0
100383,69275,0.009259,7481700.0
100083,26547,0.00463,5734152.0
