In [1]:
import re
import unicodedata

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ipywidgets as widgets
from IPython.display import display

In [2]:
# Load the movie catalogue and the user ratings in one pass; the tuple order
# below matches the order of the file paths.
movies_df, ratings_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/movies.csv', './data/ratings.csv')
)

In [3]:
# Preview the first three rows of the movie catalogue.
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [24]:
# Interpret the raw 'timestamp' values as seconds since the Unix epoch
# (unit='s') and overwrite the column with the resulting pandas datetimes.
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
# Preview the first three ratings with the converted timestamps.
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04


In [5]:
def clean_title(title):
    """Normalise a movie title for text matching.

    Accented letters are folded to their unaccented base letter, then every
    character that is not an ASCII letter, an ASCII digit or a space is
    removed. Pure-ASCII input is handled exactly as before.

    Parameters
    ----------
    title : str
        Raw title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The cleaned title, e.g. ``"Toy Story 1995"``.
    """
    # NFKD splits an accented letter into its base letter plus a combining
    # mark. The filter below then drops only the mark, so the letter survives
    # instead of being deleted outright.
    decomposed = unicodedata.normalize("NFKD", title)
    return re.sub("[^a-zA-Z0-9 ]", "", decomposed)

# Store the normalised title next to the original so it can be vectorised.
movies_df['clean_title'] = movies_df['title'].map(clean_title)

In [6]:
# Preview the first three rows again to check the new 'clean_title' column.
movies_df.head(3)

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995


In [15]:
# TF-IDF over single words and adjacent word pairs, with scikit-learn's
# built-in English stop-word list applied.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)

In [16]:
# Learn the vocabulary from the cleaned titles and encode every title as one
# row of the resulting matrix (row i corresponds to row i of movies_df).
tfidf = vectorizer.fit_transform(movies_df['clean_title'])

In [25]:
# Inspect the container type and dimensions of the encoded titles.
print(type(tfidf))
print(tfidf.shape)  # (number of movies, number of distinct unigrams + bigrams kept by the vectorizer)

<class 'scipy.sparse._csr.csr_matrix'>
(9742, 29930)


In [None]:
# How a movie title is represented in the TF-IDF matrix
# from scipy.sparse import csr_matrix

# # Create a 2D array
# arr = [[0, 0, 5, 0, 0, 0], [0, 0, 0, 0, 11, 0, ], [0, 0, 0, 0, 0, 20]]

# # Convert the 2D array to a CSR matrix
# matrix = csr_matrix(arr)

# print(matrix)

In [19]:
def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, encoded with the notebook-level
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text query typed by the user.
    top_n : int, default 5
        Maximum number of rows to return. Clamped to the number of titles in
        the corpus.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies_df`` ordered by descending
        similarity (empty if ``top_n`` is not positive or the corpus is empty).
    """
    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more elements than exist, so clamp.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies_df.iloc[[]]

    # argpartition selects the k highest scores in linear time but leaves them
    # in arbitrary order, so rank just those k candidates explicitly.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies_df.iloc[ranked]

In [26]:
# type(similarity)
# similarity

# Free-text box in which the user types a movie title.
movie_input = widgets.Text(
    description='Movie Title:',
    placeholder='Enter a movie title',
    disabled=False,
)

# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the result list from a change event of the text box.

    ``data['new']`` is read as the current text. The output area is always
    cleared; results are shown only when the text is longer than five
    characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed_text = data['new']
        if len(typed_text) <= 5:
            return
        display(search(typed_text))

# Register on_type as the observer for changes to the text box's 'value'.
movie_input.observe(on_type, names='value')

# Render the text box followed by the results area.
display(movie_input, movie_list)

Text(value='', description='Movie Title:', placeholder='Enter a movie title')

Output()

In [47]:
# Seed movie for the user-based recommendation step.
movie_id = 1

# Distinct users who rated the seed movie 5.0 or higher.
similar_users = ratings_df.loc[
    ratings_df['movieId'].eq(movie_id) & ratings_df['rating'].ge(5.0),
    'userId',
].unique()
similar_users

array([ 31,  40,  43,  46,  57,  63,  71,  96, 145, 151, 166, 171, 177,
       201, 206, 220, 229, 234, 240, 247, 269, 270, 273, 275, 304, 328,
       341, 347, 353, 357, 364, 367, 380, 389, 396, 411, 448, 451, 453,
       456, 471, 533, 559, 573, 584, 587, 610])

In [57]:
# Every movieId that one of the similar users rated 4.0 or higher.
similar_user_recs = ratings_df.loc[
    ratings_df['userId'].isin(similar_users) & ratings_df['rating'].ge(4.0),
    'movieId',
]

# Count how many of those ratings each movie received, then show the five
# most frequent.
movie_recs = similar_user_recs.value_counts()
movie_recs.head(5)

1      47
296    25
260    23
593    22
588    22
Name: movieId, dtype: int64