In [38]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [39]:
links = pd.read_csv('../lecture-1/links.csv')
movies = pd.read_csv('../lecture-1/movies.csv')
ratings = pd.read_csv('../lecture-1/ratings.csv')
tags = pd.read_csv('../lecture-1/tags.csv')

In [40]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [41]:
movies_with_ratings.head(-1)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,9.649827e+08
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,8.474350e+08
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1.106636e+09
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1.510578e+09
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1.305696e+09
...,...,...,...,...,...,...
100848,193579,Jon Stewart Has Left the Building (2015),Documentary,184.0,3.5,1.537107e+09
100849,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184.0,4.0,1.537109e+09
100850,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184.0,3.5,1.537110e+09
100851,193585,Flint (2017),Drama,184.0,3.5,1.537110e+09


In [42]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [43]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [44]:
trainset, testset = train_test_split(data, test_size=0.2)

In [45]:
algo = SVD(n_factors=15, n_epochs=10)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a225204d0>

In [46]:
test_pred = algo.test(testset)

In [47]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8715


0.871518667022858

In [48]:
algo.predict(uid=184, iid='No Game No Life: Zero (2017)').est

3.611941505486304

In [49]:
current_user_id = 184
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()

scores = []   
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in user_movies:
        continue
        
    scores.append(algo.predict(uid=current_user_id, iid=movie).est)
    titles.append(movie)

In [50]:
sorted(scores)[-10:]

[4.351647753657753,
 4.361372871430738,
 4.363989219208977,
 4.3751655000579905,
 4.38611478764941,
 4.389986942715084,
 4.391196968699926,
 4.419292355492143,
 4.447798194046829,
 4.485551519293634]

In [51]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [52]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [55]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [56]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [57]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.6188388 , 0.62682864, 0.62682864]]),
 array([[6774, 9096, 3576,  863, 2302, 2608, 7865, 3582, 8361, 3302, 5737,
         6723, 5636, 3376, 7496, 5627, 9717, 2206, 6133, 5832]]))

In [58]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
3576,4899,Black Knight (2001),Adventure|Comedy|Fantasy
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
2302,3052,Dogma (1999),Adventure|Comedy|Fantasy
2608,3489,Hook (1991),Adventure|Comedy|Fantasy
7865,94015,Mirror Mirror (2012),Adventure|Comedy|Fantasy
3582,4911,Jabberwocky (1977),Adventure|Comedy|Fantasy
8361,109042,Knights of Badassdom (2013),Adventure|Comedy|Fantasy
3302,4467,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy


In [59]:
movies_with_ratings.sort_values('timestamp', inplace=True)

In [60]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [26]:
# movies_with_ratings[movies_with_ratings]

In [61]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [62]:
def recommend_for_user(user_id):
    current_user_id = user_id
    user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()
    
    last_user_movie = user_movies[-1]
    
    movie_genres = title_genres[last_user_movie]
    
    movie_genres = change_string(movie_genres)

    predict = count_vect.transform([movie_genres])
    X_tfidf2 = tfidf_transformer.transform(predict)

    res = neigh.kneighbors(X_tfidf2, return_distance=True)
    
    movies_to_score = movies.iloc[res[1][0]].title.values

    scores = []
    titles = []

    for movie in movies_to_score:
        if movie in user_movies:
            continue

        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)
        
    
    best_indexes = np.argsort(scores)[-10:]
    for i in reversed(best_indexes):
        print(titles[i], scores[i])

In [63]:
movies_with_ratings[movies_with_ratings.userId == 184].sort_values('rating')

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
100777,184641,Fullmetal Alchemist 2018 (2017),Action|Adventure|Fantasy,184.0,0.5,1.537095e+09
100565,175475,The Emoji Movie (2017),Animation|Children|Comedy,184.0,0.5,1.537110e+09
100185,165635,The Thinning (2016),Thriller,184.0,1.0,1.537108e+09
97915,117895,Maze Runner: Scorch Trials (2015),Action|Thriller,184.0,1.0,1.537107e+09
100763,183911,The Clapper (2018),Comedy,184.0,1.5,1.537109e+09
...,...,...,...,...,...,...
99592,148626,"Big Short, The (2015)",Drama,184.0,5.0,1.537108e+09
100487,172705,Tickling Giants (2017),Documentary,184.0,5.0,1.537108e+09
100124,164179,Arrival (2016),Sci-Fi,184.0,5.0,1.537108e+09
96405,106782,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama,184.0,5.0,1.537099e+09


In [64]:
recommend_for_user(184)

Hunt for Red October, The (1990) 4.0187871231835945
Goldfinger (1964) 3.9504283419500386
Live and Let Die (1973) 3.8960781913232005
Dr. No (1962) 3.889132820370203
From Russia with Love (1963) 3.8314074372636355
Mission: Impossible III (2006) 3.729711853655254
Thunderball (1965) 3.7148464770845413
Mission: Impossible - Rogue Nation (2015) 3.6993848081289142
For Your Eyes Only (1981) 3.6868513536575214
Raiders of the Lost Ark: The Adaptation (1989) 3.6718675026030083
