In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

from surprise import Reader
from surprise import KNNBasic
from surprise import Dataset
from surprise.model_selection import cross_validate

In [14]:
anime_df = pd.read_csv('./data/anime.csv')
rating_df = pd.read_csv('./data/rating.csv')

anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


### 1
To apply our recommendation system based by ratings given by the users, we need to drop the rows containing -1 value (films not rated by the users).
We are also going to merge two tables using anime_id. Thanks to that, we have the Anime title that can be displayed when recommending movies.

In [3]:
rating_df = rating_df[rating_df.rating >= 6]
rating_df

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [4]:
rating_df = rating_df[:10]

In [5]:
rating_df.describe()

Unnamed: 0,user_id,anime_id,rating
count,10.0,10.0,10.0
mean,2.1,5943.8,9.2
std,0.994429,6346.623893,1.316561
min,1.0,20.0,6.0
25%,1.0,177.25,9.0
50%,2.5,4149.5,10.0
75%,3.0,11722.0,10.0
max,3.0,15451.0,10.0


In [6]:
reader = Reader(rating_scale=(1,10))

data = Dataset.load_from_df(rating_df[['user_id', 'anime_id', 'rating']], reader)

In [7]:
data

<surprise.dataset.DatasetAutoFolds at 0x7fb7c0b99b50>

In [8]:
trainset = data.build_full_trainset()

In [9]:
sim_options = {
               'user_based': True  # compute  similarities between items
}

algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fb7c0b99c40>

### 2
KNN recommendation algorithm does not perform well when dealing with a dataset this magnitude. 
The algorithm works fast when dealing with a relatively small number of rows (< 10 000). Fitting the algorithm to a larger amount of data results in obtaining a **IOPub data rate exceeded** error. 

We have a lot of anime film items and only 17 users. Therefore, in order to make the recommendation system more efficient, we're going to use user_based method, to find the similarities between users and not between items.

In [10]:
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [11]:
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions, n=10)

We combine both tables to display the movies recommended for the given users.

In [12]:
for uid, user_ratings in top_n.items():
    print('\n\nUser', uid)
    print('Recommended movies:\n')
    for (iid, _) in user_ratings:
        print(anime_df[anime_df['anime_id'] == iid]['name'])



User 1
Recommended movies:

122    Kuroko no Basket
Name: name, dtype: object
841    Naruto
Name: name, dtype: object
803    Shaman King
Name: name, dtype: object
79    Slam Dunk
Name: name, dtype: object
15    Sen to Chihiro no Kamikakushi
Name: name, dtype: object
4444    Dragon Ball GT
Name: name, dtype: object


User 2
Recommended movies:

1709    Highschool of the Dead
Name: name, dtype: object
1057    High School DxD
Name: name, dtype: object
804    Sword Art Online
Name: name, dtype: object
724    High School DxD New
Name: name, dtype: object
841    Naruto
Name: name, dtype: object
803    Shaman King
Name: name, dtype: object
79    Slam Dunk
Name: name, dtype: object
15    Sen to Chihiro no Kamikakushi
Name: name, dtype: object
4444    Dragon Ball GT
Name: name, dtype: object


User 3
Recommended movies:

1709    Highschool of the Dead
Name: name, dtype: object
1057    High School DxD
Name: name, dtype: object
804    Sword Art Online
Name: name, dtype: object
724    High Schoo

In [13]:
cross_validate(algo, data, measures=['MAE', 'RMSE'], cv=3, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating MAE, RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
MAE (testset)     1.2500  0.9524  0.6667  0.9563  0.2382  
RMSE (testset)    1.8028  0.9619  0.8165  1.1937  0.4347  
Fit time          0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


{'test_mae': array([1.25      , 0.95238095, 0.66666667]),
 'test_rmse': array([1.80277564, 0.96185761, 0.81649658]),
 'fit_time': (0.00010085105895996094,
  1.8358230590820312e-05,
  1.4781951904296875e-05),
 'test_time': (7.605552673339844e-05,
  2.9802322387695312e-05,
  2.7179718017578125e-05)}