In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from collections import defaultdict

In [2]:
df_anime = pd.read_csv('data/anime.csv', sep=',')
df_ratings = pd.read_csv('data/rating.csv', sep=',')

In [3]:
df_anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
df_anime.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [6]:
df_ratings

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [7]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [8]:
df_ratings.isna().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

### Exercise 1

In order to access the Anime title when recommending movies, w are going to merge the anime and rating tables into one DataFrame. We are also going to replace the -1 and 'Unknown' values with NaN, so that these results can be seen by the recommendation algorithm. 

In [9]:
df_anime = df_anime.replace('Unknown', np.nan)
df_anime = df_anime.dropna(how = 'all')
df_anime['type'] = df_anime['type'].fillna('TV')
df_anime['episodes'] = df_anime['episodes'].map(lambda x:np.nan if pd.isnull(x) else int(x))
df_ratings = df_ratings.replace(-1, np.nan)

In [10]:
df_merged = pd.merge(df_anime, df_ratings, how = 'right', on ='anime_id', suffixes = ['_avg', '_user'])
df_merged.rename(columns = {'rating_user':'user_rating', 'rating_avg':'avg_rating'}, inplace = True)
df_merged

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members,user_id,user_rating
0,20,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220.0,7.81,683297.0,1,
1,24,School Rumble,"Comedy, Romance, School, Shounen",TV,26.0,8.06,178553.0,1,
2,79,Shuffle!,"Comedy, Drama, Ecchi, Fantasy, Harem, Magic, R...",TV,24.0,7.31,158772.0,1,
3,226,Elfen Lied,"Action, Drama, Horror, Psychological, Romance,...",TV,13.0,7.85,623511.0,1,
4,241,Girls Bravo: First Season,"Comedy, Ecchi, Fantasy, Harem, Romance, School",TV,11.0,6.69,84395.0,1,
...,...,...,...,...,...,...,...,...,...
7813732,16512,Devil Survivor 2 The Animation,"Action, Demons, Supernatural",TV,13.0,7.06,101266.0,73515,7.0
7813733,17187,Ghost in the Shell: Arise - Border:1 Ghost Pain,"Mecha, Police, Psychological, Sci-Fi",Movie,1.0,7.64,31747.0,73515,9.0
7813734,22145,Kuroshitsuji: Book of Circus,"Comedy, Demons, Fantasy, Historical, Shounen, ...",TV,10.0,8.37,122895.0,73515,10.0
7813735,790,Ergo Proxy,"Mystery, Psychological, Sci-Fi",TV,23.0,8.03,265005.0,73516,9.0


In [11]:
# Filter out anime with low ratings
df_merged = df_merged[df_merged['avg_rating'] >= 6]
df_merged

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members,user_id,user_rating
0,20,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220.0,7.81,683297.0,1,
1,24,School Rumble,"Comedy, Romance, School, Shounen",TV,26.0,8.06,178553.0,1,
2,79,Shuffle!,"Comedy, Drama, Ecchi, Fantasy, Harem, Magic, R...",TV,24.0,7.31,158772.0,1,
3,226,Elfen Lied,"Action, Drama, Horror, Psychological, Romance,...",TV,13.0,7.85,623511.0,1,
4,241,Girls Bravo: First Season,"Comedy, Ecchi, Fantasy, Harem, Romance, School",TV,11.0,6.69,84395.0,1,
...,...,...,...,...,...,...,...,...,...
7813732,16512,Devil Survivor 2 The Animation,"Action, Demons, Supernatural",TV,13.0,7.06,101266.0,73515,7.0
7813733,17187,Ghost in the Shell: Arise - Border:1 Ghost Pain,"Mecha, Police, Psychological, Sci-Fi",Movie,1.0,7.64,31747.0,73515,9.0
7813734,22145,Kuroshitsuji: Book of Circus,"Comedy, Demons, Fantasy, Historical, Shounen, ...",TV,10.0,8.37,122895.0,73515,10.0
7813735,790,Ergo Proxy,"Mystery, Psychological, Sci-Fi",TV,23.0,8.03,265005.0,73516,9.0


In [12]:
# CreateReader object
reader = Reader(rating_scale=(1, 10))

In [13]:
# Load the dataset using the Dataset class
data = Dataset.load_from_df(df_merged[['user_id', 'anime_id', 'avg_rating']], reader)

In [14]:
trainset, testset = train_test_split(data, test_size=.33)

### Exercise 2

KNN recommendation algorithm does not perform well when dealing with a dataset this magnitude. The algorithm works fast when dealing with a relatively small number of rows (< 10 000). Fitting the algorithm to a larger amount of data results in obtaining a IOPub data rate exceeded error. 

In order to make our recommendation model more efficient, we decided to use SVD algorithm. It seems to successfully handle the entire dataset.

In [15]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7feb93796820>

In [16]:
predictions = algo.test(testset)
print(f"RMSE: {accuracy.rmse(predictions)}, MAE: {accuracy.mae(predictions)}")

RMSE: 0.0458
MAE:  0.0287
RMSE: 0.04575869695469065, MAE: 0.02869686854076746


In [17]:
# Find out top 10 recommended anime for user.
def get_top_n(predictions, n=10):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in list(top_n.items())[:5]:
    iname = [iid for (iid, _) in user_ratings]

    print(uid, [iid for (iid, _) in user_ratings])

51268 [19815, 4224, 4722, 18679, 72, 1943, 227, 517, 11111, 2994]
54851 [2904, 431, 1535, 28171, 9989, 16894, 13601, 5341, 11771, 853]
42086 [2904, 10087, 205, 4155, 31964, 15323, 121, 8740, 20159, 269]
58568 [11061, 2904, 164, 457, 245, 9989, 5028, 170, 7655, 16498]
25014 [5114, 9253, 4181, 2904, 263, 1575, 7311, 245, 11741, 1535]


In [18]:
animename_df = df_anime[['anime_id','name']]
animename_df.head()

Unnamed: 0,anime_id,name
0,32281,Kimi no Na wa.
1,5114,Fullmetal Alchemist: Brotherhood
2,28977,Gintama°
3,9253,Steins;Gate
4,9969,Gintama&#039;


### Exercises 3
In order to display the titles of the recommended movies in a meaningful way, we use the merged DataFrame and the code provided on the surprise library documentation page.

In [19]:
# Find out recommended animes for user

for uid, user_ratings in list(top_n.items())[:10]:
    dic = {"User Id":[], "Recommendation":[]}
    dic["User Id"].append(uid)
    
    for (iid, _) in user_ratings:
        anime_index = animename_df.index[animename_df['anime_id']==iid]
        anime_name = animename_df.iloc[anime_index]['name'].tolist()
        dic["Recommendation"].append(anime_name)

    for uu, ii in dic.items():
        print(uu, ii)
    print("\n")

User Id [51268]
Recommendation [['No Game No Life'], ['Toradora!'], ['Skip Beat!'], ['Kill la Kill'], ['Full Metal Panic? Fumoffu'], ['Paprika'], ['FLCL'], ['School Rumble Ichi Gakki Hoshuu'], ['Another'], ['Death Note Rewrite']]


User Id [54851]
Recommendation [['Code Geass: Hangyaku no Lelouch R2'], ['Howl no Ugoku Shiro'], ['Death Note'], ['Shokugeki no Souma'], ['Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.'], ['Kuroko no Basket 2nd Season'], ['Psycho-Pass'], ['Ookami to Koushinryou II'], ['Kuroko no Basket'], ['Ouran Koukou Host Club']]


User Id [42086]
Recommendation [['Code Geass: Hangyaku no Lelouch R2'], ['Fate/Zero'], ['Samurai Champloo'], ['One Piece Film: Strong World'], ['Boku no Hero Academia'], ['One Piece: Episode of Nami - Koukaishi no Namida to Nakama no Kizuna'], ['Fullmetal Alchemist'], ['One Piece Film: Strong World Episode 0'], ['Pokemon: The Origin'], ['Bleach']]


User Id [58568]
Recommendation [['Hunter x Hunter (2011)'], ['Code Geass: Hangyaku no