In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from project1 import top_20_rate_count, SVD, recommend

          rec1   rec2   rec3
user_id                     
1        16498     20    226
3        10620    121   6547
5         2001  16498  11111
7           20    226   6547
8        16498   8074  11111
10       16498    121   9253
11        6547   4224     20
12       10620   2001  11111
14       11111   4224   5114
16        4224   9253  11111
17        3588   9919     20
18        9919    226   9253
19        2904  16498   2001
20       11111   8074   9919
21       11757    121   8074
22        9919   4224  11111
24       10620   9919   2167
25          20   9253   4224
27          20  11111   9253
28        8074   2167  11111


# Preparing Data

### 1. read datasets to pandas DataFrame

In [2]:
# First dataset: rating.csv (printed shape shows three columns:
# user_id, anime_id, rating).
rating = pd.read_csv('rating.csv', sep=',')

# Turn the -1 sentinel into NaN so it is handled as a missing value.
# NOTE(review): -1 presumably marks "no rating given" -- confirm against the
# dataset description.
# Series.replace is vectorized; the earlier per-row lambda via .apply did the
# same substitution one Python call at a time over every row.
rating['rating'] = rating['rating'].replace(-1, np.nan)

print('shape of rating is', rating.shape)
rating.head()

shape of rating is (7813737, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [3]:
# Second dataset: anime.csv.  The `name` column is exposed as `anime_name`
# so the title column has an unambiguous label after later merges.
anime_raw = pd.read_csv('anime.csv', sep=',')
anime = anime_raw.rename(columns={'name': 'anime_name'})

print('shape of anime is', anime.shape)
anime.head()

shape of anime is (12294, 7)


Unnamed: 0,anime_id,anime_name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


### 2. focus on a subset: top 20 anime based on rating counts

In [4]:
# Ask the project1 helper for the 20 anime with the most ratings.
# NOTE(review): the helper's exact output schema is not visible here; the
# merge below expects it to provide `anime_id` and `count` columns.
top20_counts = top_20_rate_count(rating, "anime_id", "rating")

# Look up each title in the anime table and keep only id, count and name.
kept_columns = ['anime_id', 'count', 'anime_name']
rating_count_20 = pd.merge(top20_counts, anime, on='anime_id')[kept_columns]
rating_count_20.head()

Unnamed: 0,anime_id,count,anime_name
0,1535,34226,Death Note
1,11757,26310,Sword Art Online
2,16498,25290,Shingeki no Kyojin
3,1575,24126,Code Geass: Hangyaku no Lelouch
4,6547,23565,Angel Beats!


From now on, we focus only on the **_top 20 anime_** that users rate most often.  
We keep only the users who have rated at least one of these 20 anime  
and leave all other users out.

In [5]:
# Keep only the rating rows whose anime_id is one of the selected top 20.
in_top20 = rating["anime_id"].isin(rating_count_20['anime_id'])
rating20 = rating.loc[in_top20]

print('shape of rating20 is', rating20.shape)
rating20.head()

shape of rating20 is (495269, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,20,
3,1,226,
14,1,2001,
22,1,4224,
36,1,6547,


### 3. transform the dataframe: one user per row

In [10]:
# Reshape to a user x anime matrix: one row per user_id, one column per
# anime_id, cells holding the rating (pivot_table's default aggregation is
# used if a user/anime pair occurs more than once).
ratings_by_user = rating20.pivot_table(
    values='rating',
    index=['user_id'],
    columns=['anime_id'],
)
print('shape of rating_pivot is', ratings_by_user.shape)

# Cells with no rating for that user/anime pair become 0.
rating_pivot = ratings_by_user.fillna(0)
rating_pivot.head(10)

shape of rating_pivot is (61180, 20)


anime_id,20,121,199,226,1535,1575,2001,2167,2904,3588,4224,5114,6547,8074,9253,9919,10620,11111,11757,16498
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,10.0,0.0
3,8.0,0.0,10.0,0.0,10.0,0.0,0.0,0.0,0.0,8.0,0.0,10.0,0.0,6.0,0.0,8.0,0.0,8.0,9.0,10.0
5,6.0,0.0,8.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,3.0,0.0,3.0,2.0,9.0,4.0,0.0,0.0,1.0,0.0
7,0.0,8.0,0.0,0.0,9.0,9.0,0.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,8.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,9.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0
11,0.0,7.0,10.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,8.0,0.0,0.0,7.0,0.0,0.0,0.0,8.0,9.0
12,0.0,0.0,0.0,0.0,10.0,10.0,0.0,0.0,10.0,0.0,0.0,9.0,0.0,6.0,10.0,0.0,0.0,0.0,9.0,10.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,9.0,6.0,9.0,6.0,8.0,0.0,6.0,8.0
16,0.0,0.0,0.0,0.0,9.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Implementing SVD

In [7]:
# Run the project1 SVD helper on the user x anime matrix with dim=10.
# NOTE(review): presumably `reduced` is the 10-dimensional representation and
# `push_back` the reconstruction in the original 20 columns (the printed
# shapes are consistent with that) -- confirm against project1.SVD.
reduced, push_back = SVD(
    rating_pivot,
    dim=10,
    full_matrices=False,
)

print('shape of reduced is', reduced.shape)
print('shape of push_back is', push_back.shape)
push_back.head()

shape of reduced is (61180, 10)
shape of push_back is (61180, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,2.596742,0.020816,-1.331291,2.308622,-1.785923,0.058843,0.331169,0.397696,0.034192,0.606683,0.513528,-0.516781,1.578496,2.704276,-0.199963,1.803972,2.290435,1.341379,4.582664,3.676649
1,7.822355,4.432743,9.410604,-0.461986,9.979232,0.190647,-0.235262,-2.368415,-0.274828,6.697843,1.483036,6.681715,2.031716,2.963467,1.123729,9.105539,4.85871,4.675001,7.905171,10.415891
2,2.058552,0.560666,7.093647,-0.829962,5.168334,0.426229,2.886348,1.186973,0.157781,4.30046,4.21293,1.069485,2.259333,0.926505,1.680712,3.564543,1.522331,1.893887,1.323831,2.120948
3,3.691066,3.326753,0.056059,3.530243,7.498011,9.298604,-1.469074,4.829843,8.866625,-0.994185,2.257373,3.041095,3.317555,1.467808,1.853732,0.716712,1.403141,1.486687,2.881651,2.541712
4,1.122014,-0.789232,-0.555396,0.819814,-0.907395,-0.013085,-0.468526,-0.025823,0.028123,-0.237761,0.091889,0.051164,1.286806,1.60292,0.723525,1.457137,2.220659,1.510128,3.895738,3.892375


In [8]:
# Rank push_back's columns for every user from HIGHEST to lowest score:
# np.argsort sorts ascending, so each row of the result is reversed.
# The ranking is computed on a plain ndarray and wrapped in a DataFrame
# explicitly, instead of relying on np.argsort handing back a DataFrame.
# Cell values are column positions within push_back, not anime ids.
ranked_positions = np.argsort(np.asarray(push_back), axis=1)[:, ::-1]
rec_sort = pd.DataFrame(
    ranked_positions,
    index=rating_pivot.index,          # label rows by user_id
    columns=push_back.columns[::-1],   # same reversed column labels as before
)
rec_sort.head()

Unnamed: 0_level_0,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,18,19,13,0,3,16,15,12,17,9,10,7,6,5,8,1,14,11,2,4
3,19,4,2,15,18,0,9,11,16,17,1,13,12,10,14,5,6,8,3,7
5,2,4,9,10,15,6,12,19,0,17,14,16,18,7,11,13,1,5,8,3
7,5,8,4,7,0,3,1,12,11,18,19,10,14,17,13,16,15,2,9,6
8,18,19,16,13,17,15,12,0,3,14,10,11,8,5,7,9,6,2,1,4


# Recommendation

In [9]:
# Build the per-user recommendation table with the project1 helper, passing
# the rating matrix, the per-user ranking and the top-20 lookup table.
# NOTE(review): the printed output shows columns rec1..rec3 holding anime ids;
# how already-rated titles are handled is decided inside `recommend`.
rec = recommend(rating_pivot, rec_sort, rating_count_20)

# Plain-text preview of the first ten users.
preview = rec.head(10)
print(preview)

          rec1   rec2   rec3
user_id                     
1        16498     20    226
3        10620    121   6547
5         2001  16498  11111
7           20    226   6547
8        16498   8074  11111
10       16498    121   9253
11        6547   4224     20
12       10620   2001  11111
14       11111   4224   5114
16        4224   9253  11111
