In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## prepare data

In [2]:
anime = pd.read_csv('anime.csv', sep=',')
rating = pd.read_csv('rating.csv', sep=',')
rating['rating'] = rating['rating'].apply(lambda x: np.nan if x==-1 else x)
print('shape of rating is', rating.shape)
rating.head()

shape of rating is (7813737, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [18]:
# select the top 20 anime based on rating count
rating_count = rating[rating.rating != -1]
rating_count = rating_count.groupby(by = ['anime_id'])['rating'].count()
rating_count = rating_count.reset_index().rename(columns = {'rating': 'rating_count'})
rating_count = rating_count.sort_values(by = 'rating_count',ascending = False)

rating_count_20 = pd.merge(rating_count, anime, on='anime_id', suffixes= ['', '_user'])
rating_count_20 = rating_count_20[['anime_id', 'rating_count', 'name', 'genre']]
rating_count_20 = rating_count_20.rename(columns = {'name': 'anime_name'}).iloc[:20]
rating_count_20

Unnamed: 0,anime_id,rating_count,anime_name,genre
0,1535,34226,Death Note,"Mystery, Police, Psychological, Supernatural, ..."
1,11757,26310,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance"
2,16498,25290,Shingeki no Kyojin,"Action, Drama, Fantasy, Shounen, Super Power"
3,1575,24126,Code Geass: Hangyaku no Lelouch,"Action, Mecha, Military, School, Sci-Fi, Super..."
4,6547,23565,Angel Beats!,"Action, Comedy, Drama, School, Supernatural"
5,226,23528,Elfen Lied,"Action, Drama, Horror, Psychological, Romance,..."
6,20,22071,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P..."
7,5114,21494,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
8,121,21332,Fullmetal Alchemist,"Action, Adventure, Comedy, Drama, Fantasy, Mag..."
9,2904,21124,Code Geass: Hangyaku no Lelouch R2,"Action, Drama, Mecha, Military, Sci-Fi, Super ..."


In [4]:
type(rating_count_20)

pandas.core.frame.DataFrame

From now on, we will only focus on the **_top 20 animes_** that are most commonly rated by users.

In [5]:
rating20 = rating[rating.anime_id.isin(rating_count_20['anime_id'])]
print('shape of rating20 is', rating20.shape)
rating20.head()

shape of rating20 is (495269, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,20,
3,1,226,
14,1,2001,
22,1,4224,
36,1,6547,


In [6]:
# create a pivot table
rating_pivot = rating20.pivot_table(index=['user_id'], columns=['anime_id'], values='rating')
print('shape of rating_pivot is', rating_pivot.shape)

# put 0 if user has not rated this anime
rating_pivot.fillna(0, inplace=True)
rating_pivot.head()

shape of rating_pivot is (61180, 20)


anime_id,20,121,199,226,1535,1575,2001,2167,2904,3588,4224,5114,6547,8074,9253,9919,10620,11111,11757,16498
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,10.0,0.0
3,8.0,0.0,10.0,0.0,10.0,0.0,0.0,0.0,0.0,8.0,0.0,10.0,0.0,6.0,0.0,8.0,0.0,8.0,9.0,10.0
5,6.0,0.0,8.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,3.0,0.0,3.0,2.0,9.0,4.0,0.0,0.0,1.0,0.0
7,0.0,8.0,0.0,0.0,9.0,9.0,0.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,8.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,9.0,0.0


In [7]:
rating_pivot.iloc[:, 4]

user_id
1         0.0
3        10.0
5         4.0
7         9.0
8         0.0
         ... 
73511     0.0
73512     0.0
73513     0.0
73515    10.0
73516     0.0
Name: 1535, Length: 61180, dtype: float64

## SVD

In [8]:
U,S,V = np.linalg.svd(rating_pivot, full_matrices=False)
print('shape of U is', U.shape)
print('shape of S is', S.shape)
print('shape of V is', V.shape)

shape of U is (61180, 20)
shape of S is (20,)
shape of V is (20, 20)


In [9]:
# reduce to the dimension of 10
rating_pivot_10 = U[:, :10] * S[:10]
rating_pivot_10.shape

(61180, 10)

In [10]:
rating_pivot_10

array([[-4.43290008e+00, -4.95997010e+00,  2.02369696e-01, ...,
         2.04694377e+00,  1.99818243e-01, -2.94664435e+00],
       [-1.94160112e+01, -4.63889671e+00,  1.37448337e+01, ...,
        -1.28587930e+00, -3.66899453e-01,  4.77934752e+00],
       [-9.43883433e+00, -8.18338294e-01,  3.21769803e+00, ...,
        -1.70652503e+00,  2.88732043e+00,  4.69035246e+00],
       ...,
       [-1.63785645e+00,  1.26747092e+00,  2.23668681e+00, ...,
         8.66424513e-01, -1.50537436e+00,  1.16289884e+00],
       [-2.66864519e+01,  7.20704921e+00, -4.46577333e-01, ...,
         9.59800725e+00,  3.05274459e+00,  2.89287330e+00],
       [-1.70302550e+00, -1.35635534e+00, -1.28591584e-02, ...,
         2.25779130e+00,  5.62158134e-01, -2.08671125e-01]])

In [11]:
# push back to the original dimension
push_back = np.dot(U[:, :10]*S[:10],V[:10,:])
print('shape of push_back is', push_back.shape)

shape of push_back is (61180, 20)


In [12]:
# sort the recommendation ratings from low to high for each user
push_back_sort = np.argsort(push_back, axis=1)
push_back_sort_pd = pd.DataFrame(push_back_sort, index=rating_pivot.index).iloc[:, ::-1]
# push_back_sort_pd

In [13]:
rating_pivot.columns

Int64Index([   20,   121,   199,   226,  1535,  1575,  2001,  2167,  2904,
             3588,  4224,  5114,  6547,  8074,  9253,  9919, 10620, 11111,
            11757, 16498],
           dtype='int64', name='anime_id')

In [14]:
all_users = rating_pivot.index
columns = ['1_id', '1_name', '2_id', '2_name', '3_id', '3_name']
rec_matrix = pd.DataFrame(index=all_users, columns=columns)
# rec_matrix

In [15]:
def recommend(rating, rec_order, anime_id_name):
    """
    user_id - integer
    rating - pandas.DataFrame - the pivot dataframe with original ratings
    rec_order - pandas.DataFrame
    anime_id_name - pandas.DataFrame - help find the name of the anime

    give THREE recommendations to the user.
    recommendations must be anime that has NOT been rated by the user"""
    
    all_users = rating.index[:500]
    columns = ['user_id', 'rec1', 'rec2', 'rec3']
    rec_matrix = pd.DataFrame(index=all_users, columns=columns)
    
    for user_id in all_users:
#         rec_id_list = []
#         rec_name_list = []
        rec_list = [user_id]

        # if the usre has not rated any of the 20 anime,
        # the user will not be in our dataset and we cannot
        # give a recommendation to them.
#         if user_id not in rating.index:
#             continue

        user_rec_order = rec_order.loc[user_id]
        user_rating = rating.loc[user_id]
        anime_order = rating.columns
        max_anime = len(rating.columns)

        # create rec_list
        i = 0
        rec_num = 0
        while (i < max_anime) and (rec_num < 3):
            anime_rec = user_rec_order.iloc[i]
            anime_id = anime_order[anime_rec]  # get the id of that anime
            if user_rating.loc[anime_id] == 0:
                rec_num+=1
                rec_list.append(anime_id)  # if not rated, recommend
                
#                 anime = anime_id_name.loc[anime_id_name['anime_id'] == i]
#                 anime_name = anime['anime_name']
#                 rec_list.append(anime_name)
            i += 1
        
        if len(rec_list)<4:
            for i in range(len(rec_list), 4):
                rec_list.append(0)
        
#         print(rec_list)
        rec_matrix.loc[user_id] = rec_list

#         # create rec_name_list
#         for i in rec_id_list:
#             anime = anime_id_name.loc[anime_id_name['anime_id'] == i]
#             anime_name = anime['anime_name'].values[0]
#             rec_name_list.append(anime_name)

    return rec_matrix

In [16]:
rec_matrix = recommend(rating_pivot, push_back_sort_pd, rating_count_20)
# print(rec_id)
# print(rec_name)
rec_matrix.head(10)

Unnamed: 0_level_0,user_id,rec1,rec2,rec3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,16498,20,226
3,3,10620,121,6547
5,5,2001,16498,11111
7,7,20,226,6547
8,8,16498,8074,11111
10,10,16498,121,9253
11,11,6547,4224,20
12,12,10620,2001,11111
14,14,11111,4224,5114
16,16,4224,9253,11111


In [17]:
# test the anime recommended are not rated by the user
# if not rated, the entry should be 0
user_id = 1047
user = rating_pivot.loc[user_id]
id_list, name_list = recommend(user_id, rating_pivot, push_back_sort_pd, rating_count_20)
for i in range(len(id_list)):
    print(user.loc[id_list[i]])

TypeError: recommend() takes 3 positional arguments but 4 were given