In [118]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [119]:
# Interaction log: only the first 10,000,000 rows of the CSV are read.
rating_data = pd.read_csv("data/animelist.csv", nrows=10_000_000)

# Anime metadata. Its "MAL_ID" column is renamed to "anime_id", the name the
# rest of the notebook uses for the anime key in rating_data.
anima_data = (
    pd.read_csv("data/anime.csv")
    .rename(columns={"MAL_ID": "anime_id"})
)

In [120]:
# Row counts per user and per anime. Both are taken from the frame as loaded,
# i.e. before either threshold below has removed anything.
count = rating_data['user_id'].value_counts()
count1 = rating_data['anime_id'].value_counts()

# Keep a row only if its user has at least 500 rows AND its anime has at
# least 100 rows (thresholds are relative to the unfiltered counts above).
from_active_user = rating_data['user_id'].isin(count.loc[count >= 500].index)
of_popular_anime = rating_data['anime_id'].isin(count1.loc[count1 >= 100].index)
rating_data = rating_data.loc[from_active_user & of_popular_anime].copy()

In [121]:
# Number of missing values in each column of the filtered frame.
rating_data.isna().sum(axis=0)

user_id             0
anime_id            0
rating              0
watching_status     0
watched_episodes    0
dtype: int64

In [122]:
# Encoding categorical data
# Raw user/anime ids are mapped onto contiguous integer codes 0..n-1, assigned
# in order of first appearance in rating_data. Forward (id -> code) and
# reverse (code -> id) lookups are kept for both.
user_ids = rating_data["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
user_encoded2user = dict(enumerate(user_ids))
rating_data["user"] = rating_data["user_id"].map(user2user_encoded)
n_users = len(user2user_encoded)

anime_ids = rating_data["anime_id"].unique().tolist()
anime2anime_encoded = {x: i for i, x in enumerate(anime_ids)}
anime_encoded2anime = dict(enumerate(anime_ids))
rating_data["anime"] = rating_data["anime_id"].map(anime2anime_encoded)
n_animes = len(anime2anime_encoded)

print("Num of users: {}, Num of animes: {}".format(n_users, n_animes))
# Use the vectorised Series reductions: the built-in min()/max() would iterate
# over every element of the column in Python.
# NOTE(review): a minimum of 0 presumably marks "not rated" entries rather than
# a real score — confirm against the dataset description.
print("Min rating: {}, Max rating: {}".format(rating_data["rating"].min(), rating_data["rating"].max()))

Num of users: 6196, Num of animes: 7436
Min rating: 0, Max rating: 10


In [123]:
# NOTE(review): self-assignment — rebinds rating_data to the same object and
# changes nothing. Presumably a leftover from a removed step; candidate for deletion.
rating_data = rating_data

In [124]:
# Small dense view of the data: the 20 users and the 20 animes with the most
# rows in rating_data, cross-tabulated by rating.

# count() never yields NaN, so no dropna() is needed; head(20) takes the first
# 20 entries positionally, independent of the integer id index.
g = rating_data.groupby('user_id')['rating'].count()
top_users = g.sort_values(ascending=False).head(20)

g = rating_data.groupby('anime_id')['rating'].count()
top_animes = g.sort_values(ascending=False).head(20)

# Keep only rows whose user AND anime are both in the top-20 sets. Filtering
# with isin selects the same rows as the two inner joins did, without adding
# two count columns that both end up named "rating_r".
top_r = rating_data[
    rating_data['user_id'].isin(top_users.index)
    & rating_data['anime_id'].isin(top_animes.index)
]

# One row per user, one column per anime; cells hold the summed rating and are
# NaN where the pair has no row. The string "sum" replaces np.sum, which recent
# pandas versions deprecate as an aggfunc.
pivot = pd.crosstab(top_r.user_id, top_r.anime_id, values=top_r.rating, aggfunc="sum")

In [125]:
# Fill the cells for user/anime pairs that have no row with 0 so the table is
# dense. Rebinding (instead of inplace=True) keeps the cell chainable and
# leaves the crosstab result itself unmodified.
pivot = pivot.fillna(0)
pivot

anime_id,226,1535,1575,2001,2167,4224,5081,5114,6547,6746,9253,9989,10620,11757,15809,16498,19815,20507,22319,30276
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4132,7.0,10.0,7.0,0.0,6.0,5.0,0.0,9.0,6.0,0.0,9.0,8.0,8.0,9.0,8.0,9.0,6.0,0.0,7.0,7.0
4773,7.0,7.0,9.0,9.0,10.0,8.0,9.0,9.0,8.0,8.0,10.0,8.0,9.0,7.0,8.0,8.0,4.0,8.0,7.0,8.0
6536,8.0,10.0,10.0,10.0,10.0,10.0,0.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,10.0,10.0,10.0,10.0,10.0
7179,8.0,7.0,9.0,9.0,8.0,7.0,8.0,10.0,7.0,9.0,7.0,9.0,8.0,4.0,8.0,7.0,7.0,7.0,6.0,7.0
10255,8.0,8.0,8.0,9.0,8.0,8.0,6.0,10.0,7.0,8.0,10.0,9.0,6.0,0.0,7.0,8.0,5.0,6.0,0.0,4.0
10665,8.0,8.0,0.0,0.0,10.0,9.0,0.0,0.0,9.0,9.0,0.0,10.0,9.0,9.0,0.0,10.0,0.0,8.0,8.0,7.0
11100,4.0,7.0,9.0,0.0,5.0,0.0,7.0,0.0,6.0,0.0,0.0,0.0,4.0,5.0,7.0,5.0,6.0,0.0,2.0,0.0
15083,7.0,8.0,9.0,9.0,0.0,6.0,8.0,7.0,8.0,8.0,8.0,8.0,8.0,8.0,7.0,7.0,8.0,7.0,7.0,8.0
16057,10.0,10.0,10.0,10.0,9.0,0.0,8.0,10.0,10.0,10.0,10.0,10.0,10.0,9.0,10.0,10.0,10.0,9.0,9.0,10.0
16869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
# Show the filtered frame, which now also carries the encoded "user" and
# "anime" columns added above.
rating_data

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes,user,anime
1415,6,9062,8,1,1,0,0
1416,6,9919,0,1,2,0,1
1417,6,150,7,1,15,0,2
1418,6,4981,0,1,12,0,3
1419,6,10793,0,1,2,0,4
...,...,...,...,...,...,...,...
9999995,32600,6,0,3,2,6195,513
9999996,32600,2129,4,3,2,6195,2004
9999997,32600,34902,0,3,4,6195,2873
9999998,32600,37722,0,3,1,6195,2967


In [127]:
# Full user x anime matrix: one row per user_id, one column per anime_id, cell
# value taken from "rating"; pairs with no row are filled with 0.
# NOTE(review): DataFrame.pivot raises if a (user_id, anime_id) pair occurs more
# than once, and 0 is also a value present in "rating" — confirm that treating
# "no row" and "rating 0" alike is intended.
piviot_table = (
    rating_data
    .pivot(index="user_id", columns="anime_id", values="rating")
    .fillna(0)
)
piviot_table

anime_id,1,5,6,7,8,15,16,17,18,19,...,47160,47164,47250,47398,47616,47778,48375,48413,48417,48438
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,8.0,6.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32587,10.0,7.0,10.0,8.0,0.0,10.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32589,8.0,0.0,7.0,0.0,0.0,9.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
