In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [66]:
# Anime metadata: rename the id column to `anime_id` so it shares a key name
# with the ratings table.
anima_data = pd.read_csv("data/anime.csv").rename(columns={"MAL_ID": "anime_id"})

# Ratings: only the first 10M rows of the CSV are read.
rating_data = pd.read_csv("data/animelist.csv", nrows=10_000_000)

In [67]:
# Rows per user and per anime, both measured on the frame as loaded.
count = rating_data['user_id'].value_counts()
count1 = rating_data['anime_id'].value_counts()

# Keep users with at least 1000 rows and animes with at least 5000 rows.
# Both thresholds refer to the pre-filter counts above, so applying the two
# conditions together selects the same rows as filtering one after the other.
active_user_ids = count[count >= 1000].index
popular_anime_ids = count1[count1 >= 5000].index
keep_rows = (
    rating_data['user_id'].isin(active_user_ids)
    & rating_data['anime_id'].isin(popular_anime_ids)
)
rating_data = rating_data[keep_rows].copy()

In [68]:
# Missing values per column of the filtered ratings frame.
rating_data.isnull().sum()

user_id             0
anime_id            0
rating              0
watching_status     0
watched_episodes    0
dtype: int64

In [69]:
# Encode raw ids as dense, zero-based integer codes (assigned in order of first
# appearance) and keep lookup tables for both directions.
user_ids = rating_data["user_id"].unique().tolist()
user2user_encoded = {raw_id: code for code, raw_id in enumerate(user_ids)}
user_encoded2user = dict(enumerate(user_ids))
rating_data["user"] = rating_data["user_id"].map(user2user_encoded)
n_users = len(user2user_encoded)

anime_ids = rating_data["anime_id"].unique().tolist()
anime2anime_encoded = {raw_id: code for code, raw_id in enumerate(anime_ids)}
anime_encoded2anime = dict(enumerate(anime_ids))
rating_data["anime"] = rating_data["anime_id"].map(anime2anime_encoded)
n_animes = len(anime2anime_encoded)

# Series.min()/.max() are vectorised and skip NaN; the Python builtins iterate
# the column element by element and give order-dependent results with NaN.
print("Num of users: {}, Num of animes: {}".format(n_users, n_animes))
print("Min rating: {}, Max rating: {}".format(rating_data["rating"].min(), rating_data["rating"].max()))

Num of users: 1499, Num of animes: 428
Min rating: 0, Max rating: 10


In [70]:
# NOTE(review): self-assignment - this rebinds the name to the same object and
# changes nothing; the cell can be deleted.
rating_data = rating_data

In [71]:
# Build a small user x anime table restricted to the 20 users and the 20 animes
# with the most non-null `rating` entries in `rating_data`.
user_rating_counts = rating_data.groupby('user_id')['rating'].count()
top_users = user_rating_counts.sort_values(ascending=False)[:20]

anime_rating_counts = rating_data.groupby('anime_id')['rating'].count()
top_animes = anime_rating_counts.sort_values(ascending=False)[:20]

# Filter with boolean masks rather than inner joins: the joins were used only
# as a filter and left two extra columns that were both named `rating_r`.
top_r = rating_data[
    rating_data['user_id'].isin(top_users.index)
    & rating_data['anime_id'].isin(top_animes.index)
]

# Sum of ratings per (user, anime) pair; pairs with no row come out as NaN.
# The string "sum" avoids the FutureWarning newer pandas emits for np.sum.
pivot = pd.crosstab(top_r.user_id, top_r.anime_id, top_r.rating, aggfunc="sum")

In [72]:
# Fill user/anime pairs that have no row with 0 so the table is dense.
# Reassign instead of using `inplace=True`, so the frame built by the previous
# cell is not mutated behind the reader's back.
# NOTE(review): `rating_data` already contains rating 0 (see the printed min),
# so after this fill a 0 cell can mean either "no row" or "rating 0".
pivot = pivot.fillna(0)
pivot

anime_id,226,849,1535,1575,2001,4224,5081,5114,6547,6746,8074,9253,9989,10620,11757,15809,16498,19815,20507,30276
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
240,0,0,8,8,8,7,7,7,7,8,5,9,0,7,8,7,8,8,6,6
3160,0,0,8,0,0,9,10,10,0,10,2,10,10,5,0,8,10,0,10,0
4132,7,0,10,7,0,5,0,9,6,0,6,9,8,8,9,8,9,6,0,7
9528,10,0,8,10,0,10,10,10,10,10,10,10,9,10,10,10,10,10,10,10
10268,4,6,7,10,7,4,10,7,8,6,7,9,8,4,4,8,7,7,7,8
11100,4,8,7,9,0,0,7,0,6,0,0,0,0,4,5,7,5,6,0,0
11249,9,7,10,9,10,10,9,10,8,9,9,10,9,10,8,9,10,9,9,10
16526,9,9,10,10,8,8,10,10,7,8,8,9,8,8,9,9,9,10,9,8
16530,8,9,10,10,9,9,9,10,9,10,9,10,8,9,8,9,9,8,9,9
17155,0,8,9,9,0,8,0,0,8,0,7,10,9,6,7,8,9,9,9,10


In [73]:
# Display the filtered ratings frame; besides the raw columns it now carries
# the integer `user` / `anime` codes assigned earlier in the notebook.
rating_data

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes,user,anime
3988,17,34572,0,1,167,0,0
3993,17,40748,0,1,22,0,1
4006,17,40028,0,1,13,0,2
4017,17,32998,9,2,12,0,3
4019,17,25397,6,2,12,0,4
...,...,...,...,...,...,...,...
9996489,32587,10495,0,6,0,1498,408
9996493,32587,392,0,6,0,1498,372
9996506,32587,23283,0,6,0,1498,304
9996509,32587,11319,0,6,0,1498,306


In [77]:
# Full user x anime rating matrix: one row per user_id, one column per
# anime_id, with pairs that have no row in `rating_data` filled with 0.
# NOTE(review): the variable name is misspelled ("piviot"); it is kept as-is
# because later cells may reference it.
piviot_table = (
    rating_data
    .pivot(index="user_id", columns="anime_id", values="rating")
    .fillna(0)
)
piviot_table

anime_id,1,6,16,19,20,21,24,30,32,33,...,38000,38408,38524,38671,38691,39587,40028,40221,40591,40748
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,7.0,9.0,8.0,10.0,0.0,0.0,7.0,9.0,0.0
121,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,8.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32534,0.0,0.0,0.0,10.0,10.0,9.0,0.0,0.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32564,0.0,0.0,0.0,5.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,10.0,8.0,0.0,0.0,0.0,8.0,9.0,0.0
32570,10.0,7.0,8.0,0.0,6.0,8.0,7.0,7.0,6.0,0.0,...,8.0,7.0,6.0,6.0,7.0,0.0,7.0,0.0,0.0,6.0
32582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
