In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
rating_data = pd.read_csv("data/animelist.csv", nrows=10000000)
anima_data = pd.read_csv("data/anime.csv")
anima_data = anima_data.rename(columns={"MAL_ID": "anime_id"})
anima_contact_data = anima_data[["anime_id", "Name"]]
anima_contact_data

Unnamed: 0,anime_id,Name
0,1,Cowboy Bebop
1,5,Cowboy Bebop: Tengoku no Tobira
2,6,Trigun
3,7,Witch Hunter Robin
4,8,Bouken Ou Beet
...,...,...
17557,48481,Daomu Biji Zhi Qinling Shen Shu
17558,48483,Mieruko-chan
17559,48488,Higurashi no Naku Koro ni Sotsu
17560,48491,Yama no Susume: Next Summit


In [15]:
rating_data = rating_data.merge(anima_contact_data, left_on = 'anime_id', right_on = 'anime_id', how = 'left')
rating_data = rating_data[["user_id", "Name", "anime_id","rating", "watching_status", "watched_episodes"]]
rating_data

Unnamed: 0,user_id,Name,anime_id,rating,watching_status,watched_episodes
0,0,Basilisk: Kouga Ninpou Chou,67,9,1,1
1,0,Fairy Tail,6702,7,1,4
2,0,Gokusen,242,10,1,4
3,0,Kuroshitsuji,4898,0,1,1
4,0,One Piece,21,10,1,0
...,...,...,...,...,...,...
9999995,32600,Trigun,6,0,3,2
9999996,32600,True Tears,2129,4,3,2
9999997,32600,Tsurezure Children,34902,0,3,4
9999998,32600,Uchi no Maid ga Uzasugiru!,37722,0,3,1


In [16]:
rating_data.drop_duplicates()
rating_data.to_csv("data/test.csv")
rating_data

Unnamed: 0,user_id,Name,anime_id,rating,watching_status,watched_episodes
0,0,Basilisk: Kouga Ninpou Chou,67,9,1,1
1,0,Fairy Tail,6702,7,1,4
2,0,Gokusen,242,10,1,4
3,0,Kuroshitsuji,4898,0,1,1
4,0,One Piece,21,10,1,0
...,...,...,...,...,...,...
9999995,32600,Trigun,6,0,3,2
9999996,32600,True Tears,2129,4,3,2
9999997,32600,Tsurezure Children,34902,0,3,4
9999998,32600,Uchi no Maid ga Uzasugiru!,37722,0,3,1


In [17]:
rating_data.shape

(10000000, 6)

In [18]:
count = rating_data['user_id'].value_counts()
count1 = rating_data['anime_id'].value_counts()
rating_data = rating_data[rating_data['user_id'].isin(count[count >= 500].index)].copy()
rating_data = rating_data[rating_data['anime_id'].isin(count1[count1 >= 100].index)].copy()

In [19]:
rating_data.isna().sum()

user_id             0
Name                0
anime_id            0
rating              0
watching_status     0
watched_episodes    0
dtype: int64

In [7]:
# Encoding categorical data
user_ids = rating_data["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
user_encoded2user = {i: x for i, x in enumerate(user_ids)}
rating_data["user"] = rating_data["user_id"].map(user2user_encoded)
n_users = len(user2user_encoded)

anime_ids = rating_data["anime_id"].unique().tolist()
anime2anime_encoded = {x: i for i, x in enumerate(anime_ids)}
anime_encoded2anime = {i: x for i, x in enumerate(anime_ids)}
rating_data["anime"] = rating_data["anime_id"].map(anime2anime_encoded)
n_animes = len(anime2anime_encoded)

print("Num of users: {}, Num of animes: {}".format(n_users, n_animes))
print("Min rating: {}, Max rating: {}".format(min(rating_data['rating']), max(rating_data['rating'])))

Num of users: 6196, Num of animes: 7436
Min rating: 0, Max rating: 10


In [8]:
rating_data = rating_data

In [9]:
g = rating_data.groupby('user_id')['rating'].count()
top_users = g.dropna().sort_values(ascending=False)[:20]
top_r = rating_data.join(top_users, rsuffix='_r', how='inner', on='user_id')

g = rating_data.groupby('anime_id')['rating'].count()
top_animes = g.dropna().sort_values(ascending=False)[:20]
top_r = top_r.join(top_animes, rsuffix='_r', how='inner', on='anime_id')

pivot = pd.crosstab(top_r.user_id, top_r.anime_id, top_r.rating, aggfunc=np.sum)

KeyboardInterrupt: 

In [None]:
pivot.fillna(0, inplace=True)
pivot

anime_id,226,1535,1575,2001,2167,4224,5081,5114,6547,6746,9253,9989,10620,11757,15809,16498,19815,20507,22319,30276
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4132,7.0,10.0,7.0,0.0,6.0,5.0,0.0,9.0,6.0,0.0,9.0,8.0,8.0,9.0,8.0,9.0,6.0,0.0,7.0,7.0
4773,7.0,7.0,9.0,9.0,10.0,8.0,9.0,9.0,8.0,8.0,10.0,8.0,9.0,7.0,8.0,8.0,4.0,8.0,7.0,8.0
6536,8.0,10.0,10.0,10.0,10.0,10.0,0.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,10.0,10.0,10.0,10.0,10.0
7179,8.0,7.0,9.0,9.0,8.0,7.0,8.0,10.0,7.0,9.0,7.0,9.0,8.0,4.0,8.0,7.0,7.0,7.0,6.0,7.0
10255,8.0,8.0,8.0,9.0,8.0,8.0,6.0,10.0,7.0,8.0,10.0,9.0,6.0,0.0,7.0,8.0,5.0,6.0,0.0,4.0
10665,8.0,8.0,0.0,0.0,10.0,9.0,0.0,0.0,9.0,9.0,0.0,10.0,9.0,9.0,0.0,10.0,0.0,8.0,8.0,7.0
11100,4.0,7.0,9.0,0.0,5.0,0.0,7.0,0.0,6.0,0.0,0.0,0.0,4.0,5.0,7.0,5.0,6.0,0.0,2.0,0.0
15083,7.0,8.0,9.0,9.0,0.0,6.0,8.0,7.0,8.0,8.0,8.0,8.0,8.0,8.0,7.0,7.0,8.0,7.0,7.0,8.0
16057,10.0,10.0,10.0,10.0,9.0,0.0,8.0,10.0,10.0,10.0,10.0,10.0,10.0,9.0,10.0,10.0,10.0,9.0,9.0,10.0
16869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
rating_data

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes,user,anime
1415,6,9062,8,1,1,0,0
1416,6,9919,0,1,2,0,1
1417,6,150,7,1,15,0,2
1418,6,4981,0,1,12,0,3
1419,6,10793,0,1,2,0,4
...,...,...,...,...,...,...,...
9999995,32600,6,0,3,2,6195,513
9999996,32600,2129,4,3,2,6195,2004
9999997,32600,34902,0,3,4,6195,2873
9999998,32600,37722,0,3,1,6195,2967


In [None]:
piviot_table = rating_data.pivot(index="anime_id",column="user_id", values="rating").fillna(0)
piviot_table

TypeError: pivot() got an unexpected keyword argument 'column'

In [None]:
from scipy.sparse import csr_matrix
piviot_table_matrix = csr_matrix(piviot_table.values)

In [None]:
piviot_table_matrix

<6196x7436 sparse matrix of type '<class 'numpy.float64'>'
	with 3014575 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric="cosine", algorithm="brute")
model.fit(piviot_table_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [None]:
piviot_table.iloc[2276, :].values.reshape(1, -1)

array([[8., 0., 0., ..., 0., 0., 0.]])

In [None]:
def predict():
    random_anime = np.random.choice(piviot_table.shape[0]) 

    query = piviot_table.iloc[random_anime, :].values.reshape(1, -1)
    distance, suggestions = model.kneighbors(query, n_neighbors=6)


    for i in range(0, len(distance.flatten())):
        if i == 0:
            print('Recommendations for {0}:\n'.format(anima_data.iloc[random_anime]["Name"]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, anima_data.iloc[suggestions[0][i]]["Name"], distance.flatten()[i]))

In [None]:
predict()

Recommendations for Kagami no Genon:

1: Shina Dark: Kuroki Tsuki no Ou to Souheki no Tsuki no Himegimi, with distance of 0.6605662793881182:
2: Tactical Roar, with distance of 0.6713804261618661:
3: Serial Experiments Lain, with distance of 0.6803430228118441:
4: 4-Day Weekend, with distance of 0.6856262750689825:
5: Hoero! Bun Bun, with distance of 0.6875895052144237:
