In [82]:
import numpy as np
import pandas as pd
from collections import defaultdict

from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic
from surprise.model_selection import cross_validate

import pprint

In [83]:
ratings_df = pd.read_csv("rating.csv")
ratings_df.shape

(7813737, 3)

In [84]:
ratings_df.head(5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [85]:
# ratings_df["rating"].unique()

total_ratings_count = ratings_df["rating"].value_counts()
print(total_ratings_count)

rating
 8     1646019
-1     1476496
 7     1375287
 9     1254096
 10     955715
 6      637775
 5      282806
 4      104291
 3       41453
 2       23150
 1       16649
Name: count, dtype: int64


In [86]:
anime_df = pd.read_csv("anime.csv")
anime_df.shape

(12294, 7)

In [87]:
anime_df.head(5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [88]:
df = ratings_df.merge(anime_df.drop_duplicates(), left_on="anime_id", right_on= "anime_id", how="left")
df

Unnamed: 0,user_id,anime_id,rating_x,name,genre,type,episodes,rating_y,members
0,1,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297.0
1,1,24,-1,School Rumble,"Comedy, Romance, School, Shounen",TV,26,8.06,178553.0
2,1,79,-1,Shuffle!,"Comedy, Drama, Ecchi, Fantasy, Harem, Magic, R...",TV,24,7.31,158772.0
3,1,226,-1,Elfen Lied,"Action, Drama, Horror, Psychological, Romance,...",TV,13,7.85,623511.0
4,1,241,-1,Girls Bravo: First Season,"Comedy, Ecchi, Fantasy, Harem, Romance, School",TV,11,6.69,84395.0
...,...,...,...,...,...,...,...,...,...
7813732,73515,16512,7,Devil Survivor 2 The Animation,"Action, Demons, Supernatural",TV,13,7.06,101266.0
7813733,73515,17187,9,Ghost in the Shell: Arise - Border:1 Ghost Pain,"Mecha, Police, Psychological, Sci-Fi",Movie,1,7.64,31747.0
7813734,73515,22145,10,Kuroshitsuji: Book of Circus,"Comedy, Demons, Fantasy, Historical, Shounen, ...",TV,10,8.37,122895.0
7813735,73516,790,9,Ergo Proxy,"Mystery, Psychological, Sci-Fi",TV,23,8.03,265005.0


In [89]:
df.shape

(7813737, 9)

In [90]:
df = df[df.rating_x!=-1]

In [91]:
total_ratings_count = df["rating_x"].value_counts()
print(total_ratings_count)

rating_x
8     1646019
7     1375287
9     1254096
10     955715
6      637775
5      282806
4      104291
3       41453
2       23150
1       16649
Name: count, dtype: int64


In [92]:
df["rating_x"].describe()

count    6.337241e+06
mean     7.808497e+00
std      1.572496e+00
min      1.000000e+00
25%      7.000000e+00
50%      8.000000e+00
75%      9.000000e+00
max      1.000000e+01
Name: rating_x, dtype: float64

In [93]:
df = df.sample(frac=0.0001, random_state=42) 
len(df)

634

In [94]:
df.columns

Index(['user_id', 'anime_id', 'rating_x', 'name', 'genre', 'type', 'episodes',
       'rating_y', 'members'],
      dtype='object')

In [95]:
df.shape

(634, 9)

In [96]:
mean_rating = df.groupby("anime_id") [["rating_x"]].mean()
mean_rating.sort_values(by="rating_x", ascending=False)

Unnamed: 0_level_0,rating_x
anime_id,Unnamed: 1_level_1
1,10.0
8038,10.0
6895,10.0
6500,10.0
6114,10.0
...,...
1117,4.0
1401,3.0
3772,3.0
7148,3.0


In [97]:
df["rating_x"].describe()

count    634.000000
mean       7.843849
std        1.464450
min        1.000000
25%        7.000000
50%        8.000000
75%        9.000000
max       10.000000
Name: rating_x, dtype: float64

## Mallinnus

In [98]:
lukija = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(df[["user_id", "name", "rating_x"]], lukija)
data

<surprise.dataset.DatasetAutoFolds at 0x2ad2ebea000>

In [99]:
# Set all data as training set >> tämän voi käyttää, jos riittävästi dataa
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=0.25)
print(trainset)
print(testset)

<surprise.trainset.Trainset object at 0x000002AD2EBE9760>
[(49782, 'Akazukin Chacha', 8.0), (66849, 'Carnival Phantasm', 8.0), (52624, 'Strike Witches OVA', 6.0), (32319, 'Kill la Kill', 7.0), (52419, 'Kyou kara Maou!', 7.0), (40389, 'Kono Minikuku mo Utsukushii Sekai', 7.0), (55290, 'Majin Tantei Nougami Neuro', 7.0), (66864, 'Death Note', 10.0), (65469, 'Shingeki no Kyojin Movie 1: Guren no Yumiya', 6.0), (72932, 'Kuroko no Basket: Mou Ikkai Yarimasen ka', 8.0), (35729, 'Hitsugi no Chaika', 8.0), (67617, 'Naruto Movie 2: Dai Gekitotsu! Maboroshi no Chiteiiseki Dattebayo!', 7.0), (55319, 'Love Hina Again', 6.0), (59837, 'Pandora Hearts', 7.0), (6014, 'Terra e... (TV)', 10.0), (40986, 'Soul Eater', 8.0), (73120, 'InuYasha', 8.0), (65840, 'Harmony', 9.0), (28192, 'Bokurano', 7.0), (36628, 'Rosario to Vampire Capu2', 9.0), (58703, 'Code Geass: Hangyaku no Lelouch R2', 9.0), (36132, 'Tokyo Ghoul: &quot;Jack&quot;', 9.0), (66298, 'High School DxD New', 8.0), (38997, 'Meganebu!', 5.0), (491

In [100]:
# Build and train an algorithm.
# Tarjotaan trainset algoritmille

sim_options = {"user_based": True} # False compute similarities between items, True between customers

algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x2ad2ebea1b0>

In [101]:
preds = algo.test(testset)
preds[0:20]

[Prediction(uid=49782, iid='Akazukin Chacha', r_ui=8.0, est=7.8715789473684215, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=66849, iid='Carnival Phantasm', r_ui=8.0, est=7.8715789473684215, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=52624, iid='Strike Witches OVA', r_ui=6.0, est=7.8715789473684215, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=32319, iid='Kill la Kill', r_ui=7.0, est=7.8715789473684215, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=52419, iid='Kyou kara Maou!', r_ui=7.0, est=7.8715789473684215, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=40389, iid='Kono Minikuku mo Utsukushii Sekai', r_ui=7.0, est=7.8715789473684215, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=55290, iid='Majin Tant

In [102]:
user_id_to_search = 70740  # Replace with the specific user_id
user_data = df[df['user_id'] == user_id_to_search]

print(user_data)


Empty DataFrame
Columns: [user_id, anime_id, rating_x, name, genre, type, episodes, rating_y, members]
Index: []
