In [1]:
import pandas as pd
import numpy as np
import pickle

from surprise import Dataset, Reader, SVD, SVDpp, accuracy
from surprise.model_selection import GridSearchCV, cross_validate

import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict, Counter

import warnings
warnings.filterwarnings("ignore")

In [2]:
score_df = pd.read_csv('rate_df.csv') #user/scores
det_df = pd.read_csv('det_df.csv') #anime/rating/genres

In [3]:
det_df = det_df[det_df['Rating'] != 'None']

In [4]:
# test_csv = pd.read_csv('test_csv.csv')
# test_csv['Score'] = test_csv['Score'].map(int)

In [5]:
# df_list = [score_df, test_csv]
# score_df = pd.concat(df_list)

In [121]:
# score_df.to_pickle('score_df.pkl')
# det_df.to_pickle('det_df.pkl')

### Cleaning

In [4]:
# creates AnimeID for each anime
# score_df
grouped_name = score_df.groupby('Anime')

temp_df = grouped_name.count()
temp_df_idx = pd.DataFrame(temp_df.index)

temp_df_idx['AnimeID'] = temp_df_idx.index
dict_df=temp_df_idx[['AnimeID','Anime']]

desc_dict = dict_df.set_index('Anime').to_dict()
new_dict = desc_dict['AnimeID']

score_df['AnimeID'] = score_df['Anime'].map(new_dict)

# det_df
grouped_name = det_df.groupby('Anime')

temp_df = grouped_name.count()
temp_df_idx = pd.DataFrame(temp_df.index)

temp_df_idx['AnimeID'] = temp_df_idx.index
dict_df=temp_df_idx[['AnimeID','Anime']]

desc_dict = dict_df.set_index('Anime').to_dict()
new_dict = desc_dict['AnimeID']

det_df['AnimeID'] = det_df['Anime'].map(new_dict)

# create UserID for each user
grouped_user = score_df.groupby('User')

temp_df_user = grouped_user.count()
temp_df_user_idx = pd.DataFrame(temp_df_user.index)

temp_df_user_idx['UserID']=temp_df_user_idx.index
dict_df_user=temp_df_user_idx[['UserID','User']] 

desc_dict_user = dict_df_user.set_index('User').to_dict()
new_dict_user = desc_dict_user['UserID']

score_df['UserID'] = score_df['User'].map(new_dict_user)

In [5]:
# remove shows/users with low counts
min_anime_ratings = 2
min_user_ratings =  2

clean_ani_ratings = score_df.groupby('AnimeID').filter(lambda x: x['AnimeID'].count() >= min_anime_ratings)
test_df = clean_ani_ratings.groupby('UserID').filter(lambda x: x['UserID'].count() >= min_user_ratings)

"{0} shows deleted. Old dimensions: {1}; New dimensions: {2}"\
.format(len(score_df['AnimeID'].value_counts()) - len(test_df['AnimeID'].value_counts())\
        ,score_df.shape, test_df.shape )


'4 shows deleted. Old dimensions: (4401, 5); New dimensions: (2006, 5)'

In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2006 entries, 1 to 4399
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Anime    2006 non-null   object
 1   User     2006 non-null   object
 2   Score    2006 non-null   int64 
 3   AnimeID  2006 non-null   int64 
 4   UserID   2006 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 94.0+ KB


### Model

In [7]:
reader = Reader(rating_scale=(test_df['Score'].min(), test_df['Score'].max()))
data = Dataset.load_from_df(test_df[["UserID", "AnimeID", "Score"]], reader=reader)

trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

In [None]:
# return best param from list of searched parameters
t_list = []
for i in range(1,101,1):
    t_list.append(i)

n_list = []    
    
for i in range(0,11):
    param_grid = {'n_factors': t_list}
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
    gs.fit(data)

    n_list.append(gs.best_params['rmse'])

t2 = []

for sub in n_list:
    for key in sub:
        t2.append(int(sub[key]))

t2 = list(set(sorted(t2)))
print(t2, '\n')

param_grid = {'n_factors': t2}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

print(gs.best_params['rmse'])

In [8]:
algo_SVD = SVD(n_factors = 5, random_state=4444)
algo_SVD.fit(trainset)

testset = trainset.build_anti_testset()

predictions = algo_SVD.test(testset)

predictions[0:10]

[Prediction(uid=155, iid=84, r_ui=7.9411764705882355, est=8.348596735891386, details={'was_impossible': False}),
 Prediction(uid=155, iid=58, r_ui=7.9411764705882355, est=8.999081057674939, details={'was_impossible': False}),
 Prediction(uid=155, iid=46, r_ui=7.9411764705882355, est=8.13063806626297, details={'was_impossible': False}),
 Prediction(uid=155, iid=50, r_ui=7.9411764705882355, est=8.633872457895134, details={'was_impossible': False}),
 Prediction(uid=155, iid=205, r_ui=7.9411764705882355, est=8.76882539737906, details={'was_impossible': False}),
 Prediction(uid=155, iid=51, r_ui=7.9411764705882355, est=8.655350418150467, details={'was_impossible': False}),
 Prediction(uid=155, iid=1, r_ui=7.9411764705882355, est=8.523988247615378, details={'was_impossible': False}),
 Prediction(uid=155, iid=104, r_ui=7.9411764705882355, est=7.161381855954086, details={'was_impossible': False}),
 Prediction(uid=155, iid=113, r_ui=7.9411764705882355, est=7.484476979266376, details={'was_impos

In [9]:
# accuracy metric, lower is better!
accuracy.rmse(predictions)

RMSE: 0.6517


0.651661527711851

In [10]:
# borrowed from https://www.jiristodulka.com/post/recsys_cf/ 
def get_top_n(predictions, userId, anime_df, ratings_df, n = 10):
    # map the predictions to each user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # sort the predictions for each user and retrieve the k highest ones
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_n[uid] = user_ratings[: n ]
    
    # returns how many movies the user has already rated
    user_data = ratings_df[ratings_df['UserID'] == (userId)]
    print('User {0} has already rated {1} shows/movies.'.format(userId, user_data.shape[0]))

    # DataFrame with predictions
    preds_df = pd.DataFrame([(id, pair[0],pair[1]) for id, row in top_n.items() for pair in row],
                        columns=['UserID' , 'AnimeID', 'PredScore'])
    
    # return top N recommended anime with (merged) titles and genres 
    pred_usr = preds_df[preds_df['UserID'] == (userId)].merge(anime_df, how = 'left', left_on = 'AnimeID', right_on = 'AnimeID')
            
    # return top N historically rated anime with (merged) titles and genres for holistic evaluation
    hist_usr = ratings_df[ratings_df['UserID'] == (userId) ].sort_values('Score', ascending = False).merge\
    (anime_df, how = 'left', left_on = 'AnimeID', right_on = 'AnimeID')
    
    return hist_usr, pred_usr

In [17]:
hist_SVD_837, pred_SVD_837 = get_top_n(predictions, anime_df = det_df, userId = 349, ratings_df = test_df)

User 349 has already rated 23 shows/movies.


In [18]:
hist_SVD_837

Unnamed: 0,Anime_x,User,Score,AnimeID,UserID,Anime_y,Rating,Genres
0,Shouwa Genroku Rakugo Shinjuu,CodeBlazeFate,10,211,349,Shouwa Genroku Rakugo Shinjuu: Sukeroku Futata...,PG-13 - Teens 13 or older,"['Drama', 'Historical', 'Josei']"
1,Neon Genesis Evangelion: The End of Evangelion,CodeBlazeFate,10,165,349,Nichijou,PG-13 - Teens 13 or older,"['Slice of Life', 'Comedy', 'School', 'Shounen']"
2,Mobile Suit Gundam: The Origin,CodeBlazeFate,10,142,349,Monogatari Series: Second Season,R - 17+ (violence & profanity),"['Mystery', 'Comedy', 'Supernatural', 'Romance..."
3,Sennen Joyuu,CodeBlazeFate,10,199,349,Shelter,G - All Ages,"['Sci-Fi', 'Music']"
4,Shouwa Genroku Rakugo Shinjuu: Sukeroku Futata...,CodeBlazeFate,10,212,349,Slam Dunk,PG-13 - Teens 13 or older,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun..."
5,Tokyo Godfathers,CodeBlazeFate,9,227,349,Tonari no Totoro,G - All Ages,"['Adventure', 'Comedy', 'Supernatural']"
6,Ping Pong the Animation,CodeBlazeFate,9,179,349,Planetes,PG-13 - Teens 13 or older,"['Drama', 'Romance', 'Sci-Fi', 'Seinen', 'Space']"
7,Kenpuu Denki Berserk,CodeBlazeFate,9,102,349,Kimetsu no Yaiba,R - 17+ (violence & profanity),"['Action', 'Demons', 'Historical', 'Shounen', ..."
8,Kaguya-sama wa Kokurasetai: Tensai-tachi no Re...,CodeBlazeFate,9,92,349,Kaguya-sama wa Kokurasetai?: Tensai-tachi no R...,PG-13 - Teens 13 or older,"['Comedy', 'Psychological', 'Romance', 'School..."
9,Redline,CodeBlazeFate,9,186,349,Romeo no Aoi Sora,PG-13 - Teens 13 or older,"['Adventure', 'Drama', 'Historical', 'Slice of..."


In [19]:
pred_SVD_837

Unnamed: 0,UserID,AnimeID,PredScore,Anime,Rating,Genres
0,349,189,8.763629,SKET Dance,PG-13 - Teens 13 or older,"['Comedy', 'School', 'Shounen']"
1,349,145,8.682854,Monster,R+ - Mild Nudity,"['Drama', 'Horror', 'Mystery', 'Police', 'Psyc..."
2,349,4,8.589087,Aria the Origination,G - All Ages,"['Sci-Fi', 'Slice of Life', 'Fantasy', 'Shounen']"
3,349,129,8.581745,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,PG-13 - Teens 13 or older,"['Mystery', 'Psychological', 'Drama', 'Magic',..."
4,349,108,8.480882,Kiseijuu: Sei no Kakuritsu,R - 17+ (violence & profanity),"['Action', 'Sci-Fi', 'Horror', 'Psychological'..."
5,349,6,8.465558,Ashita no Joe 2,PG-13 - Teens 13 or older,"['Action', 'Drama', 'Shounen', 'Slice of Life'..."
6,349,58,8.460181,Gintama°,PG-13 - Teens 13 or older,"['Action', 'Comedy', 'Historical', 'Parody', '..."
7,349,74,8.422486,Hajime no Ippo: Rising,PG-13 - Teens 13 or older,"['Comedy', 'Sports', 'Drama', 'Shounen']"
8,349,69,8.403109,Haikyuu!!: Karasuno Koukou vs. Shiratorizawa G...,PG-13 - Teens 13 or older,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun..."
9,349,41,8.390011,Fate/stay night Movie: Heaven's Feel - III. Sp...,R - 17+ (violence & profanity),"['Action', 'Supernatural', 'Magic', 'Fantasy']"


### User UI

In [69]:
from ast import literal_eval

g = []

for gn in det_df['Genres']:
    g.append(literal_eval(gn))

for g_list in g:
    tg += g_list

# sorted(list(set(tg))) 

In [120]:
r = []

for rating in det_df['Rating']:
    r.append(rating)

sorted(list(set(r)))

['G - All Ages',
 'PG - Children',
 'PG-13 - Teens 13 or older',
 'R - 17+ (violence & profanity)',
 'R+ - Mild Nudity']

In [114]:
test_rt = ['G - All Ages', 'R+ - Mild Nudity']
test_in = []

for index, rating in enumerate(pred_SVD_837['Rating']):
    if rating in test_rt:
        print(rating)
        print(index)
        test_in.append(index)

R+ - Mild Nudity
1
G - All Ages
6


In [128]:
test_li = ['Action', 'Adventure']
test_in2 = []

for index, gn in enumerate(pred_SVD_837['Genres']):
    #for gt in test_li:
    if all(x in literal_eval(gn) for x in test_li):
        #if gt in literal_eval(gn):
        print(gn)
        print(index)
        test_in2.append(index)
            
for num in test_in2:
    print(pred_SVD_837.iloc[num].Anime)

['Action', 'Adventure', 'Demons', 'Drama', 'Fantasy', 'Horror', 'Military', 'Romance', 'Seinen', 'Supernatural']
1
['Action', 'Adventure', 'Fantasy']
2
Kenpuu Denki Berserk
Mononoke Hime


In [116]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3

final_list = intersection(test_in, test_in2)

for num in final_list:
    print(pred_SVD_837.iloc[num].Anime)

Kenpuu Denki Berserk
