In [27]:
import pandas as pd
import ast
import itertools
import numpy as np
from surprise import SVD, KNNBasic, NMF
from surprise.model_selection import cross_validate
from surprise import Dataset
from surprise import Reader
from collections import defaultdict





In [3]:
# Source files for the anime catalogue and the user profiles.
animes_path = 'data/animes.csv'
profiles_path = 'data/profiles.csv'

animes = pd.read_csv(animes_path)
profiles = pd.read_csv(profiles_path)


### Part 1: Naive exploration

In [4]:
# Preview the first rows of the anime catalogue (uid, title, genre, score, ...).
animes.head()

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


In [5]:
# Preview the profiles; `favorites_anime` holds a stringified list of anime ids.
profiles.head()

Unnamed: 0,profile,gender,birthday,favorites_anime,link
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans
2,skrn,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn
3,edgewalker00,Male,Sep 5,"['5680', '849', '2904', '3588', '37349']",https://myanimelist.net/profile/edgewalker00
4,aManOfCulture99,Male,"Oct 30, 1999","['4181', '7791', '9617', '5680', '2167', '4382...",https://myanimelist.net/profile/aManOfCulture99


In [6]:
# Summary statistics of the profile columns (counts, distinct values, most frequent value).
profiles.describe()

Unnamed: 0,profile,gender,birthday,favorites_anime,link
count,81727,53856,46807,81727,81727
unique,47885,3,7708,35395,47885
top,Voivodian,Male,1995,[],https://myanimelist.net/profile/Voivodian
freq,6,37096,211,16602,6


In [7]:
# Titles the recommendations are seeded with.
anime_list = ['Fullmetal Alchemist: Brotherhood','Haikyuu!! Second Season']

# Look up the distinct uids of those titles and stringify them, because the
# ids inside `favorites_anime` are stored as strings.
title_matches = animes['title'].isin(anime_list)
anime_ids = [str(anime_uid) for anime_uid in set(animes.loc[title_matches, 'uid'])]
anime_ids

['5114', '28891']

In [8]:
def safe_literal_eval(node):
    """Parse a stringified Python literal, falling back to a sentinel on failure.

    Args:
        node: Usually the string form of a list (e.g. "['1', '2']").

    Returns:
        The parsed literal, or ``['nope']`` when ``node`` cannot be parsed.
        The sentinel is a one-element list so callers that expect a list of
        ids keep working.
    """
    try:
        return ast.literal_eval(node)
    except (ValueError, SyntaxError, TypeError):
        # ValueError: not a literal (e.g. NaN from a missing cell).
        # SyntaxError: malformed text such as an unterminated list.
        # TypeError: input of an unsupported type.
        return ['nope']

In [9]:
# Fresh accumulators for the similarity scoring done in the next cell.
scores, similars = [], []

# Keep only the profiles whose favourites string is not the empty-list literal.
has_favorites = profiles['favorites_anime'].ne('[]')
profiles = profiles.loc[has_favorites]
profiles.head()

Unnamed: 0,profile,gender,birthday,favorites_anime,link
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans
2,skrn,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn
3,edgewalker00,Male,Sep 5,"['5680', '849', '2904', '3588', '37349']",https://myanimelist.net/profile/edgewalker00
4,aManOfCulture99,Male,"Oct 30, 1999","['4181', '7791', '9617', '5680', '2167', '4382...",https://myanimelist.net/profile/aManOfCulture99


In [10]:
# Parse each stringified favourites list into a real Python list.
profiles_list = profiles['favorites_anime'].apply(safe_literal_eval)

# Reset the accumulators in this cell so that re-running it does not append
# a second copy of every row to lists created elsewhere.
scores = []
similars = []
target_ids = set(anime_ids)  # hoisted: does not change inside the loop

for item in profiles_list:
    # Overlap between this profile's favourites and the seed titles,
    # normalised by the longer of the two lists.
    shared = target_ids.intersection(item)
    denominator = max(len(item), len(anime_ids))
    # Guard against 0/0 when both the profile and the seed list are empty.
    scores.append(len(shared) / denominator if denominator else 0.0)
    similars.append(item)

df = pd.DataFrame(list(zip(scores, similars)), columns=['Score', 'Anime'])

In [11]:
# Rank profiles by overlap score and record how many favourites each one has.
# NOTE(review): 'Anime' holds lists while `anime_ids` holds strings, so the
# isin() mask presumably never matches -- confirm the intended exclusion.
df_reco = (
    df.loc[~df['Anime'].isin(anime_ids)]
    .sort_values(by='Score', ascending=False)
    .assign(len=lambda ranked: ranked['Anime'].apply(len))
)
# Drop single-favourite profiles, then keep the 20 best-scoring ones.
df_reco = df_reco.loc[df_reco['len'] != 1].head(20)
df_reco

Unnamed: 0,Score,Anime,len
52728,0.666667,"[28735, 5114, 28891]",3
8452,0.5,"[5114, 18679]",2
30935,0.5,"[1575, 5114]",2
51768,0.5,"[9756, 5114]",2
29161,0.5,"[28891, 35075]",2
33626,0.5,"[270, 5114]",2
5082,0.5,"[5114, 37779]",2
57961,0.5,"[5114, 21]",2
61570,0.5,"[5114, 853]",2
36535,0.5,"[5114, 9330]",2


In [12]:
# Flatten the favourites of the selected profiles into distinct ids,
# then convert them to integers to match the dtype of animes['uid'].
reco_ids = list(set(itertools.chain.from_iterable(df_reco['Anime'])))
reco_ids = [int(anime_id) for anime_id in reco_ids]

In [13]:
# Translate the recommended ids back to titles and drop the seed titles.
is_recommended = animes['uid'].isin(reco_ids)
reco_names = animes.loc[is_recommended, 'title'].unique()
list(set(reco_names).difference(anime_list))

['Fullmetal Alchemist',
 'Kill la Kill',
 'Monster',
 'Yakusoku no Neverland',
 'Hoozuki no Reitetsu 2nd Season',
 'Dragon Crisis!',
 'Sen to Chihiro no Kamikakushi',
 'Hellsing',
 'Shouwa Genroku Rakugo Shinjuu',
 'Steins;Gate',
 'Mahou Shoujo Madoka★Magica',
 'Death Note',
 'Ouran Koukou Host Club',
 'Fairy Tail: Final Series',
 'Gintama',
 'Magic Kaito 1412',
 'Code Geass: Hangyaku no Lelouch',
 'One Piece',
 'Berserk']

In [14]:
def get_recos(anime_list, n=20):
    """Recommend titles taken from profiles whose favourites overlap ``anime_list``.

    Every profile with a non-empty favourites list is scored by
    ``len(favourites & seeds) / max(len(favourites), len(seeds))``. The
    favourites of the ``n`` best-scoring profiles are pooled and mapped back
    to titles.

    Args:
        anime_list: Titles (as found in ``animes['title']``) to seed with.
        n: Number of top-scoring profiles whose favourites are pooled.

    Returns:
        A list of recommended titles, excluding those in ``anime_list``.
        Order is not meaningful (it comes from a set).
    """
    # Ids inside `favorites_anime` are strings, so stringify the seed uids.
    anime_ids = list(set(animes.loc[animes['title'].isin(anime_list), 'uid']))
    anime_ids = list(map(str, anime_ids))
    target_ids = set(anime_ids)

    # Parse the *filtered* frame. Previously the filtered frame was built but
    # the unfiltered global was parsed, which only worked when the caller had
    # already filtered `profiles`.
    profiles_df = profiles[profiles['favorites_anime'] != '[]']
    profiles_list = profiles_df['favorites_anime'].apply(safe_literal_eval)

    scores = []
    similars = []
    for item in profiles_list:
        shared = target_ids.intersection(item)
        denominator = max(len(item), len(anime_ids))
        # Guard against 0/0 (no seed title matched and an empty favourites list).
        scores.append(len(shared) / denominator if denominator else 0.0)
        similars.append(item)

    df = pd.DataFrame(list(zip(scores, similars)), columns=['Score', 'Anime'])
    # NOTE(review): 'Anime' holds lists while `anime_ids` holds strings, so
    # this mask presumably never matches -- confirm the intended exclusion.
    df_reco = df[~df['Anime'].isin(anime_ids)].sort_values(by='Score', ascending=False)
    df_reco['len'] = df_reco['Anime'].apply(len)
    # Dropping one-element lists also removes the ['nope'] parse-failure
    # sentinel, which would otherwise break the int() conversion below.
    df_reco = df_reco[df_reco['len'] != 1].head(n)

    reco_ids = set(itertools.chain.from_iterable(df_reco['Anime']))
    reco_ids_list = [int(anime_id) for anime_id in reco_ids]
    reco_names = animes.loc[animes['uid'].isin(reco_ids_list), 'title'].unique()
    return list(set(reco_names).difference(anime_list))

    
    
    

In [15]:
# Same recommendations as the step-by-step cells above, now through the function.
get_recos(anime_list)

['Fullmetal Alchemist',
 'Kill la Kill',
 'Monster',
 'Yakusoku no Neverland',
 'Hoozuki no Reitetsu 2nd Season',
 'Dragon Crisis!',
 'Sen to Chihiro no Kamikakushi',
 'Hellsing',
 'Shouwa Genroku Rakugo Shinjuu',
 'Steins;Gate',
 'Mahou Shoujo Madoka★Magica',
 'Death Note',
 'Ouran Koukou Host Club',
 'Fairy Tail: Final Series',
 'Gintama',
 'Magic Kaito 1412',
 'Code Geass: Hangyaku no Lelouch',
 'One Piece',
 'Berserk']

### Part 2: Let's get real: a recommender system with SVD

In [31]:
# Load only the `uid`, `anime_uid` and `score` columns of the reviews.
# NOTE(review): `uid` is treated as the user identifier further down; confirm
# it is not a per-review id (the commented-out pivot below indexes on 'profile').
reviews = pd.read_csv('data/reviews.csv',usecols=['uid','anime_uid','score'])
# NOTE(review): this is not the cell's last expression, so the preview is not displayed.
reviews.head(10)
# Save the slimmed-down table; with pandas' defaults the index is written as an extra column.
reviews.to_csv('data/reviews_light.csv')


In [17]:
#table = pd.pivot_table(reviews, values='score', index='profile', columns=['anime_uid'], aggfunc=np.mean)

In [18]:

# Keep anime that received more than `min_anime_ratings` reviews.
min_anime_ratings = 500
anime_review_counts = reviews['anime_uid'].value_counts()
filter_anime = anime_review_counts[anime_review_counts > min_anime_ratings].index.tolist()

# Keep `uid` values that occur more than `min_user_ratings` times.
min_user_ratings = 1
uid_review_counts = reviews['uid'].value_counts()
filter_users = uid_review_counts[uid_review_counts > min_user_ratings].index.tolist()

# A review survives only if both its anime and its uid pass the thresholds.
anime_ok = reviews['anime_uid'].isin(filter_anime)
uid_ok = reviews['uid'].isin(filter_users)
df = reviews[anime_ok & uid_ok]
df.head()



Unnamed: 0,uid,anime_uid,score
1,259117,34599,10
3,8254,2904,9
4,291149,4181,10
5,10046,2904,10
7,140903,2904,8


#### Now, adding a new set of reviews

In [19]:
# Create a synthetic user (an id one past the largest existing uid) who
# rates each of the chosen titles 10/10.
new_uid = max(df['uid']) + 1
anime_list = ['Fullmetal Alchemist', 'Gintama', 'Death Note']
new_anime_ids = animes.loc[animes['title'].isin(anime_list), 'uid'].unique()

# One row per uid actually matched. The hard-coded `*3` combined with zip()
# would silently truncate when the number of matched ids differs from 3.
new_list = [(new_uid, anime_id, 10) for anime_id in new_anime_ids]
new_df = pd.DataFrame(new_list, columns=['uid', 'anime_uid', 'score'])

# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat with
# ignore_index=True is the equivalent of append(...).reset_index(drop=True).
df_reviews = pd.concat([df, new_df], ignore_index=True)
df_reviews['score'] = df_reviews['score'].astype('int64')

df_reviews

  df_reviews = df.append(new_df).reset_index(drop=True)


Unnamed: 0,uid,anime_uid,score
0,259117,34599,10
1,8254,2904,9
2,291149,4181,10
3,10046,2904,10
4,140903,2904,8
...,...,...,...
38424,93509,10620,9
38425,50225,10620,10
38426,325671,918,10
38427,325671,121,10


#### Surprise SVD

In [20]:
# Columns handed to Surprise: first id, second id, rating.
rating_columns = ['uid', 'anime_uid', 'score']

# Scores are declared to lie on a 0-10 scale.
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(df_reviews[rating_columns], reader)

In [30]:
# Use the famous SVD algorithm.
algo = SVD(random_state=42)

# Run k-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2056  1.1529  1.1322  1.1636  0.0309  
MAE (testset)     0.7159  0.6923  0.6829  0.6971  0.0139  
Fit time          2.25    2.30    2.29    2.28    0.02    
Test time         0.12    0.41    0.12    0.22    0.14    


{'test_rmse': array([1.20558725, 1.15291643, 1.13220474]),
 'test_mae': array([0.7159178 , 0.69234776, 0.68293996]),
 'fit_time': (2.2481396198272705, 2.301584005355835, 2.287795066833496),
 'test_time': (0.11598873138427734, 0.40784192085266113, 0.1217341423034668)}

In [25]:
# Retrieve the trainset.
trainset = data.build_full_trainset()

# Build an algorithm, and train it.
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2413a8a89d0>

In [26]:
# Use the synthetic user created earlier rather than a hard-coded copy of its
# id, which would silently go stale if the underlying reviews change.
uid = int(new_uid)  # raw user id (as in the ratings frame).
iid = 5514  # raw item id (as in the ratings frame).

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, verbose=True)

user: 325671     item: 5514       r_ui = None   est = 8.63   {'was_impossible': False}


In [27]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` best estimates.

    Args:
        predictions: Iterable of 5-item records
            ``(user id, item id, true rating, estimate, details)``, as
            produced by an algorithm's ``test`` method.
        n (int): How many recommendations to keep per user. Default is 10.

    Returns:
        A ``defaultdict(list)`` mapping each raw user id to a list of
        ``(raw item id, estimate)`` tuples, sorted by decreasing estimate
        and truncated to ``n`` entries.
    """

    def _estimate_of(pair):
        # Sort key: the estimated rating of an (item id, estimate) pair.
        return pair[1]

    # Collect every (item, estimate) pair under its user.
    per_user = defaultdict(list)
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        per_user[user_id].append((item_id, estimate))

    # Rank each user's pairs from best to worst and keep the first n.
    for user_id in list(per_user):
        ranked = sorted(per_user[user_id], key=_estimate_of, reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user

In [28]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

top_n[uid]

[(5114, 9.794065749332422),
 (4181, 9.610971564867008),
 (2904, 9.575575633901755),
 (4224, 9.548404501462818),
 (853, 9.475179597029047),
 (2001, 9.332090604909556),
 (1210, 9.31207079499246),
 (21, 9.25441174218581),
 (30276, 9.162919067019745),
 (5081, 9.086108146686389)]

In [30]:
# Print the recommended items for each user
anime_ids = []
scores = []
for item in top_n[uid]:
    anime_ids.append(item[0])
    scores.append(item[1])
    df = pd.DataFrame(list(zip(list(anime_ids),scores)),columns=['anime_id','score'])

In [31]:
# Display the recommendation table (anime id and estimated score).
df

Unnamed: 0,anime_id,score
0,5114,9.794066
1,4181,9.610972
2,2904,9.575576
3,4224,9.548405
4,853,9.47518
5,2001,9.332091
6,1210,9.312071
7,21,9.254412
8,30276,9.162919
9,5081,9.086108
