In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator
%matplotlib inline

In [2]:
# Load the anime catalogue and the per-user ratings from their CSV files
anime = pd.read_csv('../datasets/anime.csv')
rating = pd.read_csv('../datasets/rating.csv')
anime.shape

(12294, 7)

In [3]:
# Preview the first rows of both tables
anime.head()
rating.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [4]:
# Treat the -1 sentinel in the `rating` column as a missing value (NaN).
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `rating.rating`: that chained in-place form
# can operate on a temporary Series (and does under pandas Copy-on-Write),
# which would leave the `rating` DataFrame unchanged.
rating['rating'] = rating['rating'].replace(-1, np.nan)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [5]:
# Keep only the catalogue entries whose type is 'TV'; the recommender below
# is built from this subset
anime_tv = anime.loc[anime['type'].eq('TV')]
anime_tv.head()
anime_tv.shape

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351


(3787, 7)

In [6]:
# Inner-join ratings with the TV catalogue on anime_id. Both tables carry a
# `rating` column: the user's one gets the `_user` suffix and is then renamed
# to `user_rating`, the catalogue's one keeps its name.
merged = (
    rating
    .merge(anime_tv, on='anime_id', suffixes=['_user', ''])
    .rename(columns={'rating_user': 'user_rating'})
)
merged.head()

Unnamed: 0,user_id,anime_id,user_rating,name,genre,type,episodes,rating,members
0,1,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,20,8.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
2,5,20,6.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,6,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
4,10,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297


In [7]:
# Only three columns are needed from here on
merged = merged.loc[:, ['user_id', 'name', 'user_rating']]
# Work on the users with id <= 10000 to keep the pivot table manageable
merged_sub = merged.loc[merged['user_id'] <= 10000]
merged_sub.head()

Unnamed: 0,user_id,name,user_rating
0,1,Naruto,
1,3,Naruto,8.0
2,5,Naruto,6.0
3,6,Naruto,
4,10,Naruto,


For collaborative filtering we need to create a pivot table with users along one axis and TV show names along the other. The pivot table lets us measure the similarity between users and between shows, which we then use to predict which shows a user will like.


In [8]:
# Users on the rows, show names on the columns, user ratings as the values
# (pivot_table averages the values if a user/show pair occurs more than once)
piv = merged_sub.pivot_table(
    values='user_rating',
    index='user_id',
    columns='name',
)
piv.shape
piv.head()

(9387, 2708)

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,2.0,
7,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Note: each user's ratings are mean-centred and divided by that user's rating
# range (max - min). A user whose ratings are all identical (including a user
# with a single rating) has a range of 0, so the whole row becomes NaN, is
# filled with 0 and is dropped by the non-zero filter below.

# Normalize the values per user. Vectorized row-wise arithmetic replaces a
# Python-level `apply` over every row; NaNs are skipped by mean/max/min.
user_mean = piv.mean(axis=1)
user_range = piv.max(axis=1) - piv.min(axis=1)
piv_norm = piv.sub(user_mean, axis=0).div(user_range, axis=0)

# Unrated entries become 0, then transpose so shows are rows and users columns
piv_norm = piv_norm.fillna(0).T
# Keep only the users (columns) with at least one non-zero normalized rating
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]
piv_norm.head()

user_id,3,5,7,8,10,11,12,14,16,17,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.519231,0.0,0.0,0.0


In [10]:
# Most entries of the normalized table are 0, so hand it to the similarity
# functions below as a SciPy CSR sparse matrix

piv_sparse = sp.sparse.csr_matrix(piv_norm.values)

These matrices hold the cosine similarity computed between every pair of shows (item/item) and between every pair of users (user/user).

In [11]:
# Rows of piv_sparse are shows, so this compares shows with each other ...
item_similarity = cosine_similarity(piv_sparse)
# ... and the transpose has users on the rows, giving user/user similarity
user_similarity = cosine_similarity(piv_sparse.transpose())

In [12]:
# Wrap the raw similarity arrays in DataFrames labelled by show name
# (item/item) and by user id (user/user) so they can be queried by label

item_sim_df = pd.DataFrame(
    data=item_similarity,
    index=piv_norm.index,
    columns=piv_norm.index,
)
user_sim_df = pd.DataFrame(
    data=user_similarity,
    index=piv_norm.columns,
    columns=piv_norm.columns,
)

In [13]:
# This function prints the shows with the highest similarity to a given show
# (the top 10 by default)

def top_animes(anime_name, n=10, sim_df=None):
    """Print the `n` shows most similar to `anime_name`.

    Parameters
    ----------
    anime_name : label
        Show name; must be a column of the similarity table.
    n : int, default 10
        How many similar shows to list.
    sim_df : pandas.DataFrame, optional
        Square show-by-show similarity table. Defaults to the notebook-level
        `item_sim_df`.

    Returns
    -------
    None
        The ranking is printed. If `anime_name` is not in the table a
        "No data available" message is printed instead.
    """
    if sim_df is None:
        sim_df = item_sim_df
    if anime_name not in sim_df.columns:
        print('No data available on show {}'.format(anime_name))
        return
    print('Simliar shows to {} include: \n'.format(anime_name))
    # Remove the show itself by label instead of assuming it sorts first:
    # another show with an identical rating pattern ties with it at 1.0.
    ranked = sim_df[anime_name].sort_values(ascending=False).drop(labels=anime_name)
    for count, item in enumerate(ranked.index[:n], start=1):
        print('No. {}: {}'.format(count, item))

In [41]:
# This function prints the users with the highest similarity to a given user
# (the top 10 by default)

def top_users(user, n=10, sim_df=None):
    """Print the `n` users most similar to `user`.

    Parameters
    ----------
    user : label
        User id; must be a column of the similarity table.
    n : int, default 10
        How many similar users to list.
    sim_df : pandas.DataFrame, optional
        Square user-by-user similarity table. Defaults to the notebook-level
        `user_sim_df`.

    Returns
    -------
    str or None
        A "No data available" message when `user` is unknown; otherwise None
        and the ranking is printed.
    """
    if sim_df is None:
        sim_df = user_sim_df
    if user not in sim_df.columns:
        return ('No data available on user {}'.format(user))
    print('Most similar users: \n')
    # Sort once and drop the user by label; relying on position 0 would be
    # wrong if another user ties with a similarity of 1.0.
    ranked = sim_df[user].sort_values(ascending=False).drop(labels=user)
    for other, sim in ranked.head(n).items():
        print('User #{0}, Similarity value: {1:.2f}'.format(other, sim))

In [44]:
# This function collects the highest rated shows of each of the most similar
# users and returns the show names along with how often each one appears

def similar_user_recs(user, sim_df=None, norm_df=None):
    """Return the shows most often top-rated by the users most similar to `user`.

    For each of the 10 most similar users, every show that holds that user's
    maximum normalized rating is collected; shows are then ranked by how many
    of those users they were collected from.

    Parameters
    ----------
    user : label
        User id; must be a column of `norm_df`.
    sim_df : pandas.DataFrame, optional
        Square user-by-user similarity table. Defaults to the notebook-level
        `user_sim_df`.
    norm_df : pandas.DataFrame, optional
        Normalized ratings with shows on the rows and users on the columns.
        Defaults to the notebook-level `piv_norm`.

    Returns
    -------
    list of (show, count) tuples
        At most 5 entries, highest count first. Empty when `user` is unknown.

    Notes
    -----
    Shows already rated by `user` are not filtered out of the result.
    """
    if sim_df is None:
        sim_df = user_sim_df
    if norm_df is None:
        norm_df = piv_norm
    if user not in norm_df.columns:
        print("No data available for user {}".format(user))
        # Stop here; continuing would raise a KeyError on the lookups below
        return []

    # The 10 users most similar to `user`, excluding `user` by label
    sim_users = sim_df[user].sort_values(ascending=False).drop(labels=user).index[:10]

    most_common = {}
    for neighbour in sim_users:
        scores = norm_df[neighbour]
        # All shows tied for this neighbour's highest normalized rating
        for show in scores[scores == scores.max()].index:
            most_common[show] = most_common.get(show, 0) + 1

    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:5]

In [49]:
# This function calculates the similarity-weighted average of the ratings
# given by similar users to predict a rating for an input user and show
def predicted_rating(anime_name, user, n_neighbors=999, sim_df=None, ratings_df=None):
    """Predict `user`'s rating of `anime_name` from the most similar users.

    The prediction is the average of the neighbours' raw ratings of the show,
    weighted by each neighbour's similarity to `user`. Neighbours who have not
    rated the show are ignored.

    Parameters
    ----------
    anime_name : label
        Show name; must be a column of `ratings_df`.
    user : label
        User id; must be a column of `sim_df`.
    n_neighbors : int, default 999
        Number of most similar users to consider (`user` is excluded).
    sim_df : pandas.DataFrame, optional
        Square user-by-user similarity table. Defaults to the notebook-level
        `user_sim_df`.
    ratings_df : pandas.DataFrame, optional
        Raw ratings with users on the rows and shows on the columns.
        Defaults to the notebook-level `piv`.

    Returns
    -------
    float
        The weighted average, or NaN when no neighbour rated the show or the
        weights sum to zero.
    """
    if sim_df is None:
        sim_df = user_sim_df
    if ratings_df is None:
        ratings_df = piv

    # Sort the similarity column once and exclude the user by label
    neighbours = (
        sim_df[user]
        .sort_values(ascending=False)
        .drop(labels=user)
        .head(n_neighbors)
    )
    # Ratings of the show by those neighbours, without the missing ones
    ratings = ratings_df.loc[neighbours.index, anime_name].dropna()
    weights = neighbours.loc[ratings.index]

    total_weight = weights.sum()
    if total_weight == 0:
        # Nothing to average; avoid dividing by zero
        return np.nan
    return (ratings * weights).sum() / total_weight

In [35]:
# Example: list the shows most similar to 'Cowboy Bebop'
top_animes('Cowboy Bebop')

Simliar shows to Cowboy Bebop include: 

No. 1: Samurai Champloo
No. 2: Tengen Toppa Gurren Lagann
No. 3: Baccano!
No. 4: Ghost in the Shell: Stand Alone Complex
No. 5: Neon Genesis Evangelion
No. 6: Ghost in the Shell: Stand Alone Complex 2nd GIG
No. 7: Fullmetal Alchemist: Brotherhood
No. 8: Steins;Gate
No. 9: Mushishi
No. 10: Great Teacher Onizuka


In [42]:
# Example: list the users most similar to user 3
top_users(3)

Most similar users: 

User #2986, Similarity value: 0.37
User #2411, Similarity value: 0.36
User #3681, Similarity value: 0.36
User #656, Similarity value: 0.35
User #298, Similarity value: 0.34
User #3028, Similarity value: 0.34
User #8436, Similarity value: 0.33
User #2038, Similarity value: 0.33
User #2374, Similarity value: 0.33
User #4233, Similarity value: 0.33


In [46]:
# Example: shows most often top-rated by the users most similar to user 3
similar_user_recs(3)

[('Boku dake ga Inai Machi', 4),
 ('Shingeki no Kyojin', 4),
 ('Steins;Gate', 4),
 ('Fullmetal Alchemist: Brotherhood', 4),
 ('Clannad: After Story', 3)]

In [50]:
# Example: predict the rating user 3 would give 'Cowboy Bebop'
predicted_rating('Cowboy Bebop', 3)

8.499797904818438

Below we'll see how `predicted_rating` performs compared to the ratings actually observed for user 3.

In [51]:
# Names of every show user 3 has a rating above 0 for
user3_ratings = piv.loc[3, :]
watched = user3_ratings[user3_ratings > 0].index.tolist()

In [54]:
# Squared difference between user 3's actual rating and the predicted rating,
# one entry per show in `watched`

errors = [
    (piv.loc[3, show] - predicted_rating(show, 3)) ** 2
    for show in watched
]

In [55]:
# Mean squared error (MSE) of the predicted ratings for user 3

np.mean(errors)

0.8848212679397977