In [1]:
import pandas as pd
import re
import numpy as np
import scipy as sp
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import gc
import operator
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the anime catalogue and the raw per-user ratings from the local data folder.
anime, user = (
    pd.read_csv(csv_path)
    for csv_path in ('data/finalanime.csv', 'data/rating.csv')
)

In [3]:
# Spot-check one catalogue row, selected by exact title.
anime.loc[anime['name'].eq('One Piece')]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,genre1,count
5863,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,35.977578,8.58,504862,Action,2768


In [4]:
def text_cleaning(text):
    """Remove HTML-entity residue and the ``.hack//`` prefix from a title.

    Parameters
    ----------
    text : str
        Raw title as stored in the catalogue.

    Returns
    -------
    str
        The title with the substitutions below applied, in order.

    Notes
    -----
    All patterns are matched literally. In particular the leading dot of
    ``.hack//`` is a real dot, not "any character".
    """
    replacements = (
        # Context-sensitive apostrophe rules must run BEFORE the generic
        # '&#039;' removal; once that has run they can never match.
        ('A&#039;s', ''),      # drops the whole "A's" token, as the rule is written
        ('I&#039;', 'I\''),    # keep the apostrophe in "I'm", "I'll", ...
        ('&#039;', ''),        # every other encoded apostrophe is removed
        ('&quot;', ''),
        ('.hack//', ''),
        ('&amp;', 'and'),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    return text

# Clean the titles once, so every later lookup and the pivot's column labels
# use the same spelling.
anime['name'] = anime['name'].apply(text_cleaning)

# Map the -1 sentinel to 0 with a vectorised replace (no per-row Python call).
# NOTE(review): -1 presumably means "watched but not rated" - confirm against
# the dataset description. These zeros are averaged as real ratings by the
# pivot_table step further down.
user['rating'] = user['rating'].replace(-1, 0)

# Work on a subset of users (ids below 20000), presumably to keep the
# user x title matrix within memory, then shrink the integer dtypes.
user_sub = user[user['user_id'] < 20000]
user_sub = user_sub.apply(pd.to_numeric, downcast='integer')
user_sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2065572 entries, 0 to 2065571
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int16
 1   anime_id  int32
 2   rating    int8 
dtypes: int16(1), int32(1), int8(1)
memory usage: 29.5 MB


In [5]:
# Force a full garbage-collection pass (presumably to release memory before the
# merge below); the displayed number is the count of unreachable objects found.
gc.collect()

24

In [6]:
# Attach catalogue metadata to every rating row. Both frames have a `rating`
# column: the user's copy gets the '_user' suffix and is then renamed to
# `user_rating`, while the catalogue's average keeps the plain name `rating`.
merged = (
    user_sub
    .merge(anime, on='anime_id', suffixes=['_user', ''])
    .rename(columns={'rating_user': 'user_rating'})
)

In [7]:
# Inspect column dtypes and the memory footprint of the joined frame.
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2065522 entries, 0 to 2065521
Data columns (total 11 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int16  
 1   anime_id     int32  
 2   user_rating  int8   
 3   name         object 
 4   genre        object 
 5   type         object 
 6   episodes     float64
 7   rating       float64
 8   members      int64  
 9   genre1       object 
 10  count        int64  
dtypes: float64(2), int16(1), int32(1), int64(2), int8(1), object(4)
memory usage: 155.6+ MB


In [8]:
# Force a full garbage-collection pass (presumably to release memory before the
# pivot below); the displayed number is the count of unreachable objects found.
gc.collect()

20

In [9]:
# User x title matrix. Each cell is pivot_table's default aggregate (the mean)
# of `user_rating`; user/title pairs without a rating are NaN.
# The columns are then downcast to the smallest float dtype.
piv = (
    merged
    .pivot_table(values='user_rating', index=['user_id'], columns=['name'])
    .apply(pd.to_numeric, downcast='float')
)
piv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19999 entries, 1 to 19999
Columns: 9243 entries, 0 to ◯
dtypes: float32(9243)
memory usage: 705.3 MB


In [10]:
# Each user's ratings are centred on that user's mean and scaled by the user's
# rating range (max - min). A user with a single rating, or who rated
# everything the same, has a zero range; the row becomes all NaN and is
# dropped further down once NaN has been replaced by 0.


def _center_and_scale(user_row):
    """Return one user's ratings mean-centred and divided by their range."""
    spread = np.max(user_row) - np.min(user_row)
    return (user_row - np.mean(user_row)) / spread


# Normalise per user, treat "no rating" as 0, then flip to title x user.
piv_norm = piv.apply(_center_and_scale, axis=1).fillna(0).T

# After the transpose users are columns: keep only users with at least one
# non-zero value.
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

In [11]:
# Force a full garbage-collection pass (presumably to release memory before the
# similarity computation); the displayed number is the count of unreachable
# objects found.
gc.collect()

38

In [12]:
# Store the title x user matrix in CSR form: it is mostly zeros after fillna(0).
piv_sparse = sp.sparse.csr_matrix(piv_norm.to_numpy())

In [13]:
# Rows of piv_sparse are titles and columns are users, so the matrix as-is
# gives title-to-title similarity and its transpose gives user-to-user
# similarity. Both results are dense arrays.
item_similarity = cosine_similarity(piv_sparse, dense_output=True)
user_similarity = cosine_similarity(piv_sparse.transpose(), dense_output=True)

In [14]:
# Label the square similarity arrays: titles are piv_norm's index, users are
# its columns.
item_sim_df = pd.DataFrame(
    data=item_similarity,
    index=piv_norm.index,
    columns=piv_norm.index,
)
user_sim_df = pd.DataFrame(
    data=user_similarity,
    index=piv_norm.columns,
    columns=piv_norm.columns,
)

In [15]:
def top_animes(anime_name, n=10):
    """Print the ``n`` titles most similar to ``anime_name``.

    Similarities are read from the module-level ``item_sim_df`` (a square
    title x title frame).

    Parameters
    ----------
    anime_name : str
        Exact title; must be a column of ``item_sim_df``.
    n : int, default 10
        Number of recommendations to print.

    Returns
    -------
    None
        The ranking is printed, not returned.

    Raises
    ------
    KeyError
        If ``anime_name`` has no column in ``item_sim_df``.
    """
    if anime_name not in item_sim_df.columns:
        raise KeyError('{!r} is not in the similarity matrix'.format(anime_name))

    # Remove the title itself by label instead of assuming it sorts first:
    # a title with zero self-similarity, or one tied with others, is not
    # guaranteed to be at position 0.
    scores = item_sim_df[anime_name].drop(labels=anime_name, errors='ignore')

    print('If you like {}, you may also like:\n'.format(anime_name))
    # Rank only this one column rather than sorting the whole square frame.
    for rank, item in enumerate(scores.nlargest(n).index, start=1):
        print('No. {}: {}'.format(rank, item))

In [16]:
def top_animes1():
    """Prompt for part of a title and print recommendations for the best match.

    The typed text is matched case-insensitively as a literal substring of
    ``anime['name']``. Among the matches that have a row in ``item_sim_df``,
    the one with the most ``members`` is chosen and passed to ``top_animes``.

    Returns
    -------
    None
        Output is printed. If nothing matches, a short message is printed
        instead of raising.
    """
    query = input('Enter Anime Name:')

    # regex=False: titles and user input routinely contain '(', ')', '.' and
    # similar characters, which must not be interpreted as a pattern.
    matches = anime[anime['name'].str.contains(query, case=False, regex=False)]
    # Only titles present in the similarity matrix can be looked up.
    matches = matches[matches['name'].isin(item_sim_df.index)]

    if matches.empty:
        print('No anime found matching {!r}.'.format(query))
        return

    anime_name = matches.sort_values(by='members', ascending=False)['name'].iloc[0]
    top_animes(anime_name)

In [17]:
# Interactive prompt: the typed text is stored in `x` and used in the next cell
# as a case-insensitive search term against anime['name'].
x = input('Enter Anime Name:')

Enter Anime Name:shingeki


In [18]:
# Resolve the search term to one title: literal (non-regex), case-insensitive
# substring match, keeping the match with the most `members`.
matches = anime[anime['name'].str.contains(x, case=False, regex=False)]
if matches.empty:
    # Fail with a readable message instead of the bare `KeyError: 0` that
    # indexing an empty result would raise.
    raise ValueError('No anime title contains {!r}'.format(x))
anime_n = matches.sort_values(by='members', ascending=False)['name'].iloc[0]

In [19]:
# Print the recommendations for the title resolved in the previous cell.
top_animes(anime_n)

If you like Shingeki no Kyojin, you may also like:

No. 1: Fullmetal Alchemist: Brotherhood
No. 2: Death Note
No. 3: Steins;Gate
No. 4: Code Geass: Hangyaku no Lelouch R2
No. 5: Code Geass: Hangyaku no Lelouch
No. 6: Hunter x Hunter (2011)
No. 7: Psycho-Pass
No. 8: Sword Art Online
No. 9: Mirai Nikki (TV)
No. 10: Clannad: After Story


In [22]:
# Interactive variant: prompts for a title fragment, then prints recommendations.
top_animes1()

Enter Anime Name:naruto
If you like Naruto, you may also like:

No. 1: Bleach
No. 2: Dragon Ball
No. 3: Dragon Ball Z
No. 4: Fairy Tail
No. 5: Death Note
No. 6: Ao no Exorcist
No. 7: Fullmetal Alchemist
No. 8: D.Gray-man
No. 9: Soul Eater
No. 10: Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!
