In [1]:
# import data exploration libraries
import pandas as pnds
import numpy as nmpy

# import data visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# import regex
import re

# import scaler from scikit
from sklearn.preprocessing import StandardScaler

# import KNN algorithm
from sklearn.neighbors import NearestNeighbors

In [2]:
# use pandas .read_csv to read anime.csv and rating.csv
rating = pnds.read_csv('rating.csv')
anime = pnds.read_csv('anime.csv')

In [3]:
# use .head() to visualize first few rows
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [4]:
# use .head() to visualize first few rows
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
# check which anime have an unknown number of episodes
anime[anime['episodes']=='Unknown'].head(25)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
74,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,Unknown,8.58,504862
252,235,Detective Conan,"Adventure, Comedy, Mystery, Police, Shounen",TV,Unknown,8.25,114702
615,1735,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,Unknown,7.94,533578
991,966,Crayon Shin-chan,"Comedy, Ecchi, Kids, School, Shounen, Slice of...",TV,Unknown,7.73,26267
1021,33157,Tanaka-kun wa Itsumo Kedaruge Specials,"Comedy, School, Slice of Life",Special,Unknown,7.72,5400
1272,21639,Yu☆Gi☆Oh! Arc-V,"Action, Fantasy, Game, Shounen",TV,Unknown,7.61,17571
1309,8687,Doraemon (2005),"Comedy, Kids, Sci-Fi, Shounen",TV,Unknown,7.59,2980
1928,32410,Dimension W: W no Tobira Online,"Sci-Fi, Seinen",Special,Unknown,7.4,4799
1930,30694,Dragon Ball Super,"Action, Adventure, Comedy, Fantasy, Martial Ar...",TV,Unknown,7.4,111443
1993,32977,Aggressive Retsuko,"Comedy, Music",TV,Unknown,7.38,5465


In [6]:
# input number of episodes for anime with unknown number of episodes
known_episodes = {"One Piece":875, "Detective Conan":942, "Naruto: Shippuuden":500, "Crayon Shin-chan":972, 
                  "Tanaka-kun wa Itsumo Kedaruge Specials":12, "Yu☆Gi☆Oh! Arc-V":148,"Doraemon (2005)":559,
                  "Dimension W: W no Tobira Online":5,"Dragon Ball Super":131,"Aggressive Retsuko":11,
                  "Chibi Maruko-chan (1995)":1032,"Inazuma Eleven: Outer Code":6,"Pokemon Sun &amp; Moon":43,
                  "Youkai Watch":214,"Nobunaga no Shinobi":26,"Disney Tsum Tsum":39,"Sore Ike! Anpanman":68}

In [7]:
# input episode number to each anime in known_episodes
for i,j in known_episodes.items():
    anime.loc[anime["name"]==i,"episodes"] = j

In [8]:
# set episode number of hentai animes to 1
# set episode number of animes that are movies or OVAs to 1
anime.loc[(anime["genre"] == "Hentai") & (anime["episodes"] == "Unknown"),"episodes"] = "1"
anime.loc[((anime["type"] == "Movie") | (anime["type"] == "OVA")) & (anime["episodes"] == "Unknown"), "episodes"] = "1"

anime["episodes"] = anime["episodes"].map(lambda x:nmpy.nan if x=="Unknown" else x)
anime["episodes"].fillna(anime["episodes"].median(),inplace = True)

In [9]:
# change type to categorical values using .get_dummies
pnds.get_dummies(anime[["type"]]).head()

Unnamed: 0,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,1,0,0,0,0,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,0,0,0,1


In [10]:
# convert members and rating from string to float
anime["members"] = anime["members"].astype(float)
anime["rating"] = anime["rating"].astype(float)
anime["rating"].fillna(anime["rating"].median(),inplace = True)

In [23]:
# scaling
a_genre = anime["genre"].str.get_dummies(sep=",")
a_type = pnds.get_dummies(anime[["type"]])
a_rating = anime[["rating"]]
a_members = anime[["members"]]
a_episodes = anime[["episodes"]]


anime_feat = pnds.concat([a_genre, a_type, a_rating, a_members, a_episodes], axis=1)
anime["name"] = anime["name"].map(lambda name:re.sub('[^A-Za-z0-9]+', " ", name))
#a_members.head()
anime_feat.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Yaoi,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.37,200630.0,1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,9.26,793665.0,64
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.25,114262.0,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.17,673572.0,24
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.16,151266.0,51


In [12]:
# use scaler from StandardScaler
scaler = StandardScaler()
scaler.fit(anime_feat)


  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [13]:
nmpy.round(anime_feat,2)

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Yaoi,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.37,200630.0,1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,9.26,793665.0,64
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.25,114262.0,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.17,673572.0,24
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.16,151266.0,51
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,9.15,93351.0,10
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.13,425855.0,148
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,9.11,80679.0,110
8,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.10,72534.0,1
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.11,81109.0,13


In [14]:
n = NearestNeighbors(n_neighbors=7, algorithm='ball_tree').fit(anime_feat)
distances, indices = n.kneighbors(anime_feat)

In [15]:
all_names = list(anime.name.values)

def find_id_from_name(partial):
    for name in all_names:
        if partial in name:
            print(name,all_names.index(name))

def find_index(name):
    return anime[anime["name"]==name].index.tolist()[0]
            
def find_recommendation(query=None,id=None):
    if id:
        for id in indices[id][1:]:
            print(anime.ix[id]["name"])
    if query:
        found_id = find_index(query)
        for id in indices[found_id][1:]:
            print(anime.ix[id]["name"])

In [16]:
find_recommendation(query="Girls und Panzer")

Genshiken
Mahou Sensou
Last Exile
Maken Ki 
Princess Lover 
Tiger amp Bunny


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [17]:
find_recommendation(query="Neon Genesis Evangelion")

Guilty Crown
Mahou Shoujo Madoka Magica
Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai 
Sen to Chihiro no Kamikakushi
Clannad After Story
Fate Zero


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [18]:
find_recommendation(query="Violet Evergarden")

Bus Gamer
D C II S S Da Capo II Second Season
Ao no Exorcist Movie Special
Shion no Ou
Holy Knight
Heart no Kuni no Alice Wonderful Wonder World


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
