In [1]:
# importing necessary liberaries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
sns.set_theme(style='darkgrid')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 250
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv('')
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
# Display basic information
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
df.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [6]:
# there is null values in the genre, type, rating columns

In [7]:
df.shape

(12294, 7)

In [8]:
# dealing with missing values

In [9]:
# I need to drop those rows where rating and genre column values are missing because I Cannot impute missing values for recommendation
df.dropna(subset=['rating', 'genre'], inplace=True)

In [10]:
df.shape

(12017, 7)

In [11]:
df.isna().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

all the missing values removed

In [12]:
# check unique values

In [13]:
df.head(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


In [14]:
df['anime_id'].nunique()

12017

In [15]:
df['genre'].nunique()

3229

# DATA TRANSFORMATION

In [16]:
# Spliting genres into lists

df['genre'] = df['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic, Mil...",TV,64,9.26,793665
2,28977,Gintama°,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.25,114262
3,9253,Steins;Gate,"[Sci-Fi, Thriller]",TV,24,9.17,673572
4,9969,Gintama&#039;,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,[Hentai],OVA,1,4.15,211
12290,5543,Under World,[Hentai],OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,[Hentai],OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,[Hentai],OVA,1,4.98,175


In [17]:
# Converting genres to numerical representation

multi = MultiLabelBinarizer()
genres_enco = multi.fit_transform(df['genre'])
genres = pd.DataFrame(genres_enco, columns=multi.classes_)
genres

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# standardization

sc = StandardScaler()
x = sc.fit_transform(df[['rating']])
ratings = pd.DataFrame(x, columns=['rating'])
ratings

Unnamed: 0,rating
0,2.824474
1,2.717032
2,2.707265
3,2.629126
4,2.619358
...,...
12012,-2.274108
12013,-2.147132
12014,-1.561088
12015,-1.463414


In [19]:
# concatinating both ratings and geners

features = pd.concat([genres, ratings], axis=1)
features

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri,rating
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,2.824474
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,2.717032
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.707265
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2.629126
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.619358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-2.274108
12013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-2.147132
12014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1.561088
12015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1.463414


In [20]:
# Caculating cosine similarity

sim = cosine_similarity(features)

In [21]:
# converting to dataframe just for the sake of understandig

pd.DataFrame(sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12007,12008,12009,12010,12011,12012,12013,12014,12015,12016
0,1.000000,0.660890,0.583674,0.718734,0.574181,0.787885,0.645950,0.732834,0.567584,0.568699,...,-0.678030,-0.728560,-0.710185,-0.736728,-0.734130,-0.747076,-0.739813,-0.687210,-0.673821,-0.575510
1,0.660890,1.000000,0.651706,0.630954,0.645706,0.697497,0.808797,0.727373,0.641492,0.642207,...,-0.595221,-0.639580,-0.623449,-0.646750,-0.644470,-0.655835,-0.649459,-0.603280,-0.591527,-0.505222
2,0.583674,0.651706,1.000000,0.718337,0.999864,0.696826,0.727521,0.645569,0.999613,0.999664,...,-0.594176,-0.638457,-0.622355,-0.645615,-0.643339,-0.654684,-0.648319,-0.602222,-0.590489,-0.504336
3,0.718734,0.630954,0.718337,1.000000,0.709575,0.668751,0.697049,0.797920,0.703458,0.704493,...,-0.731667,-0.786194,-0.766366,-0.795009,-0.792205,-0.806176,-0.798338,-0.741574,-0.727126,-0.621038
4,0.574181,0.645706,0.999864,0.709575,1.000000,0.690568,0.721019,0.637747,0.999936,0.999956,...,-0.584513,-0.628073,-0.612233,-0.635115,-0.632876,-0.644036,-0.637775,-0.592427,-0.580885,-0.496133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,-0.747076,-0.655835,-0.654684,-0.806176,-0.644036,-0.695123,-0.724537,-0.722481,-0.636637,-0.637888,...,0.984557,0.998586,0.994914,0.999536,0.999286,1.000000,0.999767,0.987943,0.982905,0.930934
12013,-0.739813,-0.649459,-0.648319,-0.798338,-0.637775,-0.688365,-0.717493,-0.715457,-0.630448,-0.631686,...,0.988106,0.999501,0.996856,0.999961,0.999869,0.999767,1.000000,0.991054,0.986649,0.938598
12014,-0.687210,-0.603280,-0.602222,-0.741574,-0.592427,-0.639420,-0.666477,-0.664586,-0.585621,-0.586771,...,0.999789,0.994776,0.998513,0.992199,0.993088,0.987943,0.991054,1.000000,0.999558,0.976247
12015,-0.673821,-0.591527,-0.590489,-0.727126,-0.580885,-0.626962,-0.653492,-0.651638,-0.574211,-0.575339,...,0.999958,0.991302,0.996452,0.988055,0.989160,0.982905,0.986649,0.999558,1.000000,0.982256


In [22]:
# defining function for recommendation

def recommend_anime(title, data, sim, number_of_recommendations=5):
    
    # Get the index of the anime that matches the title
    idx = df[df['name'] == title].index[0]

    # Geting the pairwise similarity scores of all anime with that perticular anime
    sim_score = list(enumerate(sim[idx]))

    # Sorting the anime based on the similarity score
    sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)

    # Get the scores of the most similar anime
    sim_score = sim_score[1:number_of_recommendations+1]

    # Get the anime index
    anime_index = [i[0] for i in sim_score]

    # Return the top n most similar anime
    return df['name'].iloc[anime_index]


In [23]:
# Call the function to get recommendatations for Naruto

recommend_anime('Naruto', df, sim, number_of_recommendations=5)

615                                    Naruto: Shippuuden
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
486                              Boruto: Naruto the Movie
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
Name: name, dtype: object

In [None]:
# I made the recommendation using Naruto title