In [46]:
# Import libraries
import pandas as pd
import numpy as np

# Show up to 99 columns when a dataframe is displayed.
# The fully qualified key is required: the short pattern 'max_columns' also matches
# 'styler.render.max_columns' on pandas >= 1.4 and raises OptionError as ambiguous.
pd.set_option('display.max_columns', 99)

# Preprocessing Data

In [75]:
# Load the cleaned anime catalogue; the genre column holds stringified lists
anime_csv_path = "cleaned_anime.csv"
anime_df = pd.read_csv(anime_csv_path)
anime_df.head()

Unnamed: 0,anime_id,name,genre
0,28891,Haikyuu!! Second Season,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun..."
1,23273,Shigatsu wa Kimi no Uso,"['Drama', 'Music', 'Romance', 'School', 'Shoun..."
2,34599,Made in Abyss,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F..."
3,5114,Fullmetal Alchemist: Brotherhood,"['Action', 'Military', 'Adventure', 'Comedy', ..."
4,31758,Kizumonogatari III: Reiketsu-hen,"['Action', 'Mystery', 'Supernatural', 'Vampire']"


In [76]:
# Keep only the rows whose genre string is not the placeholder "['']"
has_genre = anime_df["genre"] != "['']"
anime_df = anime_df[has_genre]

In [77]:
# Turn the stringified genre lists (e.g. "['Comedy', 'Sports']") into real Python lists:
# remove the quotes, trim the surrounding brackets, then split on ", ".
# assign() returns a new frame rather than writing a column into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
anime_df = anime_df.assign(
    genre=anime_df["genre"]
    .str.replace("'", "", regex=False)  # literal quote character, not a regex
    .str.strip("][")
    .str.split(", ")
)
# Sanity check on the first remaining row. Positional lookup is used because
# row label 0 is not guaranteed to survive the empty-genre filter.
type(anime_df["genre"].iloc[0])

list

In [78]:
# Preview the frame now that the genre column holds lists
anime_df.head()

Unnamed: 0,anime_id,name,genre
0,28891,Haikyuu!! Second Season,"[Comedy, Sports, Drama, School, Shounen]"
1,23273,Shigatsu wa Kimi no Uso,"[Drama, Music, Romance, School, Shounen]"
2,34599,Made in Abyss,"[Sci-Fi, Adventure, Mystery, Drama, Fantasy]"
3,5114,Fullmetal Alchemist: Brotherhood,"[Action, Military, Adventure, Comedy, Drama, M..."
4,31758,Kizumonogatari III: Reiketsu-hen,"[Action, Mystery, Supernatural, Vampire]"


In [79]:
# One-hot encode the genre lists with scikit-learn's MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

# Approach adapted from https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list
mlb = MultiLabelBinarizer(sparse_output=True)
genre_indicators = mlb.fit_transform(anime_df["genre"])

# Wrap the sparse indicator matrix in a frame that shares anime_df's index,
# with one column per genre class found by the binarizer
genre_columns = pd.DataFrame.sparse.from_spmatrix(
    genre_indicators,
    index=anime_df.index,
    columns=mlb.classes_,
)

# Attach the indicator columns and drop the original list-valued genre column
anime_df = anime_df.join(genre_columns).drop("genre", axis=1)

In [80]:
# Preview the frame with one indicator column per genre
anime_df.head()

Unnamed: 0,anime_id,name,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,28891,Haikyuu!! Second Season,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
1,23273,Shigatsu wa Kimi no Uso,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,34599,Made in Abyss,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5114,Fullmetal Alchemist: Brotherhood,1,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,31758,Kizumonogatari III: Reiketsu-hen,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [82]:
# Load the cleaned ratings: one row per (user, anime_id, rating)
rating_csv_path = "cleaned_rating.csv"
rating_df = pd.read_csv(rating_csv_path)
rating_df.head()

Unnamed: 0,user,anime_id,rating
0,-----noname-----,18441,2
1,-----noname-----,2025,4
2,---SnowFlake---,1535,6
3,---was-----,10110,8
4,--EYEPATCH--,35839,10


In [83]:
# Count how many rating rows each user has, most frequent first
rating_df["user"].value_counts()

Sidewinder51    611
Stark700        537
ktulu007        482
LegendAqua      442
ggultra2764     355
               ... 
caldxm            1
liabia            1
Marii-nyan        1
KykyGhibli        1
aaronschmit       1
Name: user, Length: 47885, dtype: int64

In [84]:
import random

# Fixed seed so the same user is drawn on every run
random.seed(5)
# Draw one entry of the "user" column. The column has one entry per rating row,
# so users with more ratings are proportionally more likely to be picked.
rated_users = rating_df["user"]
random.choice(rated_users)

'Vaenny'

In [99]:
# Ratings made by the selected user: drop the now-constant "user" column,
# order by anime_id and renumber the rows from 0
user_df = (
    rating_df[rating_df["user"] == "Vaenny"]
    .drop("user", axis=1)
    .sort_values("anime_id")
    .reset_index(drop=True)
)

user_df

Unnamed: 0,anime_id,rating
0,670,5
1,811,7
2,1824,8
3,3958,7
4,4548,5
5,6201,9
6,6392,6
7,8675,9
8,8861,9
9,9253,9


In [100]:
# Genre rows of the titles this user has rated
rated_mask = anime_df["anime_id"].isin(user_df["anime_id"])
user_genre_df = anime_df[rated_mask]
user_genre_df.head()

Unnamed: 0,anime_id,name,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
72,28999,Charlotte,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
197,32093,Tanaka-kun wa Itsumo Kedaruge,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
291,29803,Overlord,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
335,23289,Gekkan Shoujo Nozaki-kun,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
343,28725,Kokoro ga Sakebitagatterunda.,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [101]:
# Order by anime_id and renumber rows from 0, mirroring how user_df was ordered
user_genre_df = user_genre_df.sort_values("anime_id").reset_index(drop=True)
user_genre_df.head()

Unnamed: 0,anime_id,name,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,670,Lamune,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,811,I''s Pure,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1824,Hadashi no Gen,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3958,Kannagi,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,4548,Yozakura Quartet,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0


In [102]:
# Keep only the genre indicator columns
non_genre_columns = ["anime_id", "name"]
user_genre_matrix = user_genre_df.drop(columns=non_genre_columns)
user_genre_matrix.head()

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0


In [105]:
# The selected user's ratings, in the anime_id order established when user_df was built
user_df["rating"]

0      5
1      7
2      8
3      7
4      5
5      9
6      6
7      9
8      9
9      9
10     8
11     8
12     6
13     6
14     6
15     9
16     7
17     8
18     7
19     9
20     7
21     5
22     6
23     7
24     8
25     8
26     6
27     7
28     7
29     7
30     7
31     8
32    10
33     9
34     6
35     7
36     7
37     7
38     7
39     6
40     8
41     8
42     7
43     8
44     7
45     8
46     7
47     9
48     8
49    10
50     5
51     9
52     4
53     6
54     8
55     3
56     6
57     9
Name: rating, dtype: int64

In [106]:
# Weight of each genre = sum of the user's ratings over the rated titles carrying that genre.
# Ratings are matched to genre rows through anime_id instead of relying on row position:
# a rated title that is missing from anime_df (e.g. removed by the empty-genre filter)
# would otherwise leave the two frames with different lengths and make dot() fail with
# "matrices are not aligned".
rating_by_anime = user_df.set_index("anime_id")["rating"]
aligned_ratings = user_genre_df["anime_id"].map(rating_by_anime)
weights = user_genre_matrix.transpose().dot(aligned_ratings)

weights

Action           103
Adventure         39
Cars               0
Comedy           231
Dementia           0
Demons             7
Drama            130
Ecchi             83
Fantasy          114
Game              30
Harem             81
Hentai             0
Historical        19
Horror            21
Josei              0
Kids               0
Magic             28
Martial Arts      14
Mecha             12
Military           0
Music              5
Mystery           18
Parody             0
Police             0
Psychological     22
Romance          175
Samurai            0
School           173
Sci-Fi            37
Seinen            43
Shoujo             5
Shoujo Ai          0
Shounen           58
Shounen Ai         0
Slice of Life     81
Space              0
Sports             6
Super Power       37
Supernatural      93
Thriller          17
Vampire            8
Yaoi               0
Yuri               0
dtype: int64

In [107]:
# Genre indicators for every title, keyed by anime_id (the name column is not needed for scoring)
recommendation_table = anime_df.set_index("anime_id").drop(columns="name")
recommendation_table.head()

Unnamed: 0_level_0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
28891,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
23273,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
34599,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5114,1,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
31758,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [108]:
# Score each title as the weighted sum of its genres, normalised by the total genre weight
# NOTE(review): titles the user has already rated are not excluded here — confirm that is intended
weighted_genres = recommendation_table.mul(weights, axis=1)
recommendation_series = weighted_genres.sum(axis=1).div(weights.sum())
recommendation_series.head()

anime_id
28891    0.353846
23273    0.320118
34599    0.200000
5114     0.415976
31758    0.131361
dtype: float64

In [109]:
# Rank all titles from highest to lowest score and show the ten best
recommendations = recommendation_series.sort_values(ascending=False)
recommendations.head(10)

anime_id
6489     0.684615
28285    0.673964
25157    0.673964
30544    0.644970
157      0.634911
1836     0.626036
79       0.626036
22877    0.623077
33581    0.618935
3712     0.607692
dtype: float64

In [110]:
# anime_ids of the ten highest-scoring titles, best first
top_ids = recommendations.head(10).index
# Pull those titles out of the anime dataset and key them by anime_id
recommendations_df = anime_df[anime_df["anime_id"].isin(top_ids)].set_index("anime_id")
# Index with the ranked ids so the names are shown in recommendation order
recommendations_df.loc[top_ids, ["name"]]

Unnamed: 0_level_0,name
anime_id,Unnamed: 1_level_1
6489,Zero no Tsukaima: Princesses no Rondo Picture ...
28285,Trinity Seven: Nanatsu no Taizai to Nana Madoushi
25157,Trinity Seven
30544,Gakusen Toshi Asterisk
157,Mahou Sensei Negima!
1836,Shuffle! Memories
79,Shuffle!
22877,Seireitsukai no Blade Dance
33581,Trinity Seven Movie 1: Eternity Library to Alc...
3712,Zero no Tsukaima: Princesses no Rondo
