In [1]:
# Import the required modules
import pandas as pd
import re

### Preprocessing

In this step, we will
+ read in the datasets
+ filter out the columns we don't need
+ merge the two dataframes to get one dataframe containing the anime and user ratings
+ transform the dataframe to get a matrix with users as rows and anime as columns
+ clean the resulting matrix to get rid of NaN values

In [2]:
# Load the two CSV files: the per-user ratings and the anime catalogue
ratings = pd.read_csv('rating.csv')
anime = pd.read_csv('anime.csv')

In [3]:
# Let's take a look at the anime df
anime.head() # Looks good, but we don't need all of the columns

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
# Keep just the identifier and the title of each anime;
# none of the other columns are used for the similarity matrix
anime_filtered = anime.loc[:, ["anime_id", "name"]]

In [5]:
# Now we have the anime dataset with the columns we need
anime_filtered.head()

Unnamed: 0,anime_id,name
0,32281,Kimi no Na wa.
1,5114,Fullmetal Alchemist: Brotherhood
2,28977,Gintama°
3,9253,Steins;Gate
4,9969,Gintama&#039;


In [6]:
# Let's take a look at the ratings dataset
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [7]:
# There seem to be ratings of -1
# NOTE(review): according to the dataset description, -1 appears to mean
# the user watched the anime but did not assign a score (an anime the user
# has not seen simply has no row) - confirm against the data source
ratings[ratings["rating"] == -1]

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813628,73515,2385,-1
7813629,73515,2386,-1
7813631,73515,2490,-1
7813635,73515,2680,-1


In [8]:
# Keep only the rows that carry an actual score (1-10), so the
# -1 placeholder never enters the similarity calculation
ratings = ratings.loc[ratings["rating"].gt(0)]

In [9]:
# Looks good
ratings.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10


In [19]:
# Attach the anime title to every rating by joining on anime_id,
# giving one dataframe to use for collaborative filtering
df = anime_filtered.merge(ratings, on="anime_id")

In [20]:
# Looks good, but we need this in a user-rating matrix
df.head()

Unnamed: 0,anime_id,name,user_id,rating
0,32281,Kimi no Na wa.,99,5
1,32281,Kimi no Na wa.,152,10
2,32281,Kimi no Na wa.,244,10
3,32281,Kimi no Na wa.,271,10
4,32281,Kimi no Na wa.,322,10


In [21]:
# Reshape into a matrix with one row per user and one column per anime,
# where each cell holds that user's rating for that anime
user_ratings = pd.pivot_table(
    df,
    values='rating',
    index='user_id',
    columns='name',
)

In [22]:
# Let's take a look at our matrix
user_ratings.head()

name,&quot;0&quot;,"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,&quot;Eiji&quot;,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,...,lilac (bombs Jun Togawa),makemagic,s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,2.0,,,,,
7,,,,,,,,,,,...,,,,,,,,,,


In [23]:
# Most cells are NaN because each user rates only a few anime
# Keep only the anime (columns) that have at least 100 ratings
user_ratings = user_ratings.dropna(axis='columns', thresh=100)

In [25]:
# We will clean the remaining values by replacing NaN with 0
# Note: a 0 here stands for "no rating"; the cosine similarity computed
# later on this matrix treats it as a numeric value like any other
user_ratings = user_ratings.fillna(0)

In [26]:
# Looks better
user_ratings.head()

name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,ef: A Tale of Memories.,ef: A Tale of Memories. - Prologue,ef: A Tale of Memories. - Recollections,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Collaborative filtering

This is the crux of the project. In this step, we will apply collaborative filtering to our matrix to get a new matrix containing a similarity score for every pair of anime that remain after preprocessing.

We will use item-based collaborative filtering, which we expect to give more accurate results. Cosine similarity will be the measure used to compare anime.

In [27]:
# We will be using the cosine similarity method to form our similarity matrix
# Let's import the class from sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Let's apply the method to our dataframe to get the similarity matrix
# The matrix is transposed first so that each row is an anime (not a user),
# which makes the pairwise similarities anime-to-anime
item_similarity = cosine_similarity(user_ratings.T)

In [29]:
# Wrap the raw similarity array in a dataframe labelled by anime name
# on both axes, so scores can be looked up by title
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    columns=user_ratings.columns,
    index=user_ratings.columns,
)

In [30]:
# Looks good
item_similarity_df.head()

name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,ef: A Tale of Memories.,ef: A Tale of Memories. - Prologue,ef: A Tale of Memories. - Recollections,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,1.0,0.675673,0.526912,0.073538,0.080379,0.069893,0.067951,0.076972,0.076523,0.098063,...,0.177311,0.182829,0.13657,0.094056,0.043361,0.129323,0.132639,0.140941,0.138086,0.148708
&quot;Bungaku Shoujo&quot; Memoire,0.675673,1.0,0.620813,0.077749,0.084162,0.07009,0.067068,0.077682,0.077155,0.086668,...,0.182469,0.168416,0.129806,0.096255,0.051448,0.126142,0.13252,0.130747,0.146299,0.144513
&quot;Bungaku Shoujo&quot; Movie,0.526912,0.620813,1.0,0.074746,0.086444,0.071361,0.060379,0.074834,0.074404,0.091667,...,0.235683,0.159335,0.104317,0.087987,0.061359,0.131519,0.134665,0.125365,0.140295,0.135924
.hack//G.U. Returner,0.073538,0.077749,0.074746,1.0,0.617171,0.540926,0.506771,0.467671,0.431416,0.330049,...,0.098382,0.083506,0.066647,0.046615,0.074718,0.100065,0.10363,0.11051,0.084837,0.0936
.hack//G.U. Trilogy,0.080379,0.084162,0.086444,0.617171,1.0,0.495202,0.414907,0.389019,0.418251,0.352122,...,0.110663,0.076691,0.057844,0.069266,0.109269,0.116684,0.109622,0.110674,0.083006,0.096962


### Adding User Interaction

We have built our similarity matrix. Now we want to put it to the test: the user enters an anime, and we check whether the system returns sensible similar anime.

In [31]:
'''
Look up the similarity scores of one anime against all others.
The column for the given anime name is taken from the similarity
dataframe and returned as a series ordered from the highest
similarity score to the lowest.
'''
def get_similar_anime(name):
    return item_similarity_df[name].sort_values(ascending=False)

In [32]:
'''
Pick the top x entries from a series of similarity scores.
The entry at position 0 is skipped: the series is expected to be
sorted in descending order, so that entry is the anime the user
entered (every anime is most similar to itself).
Returns the x entries that follow it.
'''
def get_top_x (similar, x):
    # Positions 1..x, i.e. the x entries after the first one
    return similar.iloc[1:x + 1]

In [1]:
'''
This function takes in a series containing the similarity
scores of the top x anime similar to the one the user entered.
For each anime name (the names are the indices of the series) it looks up
the name, genre, rating and episodes columns in the main anime dataframe
we imported in the beginning.
It returns a single dataframe with those rows, in the same order as the
names appear in the series. Names that are not present in the anime
dataframe are skipped. If nothing is found, an empty dataframe with the
four detail columns is returned.
'''
def get_top_similar(similar_anime):
    detail_columns = ["name", "genre", "rating", "episodes"]

    # Collect the matching rows first and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a dataframe
    # row by row copies it on every iteration
    rows = []
    for anime_name in similar_anime.index:
        row = anime.loc[anime["name"] == anime_name, detail_columns]
        if not row.empty:
            rows.append(row)

    # pd.concat raises on an empty list, so handle that case explicitly
    if not rows:
        return pd.DataFrame(columns=detail_columns)
    return pd.concat(rows)

In [34]:
# Plain Python list holding every anime name in the dataset
# It is searched later to work out which anime the user means
list_of_names = anime["name"].tolist()

In [35]:
'''
Search the list of anime names with a regular expression.
Every name in which the expression matches anywhere (ignoring case)
is kept, and the matching names are returned as a list in the same
order as they appear in the list of names.
'''
def find_anime(reg):
    return [
        title
        for title in list_of_names
        if re.search(reg, title, re.IGNORECASE) is not None
    ]

In [36]:
'''
This function takes in the user input and generates a regular
expression. This regex will be used to find anime in the database.
We do this because anime in the database have specific name formats,
while a user might only enter keywords in the anime name.
(e.g - steins gate is stored in the database as 'Steins;Gate')

The input is split on whitespace into keywords, and the regex matches
any one of those keywords ('steins gate' becomes 'steins|gate').
Each keyword is escaped, so characters such as '(', '+' or '.' are
matched literally instead of being read as regex syntax.
Leading, trailing and repeated spaces are ignored; they would otherwise
produce an empty alternative that matches every name.
Input with no keywords at all returns an empty string.
'''
def generate_regex(user_input):
    keywords = user_input.split()
    return '|'.join(re.escape(keyword) for keyword in keywords)

In [37]:
'''
This function takes in a list of anime that matched the keywords
entered by the user.
It goes through the list, and asks the user each time if the anime
in the list is what the user intended to look for.
An answer of 'y' or 'yes' counts as confirmation; upper/lower case and
surrounding spaces are ignored. Any other answer moves on to the next name.
If the anime is found, it returns the name of the anime as present in
the database. If the user confirms none of them, it returns ''.
'''
def confirm_user_anime(found):
    for name in found:
        choice = input(f'Were you looking for {name}? (y/n) ')
        # Normalise the answer so that 'Y', 'yes' or 'y ' are not
        # silently treated as a "no"
        if choice.strip().lower() in ('y', 'yes'):
            return name
    return ''

In [50]:
def main():
    # Welcome user to system
    print('Welcome to the Anime Recommendation System')
    
    # Ask user for input
    user_input = input('Please enter an anime you want to find : ')
    
    # Generate regex using the user input
    regex = generate_regex(user_input)
    
    # Find a list of anime user could be intending
    found = find_anime(regex)
    
    # Check to see if any anime were found
    if len(found) == 0:
        print('Sorry, we were not able to find that in our database')
    
    # Confirm the anime from the user
    anime_to_find = confirm_user_anime(found)
    
    # Check to see if the user's anime was in the list
    if len(anime_to_find) == 0:
        print('Sorry we were unable to find the anime you were looking for')
    
    # Ask user for number of similar anime they want to see
    limit = input('How many similar anime would you like to see ? ')
    limit = int(limit)
    
    # Find the similar anime from the similarity dataframe
    similar_anime = get_similar_anime(anime_to_find)
    top_x_similar = get_top_x(similar_anime, limit)
    top_similar_anime = get_top_similar(top_x_similar)    
    
    # Present the list to the user
    %clear
    print(f'Here are {limit} anime similar to {anime_to_find} we think you might enjoy!')
    display(top_similar_anime)

In [51]:
# Start the interactive session when this cell is run
if __name__ == '__main__':
    main()

Welcome to the Anime Recommendation System
Please enter an anime you want to find : danshi
Were you looking for Danshi Koukousei no Nichijou? (y/n) y
How many similar anime would you like to see ? 10
Here are 10 anime similar to Danshi Koukousei no Nichijou we think you might enjoy!


Unnamed: 0,name,genre,rating,episodes
604,Danshi Koukousei no Nichijou Specials,"Comedy, School, Shounen, Slice of Life",7.94,6
583,Chuunibyou demo Koi ga Shitai!,"Comedy, Drama, Romance, School, Slice of Life",7.95,12
491,Hataraku Maou-sama!,"Comedy, Demons, Fantasy, Romance, Shounen",8.03,13
335,Hyouka,"Mystery, School, Slice of Life",8.17,22
401,Yahari Ore no Seishun Love Comedy wa Machigatt...,"Comedy, Drama, Romance, School",8.12,13
262,Gekkan Shoujo Nozaki-kun,"Comedy, Romance, School",8.24,12
97,Nichijou,"Comedy, School, Slice of Life",8.52,26
553,Working&#039;!!,"Comedy, Slice of Life",7.98,13
824,Working!!,"Comedy, Slice of Life",7.82,13
57,Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...,"Drama, Slice of Life, Supernatural",8.62,11
