In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, save_npz
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, sigmoid_kernel
from tqdm import tqdm

In [2]:
"""
animeRating is from:
    animeRating = pd.read_csv("data/rating_complete.csv")
    animeRating = animeRating.drop(columns=['user_id'])
    animeRating = animeRating.groupby('anime_id').mean()
"""

# Synopsis table: rename the misspelled source column, then turn every cell
# that is exactly "Unknown" into the string "0".
animeSynopsis = (
    pd.read_csv("data/anime_with_synopsis.csv")
    .rename(columns={'sypnopsis': 'Synopsis'})
    .replace("Unknown", "0")
)

# Ratings table, loaded with the intent of replacing unknown values.
animeRating = (
    pd.read_csv("data/rating.csv")
    .rename(columns={'user_rating': 'Score'})
)

animeSynopsis.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Synopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [3]:
animeList = pd.read_csv("data/anime.csv")

# Inner-join the extra catalogue columns onto the synopsis rows, make Score
# numeric, discard rows with any missing value and renumber the rows from 0.
anime = (
    animeSynopsis
    .merge(
        animeList[['MAL_ID', 'Type', 'Popularity', 'Members', 'Favorites', 'Ranked', 'Episodes', 'Rating']],
        on='MAL_ID',
    )
    .astype({'Score': 'float64'})
    .dropna()
    .reset_index(drop=True)
)

print(anime.shape)

anime.head()

(16206, 12)


Unnamed: 0,MAL_ID,Name,Score,Genres,Synopsis,Type,Popularity,Members,Favorites,Ranked,Episodes,Rating
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",TV,39,1251960,61971,28.0,26,R - 17+ (violence & profanity)
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",Movie,518,273145,1174,159.0,1,R - 17+ (violence & profanity)
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",TV,201,558913,12944,266.0,26,PG-13 - Teens 13 or older
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,TV,1467,94683,587,2481.0,26,PG-13 - Teens 13 or older
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,TV,4369,13224,18,3710.0,52,PG - Children


In [4]:
# source: https://www.kaggle.com/indralin/try-content-based-and-collaborative-filtering
# Cleaning text
import re

# Cache for the NLTK stop-word set so the corpus is read once, not once per row.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def text_cleaning(text):
    """Normalise a piece of text for vectorisation.

    Steps, in order: lower-case the text, strip the HTML entities ``&quot;``
    and ``&#039;``, strip the literal ``.hack//`` prefix, turn ``&amp;`` into
    ``and``, replace bracket/separator punctuation with spaces, remove every
    character other than ``0-9``, ``a-z``, space, ``#``, ``+`` and ``_``, and
    finally drop English stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    stopword = _english_stopwords()
    text = text.lower()
    text = re.sub(r'&quot;', '', text)
    # The dot is escaped so that only the literal ".hack//" is removed.
    text = re.sub(r'\.hack//', '', text)
    # The former upper-case patterns ("A&#039;s", "I&#039;") could never match
    # the lower-cased text and ran after this removal anyway, so they are gone.
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'&amp;', 'and', text)
    # Raw strings avoid the invalid-escape warnings of the previous literals.
    text = re.sub(r'[/(){}\[\]\|@,;]', ' ', text)
    text = re.sub(r'[^0-9a-z #+_]', '', text)
    return ' '.join(word for word in text.split() if word not in stopword)

# Text used for similarity: synopsis + genres + name, space separated, then cleaned.
anime["Features"] = (
    anime["Synopsis"]
    .str.cat(anime["Genres"], sep=" ")
    .str.cat(anime["Name"], sep=" ")
    .apply(text_cleaning)
)
# Lower-cased title used by the name-search helpers.
anime["name_lower"] = anime["Name"].map(str.lower)

anime.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Synopsis,Type,Popularity,Members,Favorites,Ranked,Episodes,Rating,Features,name_lower
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",TV,39,1251960,61971,28.0,26,R - 17+ (violence & profanity),year 2071 humanity colonized several planets m...,cowboy bebop
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",Movie,518,273145,1174,159.0,1,R - 17+ (violence & profanity),day another bountysuch life often unlucky crew...,cowboy bebop: tengoku no tobira
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",TV,201,558913,12944,266.0,26,PG-13 - Teens 13 or older,vash stampede man 60 000 000 000 bounty head r...,trigun
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,TV,1467,94683,587,2481.0,26,PG-13 - Teens 13 or older,ches individuals special powers like esp telek...,witch hunter robin
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,TV,4369,13224,18,3710.0,52,PG - Children,dark century people suffering rule devil vande...,bouken ou beet


In [5]:
# Export the cleaned table without the helper name_lower column;
# index=False keeps the row index out of the CSV file.
anime.drop(columns=['name_lower']).to_csv("data/anime_clean_noml.csv", index=False)

# Create TF-IDF Matrix Features
Convert a collection of raw text documents into a matrix of TF-IDF features: each distinct term is mapped to its own integer column index, and each document becomes a row of term weights.

TF (term frequency) is simply the frequency of a word in a document. IDF (inverse document frequency) is the inverse of the document frequency of that word across the whole corpus of documents. TF-IDF is used mainly for the following reason. Suppose we search for “the rise of analytics” on Google. It is certain that “the” will occur more frequently than “analytics”, but from the search query's point of view the relative importance of “analytics” is higher. In such cases, TF-IDF weighting negates the effect of high-frequency words in determining the importance of an item (document).


In [6]:
# Turn the cleaned "Features" text into a sparse TF-IDF matrix.
# Currently English only, analysed word by word (the dataset is a set of text documents).
matVet = TfidfVectorizer(stop_words='english', 
                         analyzer='word', 
                         ngram_range=(1,3),  # single words, word pairs and word triples
                         min_df=3,  # ignore terms found in fewer than 3 documents
                         strip_accents='unicode',
                         max_features=None,  # no cap on vocabulary size
                         token_pattern=r'\w{1,}')  # a token is one or more word characters

tfidMatrix = matVet.fit_transform(anime["Features"])
tfidMatrix.shape

(16206, 47392)

In [None]:
# Pairwise dot products between every pair of rows of the TF-IDF matrix.
# NOTE(review): the result is a dense (n_rows x n_rows) array; with the 16206 rows
# reported for tfidMatrix that is roughly 2 GB if stored as float64 — confirm it fits in memory.
kernelSimilarity = linear_kernel(tfidMatrix, tfidMatrix)
kernelSimilarity.shape

In [9]:
# Write the similarity matrix to a compressed .npz archive (presumably to avoid recomputing it).
# It is passed positionally, so numpy stores it under the key "arr_0".
np.savez_compressed('kernelSimilarity.npz', kernelSimilarity)

In [7]:
def animeSearch(df_, nameQuery, n=5, sortByScore=True):
    """Return the anime whose lower-cased name contains ``nameQuery``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Table with ``name_lower`` and ``Features`` columns (and ``Score`` when
        ``sortByScore`` is true). It is not modified.
    nameQuery : str
        Search text. It is lower-cased and handed to ``Series.str.contains``,
        which treats it as a regular expression.
    n : int or str, default 5
        Value for pandas' global ``display.max_rows`` option. Pass ``'all'``
        (any letter case) to set it to the number of matches. The returned
        frame itself is never truncated.
    sortByScore : bool, default True
        Sort the matches by ``Score``, highest first.

    Returns
    -------
    pandas.DataFrame
        Matching rows without the ``Features`` and ``name_lower`` columns.
    """
    nameQuery = nameQuery.lower()

    # Boolean-mask selection followed by drop() already yields a new frame,
    # so the caller's table is left untouched without an up-front copy.
    matches = df_.loc[df_.name_lower.str.contains(nameQuery, na=False)].drop(
        columns=['Features', 'name_lower'])
    if sortByScore:
        matches = matches.sort_values(by="Score", ascending=False)

    # The display option is set only after the matches exist: 'all' needs their
    # count, and the string itself is not a valid value for display.max_rows.
    show_all = isinstance(n, str) and n.lower() == 'all'
    pd.set_option('display.max_rows', len(matches) if show_all else n)

    return matches


def getHighestScoreAnime(df_, n=5, query=None):
    """Return the ``n`` best-scored anime, optionally pre-filtered by ``query``.

    Rows whose ``Score`` equals the string ``'Unknown'`` are discarded and the
    remaining scores are cast to ``float64``. The rows are sorted by score
    (highest first), filtered with ``DataFrame.query(query)`` when a query is
    given, and cut to the first ``n`` rows. The shape of that selection is
    printed, and pandas' global ``display.max_rows`` option is set to ``n``.

    Returns the selection without the ``Features`` and ``name_lower`` columns.
    """
    pd.set_option('display.max_rows', n)

    ranked = df_.copy()
    ranked = ranked[ranked.Score != 'Unknown'].copy()
    ranked["Score"] = ranked["Score"].astype("float64")
    ranked = ranked.sort_values(by="Score", ascending=False)

    # The optional filter runs on the sorted rows, before truncation.
    if query is not None:
        ranked = ranked.query(query)

    top = ranked[:n]
    row_count, column_count = top.shape
    print(f"Generated {row_count} Rows, and {column_count} Columns")

    return top.drop(columns=['Features', 'name_lower'])


def filterByScore(df_, query=None, showAll=False, sort=True, ascending=False):
    """Filter anime with a ``DataFrame.query`` expression over numeric scores.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Table with ``Score``, ``Features`` and ``name_lower`` columns. It is
        not modified.
    query : str, optional
        Expression passed to ``DataFrame.query`` (for example ``"Score > 8"``).
        When omitted, no filter is applied.
    showAll : bool, default False
        If true, set pandas' global ``display.max_rows`` option to the number
        of resulting rows.
    sort : bool, default True
        Sort the result by ``Score``.
    ascending : bool, default False
        Sort direction used when ``sort`` is true.

    Returns
    -------
    pandas.DataFrame
        Resulting rows without the ``Features`` and ``name_lower`` columns.
        The shape before those columns are dropped is printed.
    """
    # Copy the filtered rows so the Score conversion below writes to an
    # independent frame rather than to a slice of the caller's table.
    scored = df_[df_.Score != 'Unknown'].copy()
    scored["Score"] = scored["Score"].astype("float64")

    # DataFrame.query rejects None, so the default means "no filter".
    if query is not None:
        scored = scored.query(query)

    if sort:
        scored = scored.sort_values(by="Score", ascending=ascending)

    if showAll:
        pd.set_option('display.max_rows', len(scored))

    print(f"Generated {scored.shape[0]} Rows, and {scored.shape[1]} Columns")
    return scored.drop(columns=['Features', 'name_lower'])


def filterByGenre(df_, genreContains, query=None, sortByScore=False, ascending=False, showAll=False):
    """Select anime whose ``Genres`` text matches ``genreContains``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Table with ``Genres``, ``Score``, ``Features`` and ``name_lower``
        columns. It is not modified.
    genreContains : str
        Pattern handed to ``Series.str.contains`` on the ``Genres`` column
        (treated as a regular expression, so ``"Action|Drama"`` matches either).
    query : str, optional
        Extra ``DataFrame.query`` expression applied to the genre matches.
    sortByScore : bool, default False
        Sort the result by ``Score``.
    ascending : bool, default False
        Sort direction used when ``sortByScore`` is true.
    showAll : bool, default False
        If true, set pandas' global ``display.max_rows`` option to the number
        of resulting rows.

    Returns
    -------
    pandas.DataFrame
        Resulting rows without the ``Features`` and ``name_lower`` columns,
        whatever the value of ``showAll``. Rows whose score is ``'Unknown'``
        are removed (and ``Score`` is cast to ``float64``) only when ``query``
        or ``sortByScore`` is used.
    """
    matched = df_.loc[df_.Genres.str.contains(genreContains, na=False)].copy()

    # Numeric scores are needed only for querying or sorting; convert once, on
    # an independent copy so no write lands on a slice of another frame.
    if query or sortByScore:
        matched = matched[matched.Score != 'Unknown'].copy()
        matched["Score"] = matched["Score"].astype("float64")

    if query:
        matched = matched.query(query)

    if sortByScore:
        matched = matched.sort_values(by="Score", ascending=ascending)

    if showAll:
        pd.set_option('display.max_rows', len(matched))

    print(f"Filterting {genreContains} ...")
    print(
        f"Generated {matched.shape[0]} Rows, and {matched.shape[1]} Columns")

    # Both helper columns are dropped on every path; the showAll path used to
    # leave name_lower in the result.
    return matched.drop(columns=['Features', 'name_lower'])


def getSimilarAnimeByIndex(index, df_=anime, model=kernelSimilarity, n=10, showAll=True):
    """Return up to ``n`` anime most similar to the one at row position ``index``.

    Parameters
    ----------
    index : int
        Row position of the reference anime, used both to read ``model[index]``
        and to exclude that anime from its own recommendations.
    df_ : pandas.DataFrame
        Anime table whose row order matches ``model``; it must have ``MAL_ID``,
        ``Score``, ``Features`` and ``name_lower`` columns. It is not modified.
    model : 2-D array-like
        Pairwise similarity matrix; ``model[index]`` must yield the similarity
        of the reference anime to every row.
    n : int, default 10
        Maximum number of recommendations.
    showAll : bool, default True
        If true, set pandas' global ``display.max_rows`` option to ``n``.

    Returns
    -------
    pandas.DataFrame
        The recommendations indexed by ``MAL_ID``, with a ``Similarity``
        column, ordered by ``Score`` (highest first) and without the
        ``Features`` and ``name_lower`` columns. Only rows with a similarity
        greater than zero are considered.
    """
    scores = pd.Series(model[index], name="Similarity")

    # Turn a negative position into its 0..N-1 equivalent so it can be matched
    # against the positional labels of ``scores``.
    position = index if index >= 0 else index + len(scores)

    # Remove the reference row by label rather than assuming it sorts first:
    # another row with an equal similarity could otherwise take its place and
    # be dropped instead.
    candidates = scores[scores > 0].drop(position, errors="ignore")
    top = candidates.sort_values(ascending=False).iloc[:n]

    if showAll:
        pd.set_option('display.max_rows', n)

    # Attach the similarities positionally so the result does not depend on
    # the table's index labels being 0..N-1.
    similar = df_.iloc[top.index].assign(Similarity=top.to_numpy())
    return (similar.set_index('MAL_ID')
            .sort_values(by="Score", ascending=False)
            .drop(columns=['Features', 'name_lower']))


def getSimilarAnimeByName(name, df_=anime, model=kernelSimilarity, n=10, showAll=True):
    """Return anime similar to the best-scored title whose name contains ``name``.

    Parameters
    ----------
    name : str
        Search text. It is lower-cased and handed to ``Series.str.contains`` on
        the ``name_lower`` column, which treats it as a regular expression.
    df_ : pandas.DataFrame
        Anime table; its index labels are used as row positions into ``model``.
    model : 2-D array-like
        Pairwise similarity matrix aligned with ``df_``.
    n : int, default 10
        Maximum number of recommendations.
    showAll : bool, default True
        Forwarded to ``getSimilarAnimeByIndex``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``getSimilarAnimeByIndex`` returns for the selected title.

    Raises
    ------
    IndexError
        If no name matches ``name``.
    """
    name = name.lower()

    matches = df_.loc[df_.name_lower.str.contains(name, na=False)]
    if matches.empty:
        # Same exception type as the former ``index[0]`` failure, but with a
        # message that says what actually went wrong.
        raise IndexError(f"No anime name contains {name!r}")

    # Among the matches, the highest-scored title is the reference anime.
    best_index = matches.sort_values(by="Score", ascending=False).index[0]

    # The recommendation logic lives in one place only.
    return getSimilarAnimeByIndex(best_index, df_=df_, model=model, n=n, showAll=showAll)


def getAnimeType(animeType, df_=anime):
    """Return the anime whose ``Type`` equals ``animeType``, best score first.

    The result omits the ``Features`` and ``name_lower`` columns; ``df_`` is
    left unmodified.
    """
    catalog = df_.copy()
    of_type = catalog[catalog.Type == animeType]
    ranked = of_type.sort_values(by="Score", ascending=False)
    return ranked.drop(columns=['Features', 'name_lower'])

In [8]:
# Case-insensitive title search; n only sets how many rows pandas displays.
animeResults = animeSearch(anime, nameQuery="Shingeki", n=5)
animeResults

Unnamed: 0,MAL_ID,Name,Score,Genres,Synopsis,Type,Popularity,Members,Favorites,Ranked
14644,40028,Shingeki no Kyojin: The Final Season,9.17,"Action, Military, Mystery, Super Power, Drama,...",Gabi Braun and Falco Grice have been training ...,TV,119,733260,44862,2.0
13717,38524,Shingeki no Kyojin Season 3 Part 2,9.10,"Action, Drama, Fantasy, Military, Mystery, Sho...",Seeking to restore humanity's diminishing hope...,TV,63,1073626,40985,4.0
...,...,...,...,...,...,...,...,...,...,...
11681,35122,Shingeki no Kyotou,6.35,"Action, Fantasy, Shounen",special anime featuring a battle against the C...,Special,3833,18073,127,6630.0
8755,28447,Shingeki no Bahamut: Genesis - Roundup,6.33,"Action, Adventure, Demons, Supernatural, Magic...",cap of the first six episodes of Shingeki no B...,Special,4354,13467,7,6727.0


In [9]:
# Recommend titles similar to the anime at row position 6550.
# NOTE(review): judging by the output, 6550 is presumably the first season of "Shingeki no Kyojin" — confirm.
getSimilarAnimeByIndex(index=6550, n=5)

Unnamed: 0_level_0,Name,Score,Genres,Synopsis,Type,Popularity,Members,Favorites,Ranked,Similarity
MAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
40028,Shingeki no Kyojin: The Final Season,9.17,"Action, Military, Mystery, Super Power, Drama,...",Gabi Braun and Falco Grice have been training ...,TV,119,733260,44862,2.0,0.394277
38524,Shingeki no Kyojin Season 3 Part 2,9.1,"Action, Drama, Fantasy, Military, Mystery, Sho...",Seeking to restore humanity's diminishing hope...,TV,63,1073626,40985,4.0,0.314509
35760,Shingeki no Kyojin Season 3,8.59,"Action, Military, Mystery, Super Power, Drama,...","Still threatened by the ""Titans"" that rob them...",TV,48,1212430,14971,76.0,0.404391
25777,Shingeki no Kyojin Season 2,8.45,"Action, Military, Mystery, Super Power, Drama,...","For centuries, humanity has been hunted by gia...",TV,16,1591506,18262,128.0,0.393233
42091,Shingeki no Kyojin: Chronicle,7.68,"Action, Military, Mystery, Super Power, Drama,...",The compilation film will recap the anime's 59...,Movie,2249,50634,211,1089.0,0.351591


In [14]:
# Recommend titles similar to the highest-scored anime whose name contains "chuunibyou".
getSimilarAnimeByName("chuunibyou", n=20)

Unnamed: 0_level_0,Name,Score,Genres,Synopsis,Type,Popularity,Members,Favorites,Ranked,Similarity
MAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
37450,Seishun Buta Yarou wa Bunny Girl Senpai no Yum...,8.38,"Comedy, Supernatural, Drama, Romance, School",The rare and inexplicable Puberty Syndrome is ...,TV,78,940033,29642,166.0,0.091608
14741,Chuunibyou demo Koi ga Shitai!,7.77,"Slice of Life, Comedy, Drama, Romance, School",Everybody has had that stage in their life whe...,TV,77,955524,17973,878.0,0.330204
18671,Chuunibyou demo Koi ga Shitai! Ren,7.56,"Comedy, Drama, Romance, School, Slice of Life","The awkward lovebirds, Yuuta Togashi and Rikka...",TV,226,522620,2952,1378.0,0.507584
16934,Chuunibyou demo Koi ga Shitai!: Kirameki no......,7.51,"Comedy, Drama, Romance, School, Slice of Life",lthough Yuuta Togashi and Rikka Takanashi have...,Special,901,164542,144,1551.0,0.473849
27601,Chuunibyou demo Koi ga Shitai! Ren: The Rikka ...,7.47,"Comedy, Drama, Romance, School, Slice of Life","One normal school day, Rikka Takanashi notices...",Special,1519,91158,88,1672.0,0.358478
19021,Takanashi Rikka Kai: Chuunibyou demo Koi ga Sh...,7.35,"Comedy, Drama, Romance, School, Slice of Life","Summary of the first season of the show, as se...",Movie,1739,75194,102,2109.0,0.647468
15687,Chuunibyou demo Koi ga Shitai! Lite,7.26,"Comedy, School",Short episodes aired on KyoAni's official YouT...,ONA,1306,109113,176,2520.0,0.416979
35847,SSSS.Gridman,7.19,"Action, Sci-Fi, Mecha",Yuuta Hibiki wakes up in the room of Rikka Tak...,TV,734,199953,937,2804.0,0.12133
22859,Takanashi Rikka Kai: Chuunibyou demo Koi ga Sh...,7.18,Comedy,Special episode included with the Blu-ray/DVD ...,Special,3112,27746,7,2840.0,0.639991
21797,Chuunibyou demo Koi ga Shitai! Ren Lite,7.17,"Comedy, School",Short episodes aired on KyoAni's official YouT...,ONA,1879,65674,87,2901.0,0.389399
