In [1]:
import re
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from scipy.sparse import csr_matrix, save_npz
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, MinMaxScaler

In [2]:
# Reading anime with synopsis
animeSynopsis = pd.read_csv("../data/raw/anime_with_synopsis.csv").rename(columns={'sypnopsis': 'Synopsis'})

# Cleaning anime with synopsis
animeSynopsis['MAL_ID'] = animeSynopsis['MAL_ID'].replace("Unknown", 0).fillna(0).astype('int64')
animeSynopsis['Score'] = animeSynopsis['Score'].replace("Unknown", 0).fillna(0).astype('float64')
animeSynopsis = animeSynopsis.fillna("Unknown")

# Reading anime list
useCols = ['MAL_ID', 'Type', 'Popularity', 'Favorites', 'Ranked', 'Episodes', 'Rating', 'Premiered', 'Studios', 'Source']

animeList = pd.read_csv("../data/raw/anime.csv", usecols=useCols)

# Cleaning anime list
animeList['Ranked'] = animeList['Ranked'].replace("Unknown", 0).fillna(0).astype('float64')
animeList['MAL_ID'] = animeList['MAL_ID'].replace("Unknown", 0).fillna(0).astype('int64')
animeList['Episodes'] = animeList['Episodes'].replace("Unknown", 0).fillna(0).astype('int64')
animeList['Favorites'] = animeList['Favorites'].replace("Unknown", 0).fillna(0).astype('int64')
animeList['Popularity'] = animeList['Popularity'].replace("Unknown", 0).fillna(0).astype('int64')
animeList = animeList.fillna("Unknown")

# Merge animeList dataframe and animeSynopsis dataframe
anime = pd.merge(animeSynopsis, animeList, on='MAL_ID')

# source: https://www.kaggle.com/indralin/try-content-based-and-collaborative-filtering
# Cleaning text
def text_cleaning(text):
    stopword = set(stopwords.words('english'))
    text = text.lower()
    text = re.sub(r'&quot;', '', text)
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
    text = re.sub('[/(){}\[\]\|@,;]', ' ', text)
    text = re.sub('[^0-9a-z #+_]', '', text)
    text = ' '.join(word for word in text.split() if word not in stopword)
    return text

useCols = ["Genres", "Name", "Type", "Source", "Rating", "Premiered", "Studios"]
anime["Features"] = anime["Synopsis"].str.cat(anime[useCols], sep=" ")
anime["Features"] = anime["Features"].apply(text_cleaning)
anime["name_lower"] = anime["Name"].apply(lambda x: x.lower())

anime.to_csv('../data/dataset/anime_clean.csv', index=False)

print(anime.shape)
anime.head(2)

(16214, 16)


Unnamed: 0,MAL_ID,Name,Score,Genres,Synopsis,Type,Episodes,Premiered,Studios,Source,Rating,Ranked,Popularity,Favorites,Features,name_lower
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",TV,26,Spring 1998,Sunrise,Original,R - 17+ (violence & profanity),28.0,39,61971,year 2071 humanity colonized several planets m...,cowboy bebop
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",Movie,1,Unknown,Bones,Original,R - 17+ (violence & profanity),159.0,518,1174,day another bountysuch life often unlucky crew...,cowboy bebop: tengoku no tobira


In [3]:
def preprocessing_category(df, column, is_multilabel=False):
    # Binarise labels
    lb = LabelBinarizer()
    if is_multilabel:
        lb = MultiLabelBinarizer()
    expandedLabelData = lb.fit_transform(df[column])
    labelClasses = lb.classes_

    # Create a pandas.DataFrame from our output
    category_df = pd.DataFrame(expandedLabelData, columns=labelClasses)
    del df[column]
    return pd.concat([df, category_df], axis=1)

In [4]:
# Using anime metadata to filtering content

anime_metadata = anime.copy()

del anime_metadata['Features']
del anime_metadata['Name']
del anime_metadata['Genres']
del anime_metadata['name_lower']
del anime_metadata['Synopsis']
del anime_metadata['MAL_ID']

catCols = anime_metadata.select_dtypes(exclude=['int64', 'float64']).columns
numCols = anime_metadata.select_dtypes(exclude=['object']).columns

for col in catCols:
    anime_metadata = preprocessing_category(anime_metadata, col)

anime_metadata[numCols] =  MinMaxScaler().fit_transform(anime_metadata[numCols])
anime_metadata = anime_metadata.values
print(anime_metadata.shape)

np.save("../data/binary/anime_metadata.npy", anime_metadata)

(16214, 1303)


In [5]:
# Using tf-idf to encode based from synopsis, name and genre to filtering content

matVet = TfidfVectorizer(stop_words='english', 
                         analyzer='word', 
                         ngram_range=(1,3), 
                         min_df=3,
                         strip_accents='unicode',
                         max_features=None,
                         token_pattern=r'\w{1,}') # currently only support english, and analyze based on word (since this is document dataset)

animeFeatures = anime['Features'].copy()
animeFeaturesTfidf = matVet.fit_transform(animeFeatures)

save_npz("../data/binary/animeFeaturesTfidf.npz", animeFeaturesTfidf)

animeFeaturesTfidf.shape

(16214, 52594)

In [7]:
def animeSearch(df_, nameQuery, n=5, sortByScore=True):
    df = df_.copy()
    nameQuery = nameQuery.lower()
    df["name_lower"] = df["Name"].apply(lambda x: x.lower())

    nameContains = df.loc[df.name_lower.str.contains(nameQuery, na=False)].drop(columns=['Features',
                                                                                         'name_lower'])

    if sortByScore:
        nameContains = nameContains.sort_values(by="Score", ascending=False)

    if n in ['all', 'All']:
        pd.set_option('display.max_rows', len(nameContains))
    else:
        pd.set_option('display.max_rows', n)
        nameContains = nameContains[:n]
        return nameContains
    return nameContains


def getSimilar(df=anime, vector=None, query_index=None, n=50):
    model_knn = NearestNeighbors(metric='cosine', n_neighbors=n)
    model_knn.fit(csr_matrix(vector.astype(np.float)))

    distances, indices = model_knn.kneighbors(
        vector[query_index, :].reshape(1, -1), n_neighbors=n)
    result, score = [], []
    for i in range(0, len(distances.flatten())):
        index = indices.flatten()[i]
        if index == query_index:
            continue
        result.append(df.iloc[index])
    results_df = pd.DataFrame(result)#.sort_values(by="Score", ascending=False)
    pd.set_option('display.max_rows', len(results_df))
    return results_df


def mostSimilarByIndex(query_index, n=50, showAll=True):
    vectorMeta = anime_metadata
    vectorSynop = animeFeaturesTfidf
    Meta = getSimilar(
        df=anime, vector=vectorMeta, query_index=query_index, n=n)
    Synop = getSimilar(
        df=anime, vector=vectorSynop, query_index=query_index, n=n)
    MetaSynop = Synop.append(Meta)
    MetaSynop = MetaSynop.drop_duplicates().sort_values(by="Score", ascending=False)
    MetaSynop = MetaSynop.drop(columns=['Features'])
    if showAll:
        pd.set_option('display.max_rows', len(MetaSynop))
        print(
            f"Generated total dataframe with {MetaSynop.shape[0]} rows and {MetaSynop.shape[1]} columns")
        return MetaSynop
    pd.set_option('display.max_rows', 10)
    print(
        f"Generated dataframe with {MetaSynop.shape[0]} rows and {MetaSynop.shape[1]} columns")
    return MetaSynop

def mostSimilarByName(name, n=50, showAll=True):
    query = animeSearch(nameQuery=name, df_=anime, n=1, sortByScore=False)
    query_index = query.index
    vectorMeta = anime_metadata
    vectorSynop = animeFeaturesTfidf
    Meta = getSimilar(
        df=anime, vector=vectorMeta, query_index=query_index, n=n)
    Synop = getSimilar(
        df=anime, vector=vectorSynop, query_index=query_index, n=n)
    MetaSynop = Synop.append(Meta)
    MetaSynop = MetaSynop.drop_duplicates().sort_values(by="Score", ascending=False)
    MetaSynop = MetaSynop.drop(columns=['Features'])
    if showAll:
        pd.set_option('display.max_rows', len(MetaSynop))
        print(
            f"Generated total dataframe with {MetaSynop.shape[0]} rows and {MetaSynop.shape[1]} columns")
        return query, MetaSynop
    pd.set_option('display.max_rows', 10)
    print(
        f"Generated dataframe with {MetaSynop.shape[0]} rows and {MetaSynop.shape[1]} columns")
    return query, MetaSynop

In [8]:
animeResults = animeSearch(anime, nameQuery="chuunibyou", n=5)
animeResults

Generated dataframe with 5 rows and 14 columns


Unnamed: 0,MAL_ID,Name,Score,Genres,Synopsis,Type,Episodes,Premiered,Studios,Source,Rating,Ranked,Popularity,Favorites
11986,35608,Chuunibyou demo Koi ga Shitai! Movie: Take On Me,8.15,"Comedy, Drama, Romance, School, Slice of Life",lthough already a third-year high school stude...,Movie,1,Unknown,Kyoto Animation,Light novel,PG-13 - Teens 13 or older,356.0,933,1574
6291,14741,Chuunibyou demo Koi ga Shitai!,7.77,"Slice of Life, Comedy, Drama, Romance, School",Everybody has had that stage in their life whe...,TV,12,Fall 2012,Kyoto Animation,Light novel,PG-13 - Teens 13 or older,878.0,77,17973
7010,18671,Chuunibyou demo Koi ga Shitai! Ren,7.56,"Comedy, Drama, Romance, School, Slice of Life","The awkward lovebirds, Yuuta Togashi and Rikka...",TV,12,Winter 2014,Kyoto Animation,Light novel,PG-13 - Teens 13 or older,1378.0,226,2952
6658,16934,Chuunibyou demo Koi ga Shitai!: Kirameki no......,7.51,"Comedy, Drama, Romance, School, Slice of Life",lthough Yuuta Togashi and Rikka Takanashi have...,Special,1,Unknown,Kyoto Animation,Light novel,PG-13 - Teens 13 or older,1551.0,901,144
8592,27601,Chuunibyou demo Koi ga Shitai! Ren: The Rikka ...,7.47,"Comedy, Drama, Romance, School, Slice of Life","One normal school day, Rikka Takanashi notices...",Special,1,Unknown,Kyoto Animation,Light novel,PG-13 - Teens 13 or older,1672.0,1519,88


In [9]:
mostSimilar(query_index=11986, n=20)

Generated total dataframe with 36 rows and 16 columns


Unnamed: 0,MAL_ID,Name,Score,Genres,Synopsis,Type,Episodes,Premiered,Studios,Source,Rating,Ranked,Popularity,Favorites,Features,name_lower
13608,38329,Seishun Buta Yarou wa Yumemiru Shoujo no Yume ...,8.68,"Supernatural, Drama, Romance, School","Six months ago, Sakuta Azusagawa had a chance ...",Movie,1,Unknown,CloverWorks,Light novel,PG-13 - Teens 13 or older,46.0,370,6468,six months ago sakuta azusagawa chance encount...,seishun buta yarou wa yumemiru shoujo no yume ...
13397,37987,Violet Evergarden Movie,8.65,"Slice of Life, Drama, Fantasy",original sequel to the TV anime which aired in...,Movie,1,Unknown,Kyoto Animation,Light novel,PG-13 - Teens 13 or older,55.0,779,961,original sequel tv anime aired 2018 movie foll...,violet evergarden movie
4286,7311,Suzumiya Haruhi no Shoushitsu,8.65,"Comedy, Mystery, Romance, School, Sci-Fi, Supe...","One cold Christmas day, Kyon heads over to sch...",Movie,1,Unknown,Kyoto Animation,Light novel,PG-13 - Teens 13 or older,54.0,243,11585,one cold christmas day kyon heads school sos b...,suzumiya haruhi no shoushitsu
13439,38040,Kono Subarashii Sekai ni Shukufuku wo!: Kurena...,8.52,"Adventure, Comedy, Fantasy, Magic, Parody, Sup...",It is not strange that the Demon Lord's forces...,Movie,1,Unknown,J.C.Staff,Light novel,PG-13 - Teens 13 or older,98.0,321,3814,strange demon lords forces fear crimson demons...,kono subarashii sekai ni shukufuku wo!: kurena...
12728,36885,Saenai Heroine no Sodatekata Fine,8.48,"Harem, Comedy, Romance, Ecchi, School",h the second Winter Comiket just around the co...,Movie,1,Unknown,CloverWorks,Light novel,PG-13 - Teens 13 or older,113.0,1567,1132,h second winter comiket around corner blessing...,saenai heroine no sodatekata fine
14511,39741,Violet Evergarden Gaiden: Eien to Jidou Shuki ...,8.4,"Slice of Life, Drama, Fantasy","Isabella, the daughter of the noble York famil...",Movie,1,Unknown,Kyoto Animation,Light novel,PG-13 - Teens 13 or older,152.0,824,716,isabella daughter noble york family enrolled a...,violet evergarden gaiden: eien to jidou shuki ...
13050,37450,Seishun Buta Yarou wa Bunny Girl Senpai no Yum...,8.38,"Comedy, Supernatural, Drama, Romance, School",The rare and inexplicable Puberty Syndrome is ...,TV,13,Fall 2018,CloverWorks,Light novel,PG-13 - Teens 13 or older,166.0,78,29642,rare inexplicable puberty syndrome thought myt...,seishun buta yarou wa bunny girl senpai no yum...
5085,9617,K-On! Movie,8.33,"Music, Slice of Life, Comedy",Graduation looms for the founding members of t...,Movie,1,Unknown,Kyoto Animation,4-koma manga,PG-13 - Teens 13 or older,202.0,607,2091,graduation looms founding members light music ...,k-on! movie
10880,33674,No Game No Life: Zero,8.32,"Game, Supernatural, Drama, Romance, Fantasy","In ancient Disboard, Riku is an angry, young w...",Movie,1,Unknown,Madhouse,Light novel,PG-13 - Teens 13 or older,208.0,173,7022,ancient disboard riku angry young warrior inte...,no game no life: zero
12838,37095,"Violet Evergarden: Kitto ""Ai"" wo Shiru Hi ga K...",8.32,"Drama, Fantasy, Slice of Life",The CH Postal Company has just received a requ...,Special,1,Unknown,Kyoto Animation,Light novel,PG-13 - Teens 13 or older,211.0,832,494,ch postal company received request transcribe ...,"violet evergarden: kitto ""ai"" wo shiru hi ga k..."
