In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
pd.set_option("display.max_columns", None)
RANDOM_SEED = 123

In [3]:
df_anime = pd.read_csv("assets/anime.csv")
df_manga = pd.read_csv("assets/manga.csv")

In [4]:
df_anime.shape, df_manga.shape

((24985, 39), (64833, 30))

## Data Prepocessing

In [5]:
def data_cleaning(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # remove unnecessary columns
    df_anime = df_anime.drop(columns=['anime_id', 'main_picture', 'url', 'trailer_url', 'background', 'created_at', 'updated_at', 'episode_duration', 'broadcast_day', 'broadcast_time', 'licensors', 'title_synonyms'])
    df_manga = df_manga.drop(columns=['manga_id', 'main_picture', 'url', 'background', 'created_at_before', 'updated_at', 'title_synonyms'])

    # remove rows that are null in 'synopsis' and 'title', which are crucial for our project
    df_anime.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)
    df_manga.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)

    # remove '(Sources:...)' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))

    # remove '[Written by ...]' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))

    print('cleaned anime shape: ', df_anime.shape)
    print('cleaned manga shape: ', df_manga.shape)

    return df_anime, df_manga

df_anime_cleaned, df_manga_cleaned = data_cleaning(df_anime, df_manga)

cleaned anime shape:  (9506, 27)
cleaned manga shape:  (15668, 23)


In [6]:
print("anime extra columns: \n", [col for col in df_anime_cleaned.columns if not col in df_manga_cleaned.columns])

anime extra columns: 
 ['episodes', 'source', 'total_duration', 'rating', 'start_year', 'start_season', 'studios', 'producers']


In [7]:
print("manga extra columns: \n", [col for col in df_manga_cleaned.columns if not col in df_anime_cleaned.columns])

manga extra columns: 
 ['volumes', 'chapters', 'authors', 'serializations']


In [8]:
print("common columns: \n", [col for col in df_anime_cleaned.columns if col in df_manga_cleaned.columns])

common columns: 
 ['title', 'type', 'score', 'scored_by', 'status', 'start_date', 'end_date', 'members', 'favorites', 'sfw', 'approved', 'real_start_date', 'real_end_date', 'genres', 'themes', 'demographics', 'synopsis', 'title_english', 'title_japanese']


In [9]:
df_manga['type'].unique()

array(['manga', 'manhwa', 'light_novel', 'one_shot', 'manhua', 'novel',
       'doujinshi'], dtype=object)

In [10]:
df_anime['source'].unique()

array(['manga', 'visual_novel', 'original', 'web_manga', 'light_novel',
       'novel', 'game', '4_koma_manga', 'music', 'other', 'web_novel',
       'card_game', 'book', 'mixed_media', nan, 'picture_book', 'radio'],
      dtype=object)

In [11]:
df_anime['type'].unique()

array(['tv', 'movie', 'ona', 'ova', 'special', 'music', nan], dtype=object)

**Extra columns alignment**:
| anime columns | manga columns | combine |
| --- | --- | --- |
| episodes | chapters | episodes/chapters |
| NULL | volumes |
| source | type | original_source |
| type | NULL |
| total_duration | NULL |
| rating | NULL |
| start_year | NULL |
| start_season | NULL |
| studios | NULL |
| producers | NULL |
| NULL | authors |
| NULL | serializations |

In [12]:
def columns_alignment(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # treat 'episodes' and 'chapters' the same, create null value for 'volume' in anime
    df_anime.rename(columns={'episodes': 'episodes/chapters'}, inplace=True)
    df_manga.rename(columns={'chapters': 'episodes/chapters'}, inplace=True)

    # 'source' in anime is corresponding to the 'type' in manga, so combine them
    df_anime.rename(columns={'source': 'original_source'}, inplace=True)
    df_manga.rename(columns={'type': 'original_source'}, inplace=True)

    return df_anime, df_manga

df_anime_aligned, df_manga_aligned = columns_alignment(df_anime_cleaned, df_manga_cleaned)


In [13]:
df_full = pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)
df_full

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,original_source,members,favorites,total_duration,rating,sfw,approved,start_year,start_season,real_start_date,real_end_date,genres,themes,demographics,studios,producers,synopsis,title_english,title_japanese,volumes,authors,serializations
0,Fullmetal Alchemist: Brotherhood,tv,9.10,2037075,finished_airing,64.0,2009-04-05,2010-07-04,manga,3206028,219036,1 days 01:57:20,r,True,True,2009.0,spring,2009-04-05,2010-07-04,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,,,
1,Hunter x Hunter (2011),tv,9.04,1671587,finished_airing,148.0,2011-10-02,2014-09-24,manga,2688079,202109,2 days 10:15:16,pg_13,True,True,2011.0,fall,2011-10-02,2014-09-24,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",Hunters devote themselves to accomplishing haz...,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,,,
2,Shingeki no Kyojin Season 3 Part 2,tv,9.05,1491491,finished_airing,10.0,2019-04-29,2019-07-01,manga,2133927,55644,0 days 03:59:10,r,True,True,2019.0,spring,2019-04-29,2019-07-01,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,,,
3,Steins;Gate,tv,9.07,1348232,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2463954,184312,0 days 09:44:00,pg_13,True,True,2011.0,spring,2011-04-06,2011-09-14,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...,Steins;Gate,STEINS;GATE,,,
4,Koe no Katachi,movie,8.94,1540277,finished_airing,1.0,2016-09-17,2016-09-17,manga,2218467,84124,0 days 02:10:03,pg_13,True,True,2016.0,summer,2016-09-17,2016-09-17,"['Award Winning', 'Drama']",['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","As a wild youth, elementary school student Sho...",A Silent Voice,聲の形,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25169,Bijo to Yajuu?,,,1,finished,,2015-07-01,2015-07-01,manga,4,0,,,True,True,,,2015-07-01,2015-07-01,['Romance'],[],[],,,Born to a noble politician in a prominent poli...,Beauty and the Beastmaster,美女と野獣?,1.0,"[{'id': 12314, 'first_name': 'Carol', 'last_na...",[]
25170,Madarame Shunin wa Ecchi de Zurui!,,,1,finished,16.0,2019-04-12,2020-07-15,manga,4,0,,,False,True,,,2019-04-12,2020-07-15,['Erotica'],[],['Josei'],,,"""If you can get me excited, then I'll give you...",No Fair! Chief Madarame Is a Pervert! I'll Do ...,班目主任はエッチでずるい!,2.0,"[{'id': 52199, 'first_name': 'Roca', 'last_nam...",[]
25171,PSO2 New Genesis: Central!,,,1,currently_publishing,,2021-09-15,,manga,4,0,,,True,True,,,2021-09-15,,"['Comedy', 'Sci-Fi']",[],[],,,Story following the exploits and updates in Ph...,Central!,PSO2 ニュージェネシス せんとらるっ!,,"[{'id': 66980, 'first_name': '', 'last_name': ...",[]
25172,Itazura na Ai no Shisha,,,1,finished,,2017-09-30,2017-09-30,manga,4,0,,,True,True,,,2017-09-30,2017-09-30,['Romance'],[],['Josei'],,,"Ellie, who works at a law firm, is at her wits...",Expecting the Fellani Heir,いたずらな愛の使者,1.0,"[{'id': 13921, 'first_name': 'Lucy', 'last_nam...",['Bessatsu Harlequin']


In [14]:
# train test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)
print('Number of rows in train set: ', len(train))

Number of rows in train set:  20139


### Text processing

Tokenization and lemmatization by keeping `{'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}` and named entity.

In [15]:
# tokenization and lemmatization
import spacy

POS_TO_KEEP = {'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}

nlp = spacy.load('en_core_web_sm')

def tokenization(text):
    doc = nlp(text)
    
    # extract named entity spans
    entity_spans = {i for ent in doc.ents for i in range(ent.start, ent.end)}
    
    # filter tokens if it is specified POS or is part of the name entity
    filtered_tokens = [
        token.lemma_ for token in doc 
        if token.pos_ in POS_TO_KEEP or token.i in entity_spans
    ]
    
    return filtered_tokens

train['title_en_token'] = train['title_english'].apply(tokenization)
train['synopsis_token'] = train['synopsis'].apply(tokenization) # this may take around 5 mins

# filter out empty token
train = train[train.title_en_token.apply(lambda x: len(x) > 0) & train.synopsis_token.apply(lambda x: len(x) > 0)] # both columns cannot be empty

print('Number of rows after filtering empty token: ', len(train))

Number of rows after filtering empty token:  20081


In [21]:
# review tokenization
with pd.option_context('display.max_colwidth', None):
    display(train[['title_english', 'title_en_token', 'synopsis', 'synopsis_token']].sample(1))

Unnamed: 0,title_english,title_en_token,synopsis,synopsis_token
9924,Air Gear,"[Air, Gear]","Itsuki Minami needs no introduction—everybody's heard of the ""Babyface"" of the Eastside. He's the toughest kid at Higashi Junior High School, easy on the eyes but dangerously tough when he needs to be. Plus, Itsuki lives with the mysterious and sexy Noyamano sisters. Life is never dull, but it becomes dangerous when Itsuki leads his school to victory over some vindictive Westside punks with gangster connections. Now he stands to lose his school, his friends, and everything he cares about. But in his darkest hour, the Noyamano girls come to Itsuki's aid. They can teach him a powerful skill that will save their school from the gangsters' siege–and introduce Itsuki to a thrilling and terrifying new world.\n\n","[Itsuki, Minami, need, introduction, hear, tough, kid, Higashi, Junior, High, School, easy, eye, dangerously, tough, need, Itsuki, live, mysterious, sexy, Noyamano, sister, life, never, dull, become, dangerous, Itsuki, lead, school, victory, vindictive, punk, gangster, connection, now, stand, lose, school, friend, care, darkest, hour, Noyamano, girl, come, Itsuki, aid, teach, powerful, skill, save, school, gangster, siege, introduce, Itsuki, thrilling, terrify, new, world]"


Since titles and synopses play different roles (titles are short and often genre-indicative, while synopses provide detailed content descriptions), we use two separate vectorizers.

In [17]:
# vectorization

from sklearn.feature_extraction.text import TfidfVectorizer

# initialize title tfidf
tfidf_title = TfidfVectorizer(
    ngram_range=(1,1),  # uni-gram
    max_features=500,   # title are shorter, use lower dim
    min_df=1,           # titles may contain rare but important words
    max_df=0.8          # filter out very common words
)

# initialize synopsis tfidf
tfidf_synopsis = TfidfVectorizer(
    ngram_range=(1,2),  # uni-gram or bi-gram
    max_features=2000,  # synopsis are longer, use higher dim
    min_df=2,           # filter out extremely rare words
    max_df=0.8          # filter out very common words
)

# train tfidf
title_en_tfidf_matrix = tfidf_title.fit_transform(train['title_en_token'].apply(lambda x: " ".join(x)))
synopsis_tfidf_matrix = tfidf_synopsis.fit_transform(train['synopsis_token'].apply(lambda x: " ".join(x)))

# add result to train df
train['title_en_tfidf'] = [title_en_tfidf_matrix[i] for i in range(title_en_tfidf_matrix.shape[0])]
train['synopsis_tfidf'] = [synopsis_tfidf_matrix[i] for i in range(synopsis_tfidf_matrix.shape[0])]

For word embedding, we choose **Word2Vec - Skip-gram** because Tends to capture rare words more effectively (e.g. niche anime/manga-specific vocabulary), comparing to Word2Vec - CBOW and GloVe.

In [18]:
# word embedding
import multiprocessing
from gensim.models import Word2Vec

# dynamically determine the number of CPU cores
num_workers = multiprocessing.cpu_count()

# train title skipgram model
skipgram_model_title = Word2Vec(
    train['title_en_token'].tolist(),
    sg=1,           # skip-gram
    vector_size=50, # title are shorter, use lower dim
    window=2,       # title are shorter, use smaller window size
    min_count=1,    # titles may contain rare but important words
    epochs=30,      # title are shorter, need more epochs to train
    workers=num_workers,
    seed=RANDOM_SEED
)

# train synopsis skipgram model
skipgram_model_synopsis = Word2Vec(
    train['synopsis_token'].tolist(),
    sg=1,               # skip-gram
    vector_size=150,    # synopsis are longer, use higher dim
    window=5,           # synopsis are longer, use larger window size
    min_count=2,        # filter out extremely rare words
    epochs=15,
    workers=num_workers,
    seed=RANDOM_SEED
)

# clean tokens that does not exist in the skipgram vocab (because of `min_count`)
model_vocab = set(skipgram_model_synopsis.wv.index_to_key)
train['synopsis_token'] = train['synopsis_token'].apply(lambda x: [token for token in x if token in model_vocab])
train = train[train['synopsis_token'].apply(lambda x: len(x) > 0)]  # filter out empty entry after clean tokens

# apply skipgram model
train['title_en_skipgram'] = train['title_en_token'].apply(lambda x: skipgram_model_title.wv[x])
train['synopsis_skipgram'] = train['synopsis_token'].apply(lambda x: skipgram_model_synopsis.wv[x])

In [22]:
train

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,original_source,members,favorites,total_duration,rating,sfw,approved,start_year,start_season,real_start_date,real_end_date,genres,themes,demographics,studios,producers,synopsis,title_english,title_japanese,volumes,authors,serializations,title_en_token,synopsis_token,title_en_tfidf,synopsis_tfidf,title_en_skipgram,synopsis_skipgram
6972,Rakka-sei,ona,6.17,144,finished_airing,1.0,2021-01-09,2021-01-09,original,316,0,0 days 00:01:35,pg_13,True,True,2021.0,winter,2021-01-09,2021-01-09,['Supernatural'],['Music'],[],['Flat Studio'],[],An original concept movie directed by banishme...,Life of Falling,落下生,,,,"[life, fall]","[original, concept, movie, direct, banishment,...","(0, 134)\t0.7702479530861399\n (0, 242)\t0....","(0, 584)\t0.17446410372690827\n (0, 1510)\t...","[[-0.6176944, -0.55040276, -1.0358789, 0.16613...","[[-0.7382443, 0.17262861, 0.08548382, 0.240234..."
8659,Sora Iro Hana Iro,movie,,52,finished_airing,1.0,2005-01-01,2005-01-01,,244,0,0 days 00:06:40,pg,True,True,,,2005,2005,"['Drama', 'Slice of Life']",[],[],[],['Tomoyasu Murata Company'],A woman and a dog go on a spiritual journey an...,Sky Colour Flower Colour,空色花色,,,,"[Sky, Colour, Flower, Colour]","[woman, dog, go, spiritual, journey, undergo, ...","(0, 146)\t0.6954718347226682\n (0, 389)\t0....","(0, 1360)\t0.23839687929050968\n (0, 635)\t...","[[0.3555759, -0.8184715, 0.13629015, -0.535150...","[[0.36267883, 0.0076106847, -0.12620318, -0.09..."
13144,"Doumo, Suki na Hito ni Horegusuri wo Irai sare...",,7.33,454,currently_publishing,,2020-08-28,,manga,1986,14,,,True,True,,,2020-08-28,,"['Fantasy', 'Romance']",[],[],,,"""I want you to make a love potion.""\n\nThe Goo...","Hi, I'm a Witch, and My Crush Wants Me to Make...",どうも、好きな人に惚れ薬を依頼された魔女です。,,"[{'id': 26905, 'first_name': 'Misato', 'last_n...",['Flos Comic'],"[Hi, witch, my, Crush, want, I, make, Love, Po...","[want, make, love, potion, the, Good, Witch, o...","(0, 253)\t0.3028344729691104\n (0, 263)\t0....","(0, 1201)\t0.15183993865040285\n (0, 1470)\...","[[0.12223349, -0.09126734, -0.16066936, -0.361...","[[-0.19333898, 0.06426527, -0.60138196, 0.4696..."
11686,Okaeri Alice,,7.13,4478,currently_publishing,,2020-04-09,,manga,16490,213,,,True,True,,,2020-04-09,,"['Drama', 'Romance']","['Love Polygon', 'School']",['Shounen'],,,"Childhood friends Youhei, Kei, and Yui are reu...","Welcome Back, Alice",おかえりアリス,,"[{'id': 6579, 'first_name': 'Shuuzou', 'last_n...",['Bessatsu Shounen Magazine'],"[welcome, Alice]","[childhood, friend, Youhei, Kei, Yui, reunite,...","(0, 11)\t0.7029792660122359\n (0, 476)\t0.7...","(0, 282)\t0.19316728993571086\n (0, 851)\t0...","[[-0.120777614, 0.03527056, -0.22116718, -0.90...","[[0.4910193, -0.18392427, 0.21176814, -0.16055..."
24369,Usotsuki Kyoushi to Shinitagari,,,14,finished,7.0,2019-04-24,2019-08-01,manga,33,2,,,False,True,,,2019-04-24,2019-08,"['Boys Love', 'Erotica']",['School'],[],,,"""Sensei, if you lie to me, I'll jump off right...",Liar Teacher & Suicidal Boy,うそつき教師と死にたがり,1.0,"[{'id': 43084, 'first_name': '', 'last_name': ...",[],"[Liar, Teacher, Suicidal, Boy]","[Sensei, lie, jump, right, now, school, roofto...","(0, 46)\t0.6504697887406832\n (0, 431)\t0.7...","(0, 1482)\t0.13160878206238308\n (0, 1086)\...","[[-0.0019331471, -0.1677159, -0.31984, -0.6544...","[[-0.5786341, 0.2120141, -0.4648349, 0.7544175..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15377,Deja-vu,,7.05,242,finished,6.0,2004-01-01,,manhwa,669,4,,,True,True,,,2004,,"['Drama', 'Fantasy', 'Romance', 'Sci-Fi']",['Historical'],[],,,"A collection of four oneshots (Spring, Summer,...","Deja-vu: Spring, Summer, Fall, Winter",데자부,1.0,"[{'id': 2049, 'first_name': 'In-Wan', 'last_na...",[],"[Deja, vu, Spring, ,, Summer, ,, Fall, Winter]","[collection, four, oneshot, Spring, ,, Summer,...","(0, 482)\t0.5592869177115406\n (0, 421)\t0....","(0, 1643)\t0.1824885216615261\n (0, 346)\t0...","[[-0.025306242, 0.0058005485, -0.051206335, -0...","[[0.4465498, 0.5653541, 0.41185227, 0.23828869..."
21602,Sheets no Namima de Miru Yume Mitai na,,,53,finished,10.0,2019-03-01,2020-10-19,manga,134,2,,,False,True,,,2019-03-01,2020-10-19,['Erotica'],[],['Josei'],,,Tender caresses and kisses exchanged in a hote...,A Dream Between the Sheets,シーツの波間でみる夢みたいな,2.0,"[{'id': 59120, 'first_name': 'Ruka', 'last_nam...",[],"[dream, sheet]","[tender, caress, kiss, exchange, hotel, suite,...","(0, 116)\t1.0","(0, 887)\t0.2069650059535709\n (0, 832)\t0....","[[-0.20427406, -0.11735094, 0.063083075, -0.45...","[[-0.22137056, 0.11321116, -0.21867803, -0.200..."
17730,Shichinin no Nana,,6.54,227,finished,27.0,2001-11-08,2002-05-25,manga,520,3,,,True,True,,,2001-11-08,2002-05-25,"['Comedy', 'Romance']",[],[],,,The fiasco started one ordinary night when Nan...,Seven of Seven,七人のナナ,3.0,"[{'id': 3710, 'first_name': 'Yasuhiro', 'last_...",['Shounen Champion (Weekly)'],"[seven, Seven]","[fiasco, start, one, ordinary, night, Nana, Su...","(0, 379)\t1.0","(0, 398)\t0.2213760138471667\n (0, 1607)\t0...","[[0.19594955, 0.08623988, -0.2528404, -0.13785...","[[0.0635403, 0.019355707, -0.057407103, 0.0702..."
15725,Gakuen Police,,6.59,659,finished,16.0,2012-05-22,2014-06-21,manga,2047,2,,,True,True,,,2012-05-22,2014-06-21,"['Comedy', 'Girls Love', 'Romance']",['School'],['Seinen'],,,Ever since Sasami was a little girl she's admi...,Gakuen Polizi,学園ポリーチェ,2.0,"[{'id': 3182, 'first_name': 'Milk', 'last_name...",['Comic High!'],"[Gakuen, Polizi]","[ever, Sasami, little, girl, admire, defend, j...",,"(0, 1976)\t0.1083459491801386\n (0, 920)\t0...","[[-0.057963822, -0.3878628, -0.17901114, -0.50...","[[0.25038865, 0.21894822, -0.30093294, -0.1645..."
