In [1]:
!where python # ensure you are in virtual environment

c:\Users\HK-Laptop-V639\Documents\GitHub\696\env696\Scripts\python.exe
C:\Users\HK-Laptop-V639\AppData\Local\Programs\Python\Python312\python.exe
C:\Users\HK-Laptop-V639\AppData\Local\Microsoft\WindowsApps\python.exe


In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
pd.set_option("display.max_columns", None)
RANDOM_SEED = 123

In [4]:
df_anime = pd.read_csv("assets/anime.csv")
df_manga = pd.read_csv("assets/manga.csv")

In [5]:
df_anime.shape, df_manga.shape

((24985, 39), (64833, 30))

## Data Prepocessing

In [6]:
def data_cleaning(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # remove unnecessary columns
    df_anime = df_anime.drop(columns=['anime_id', 'main_picture', 'url', 'trailer_url', 'background', 'created_at', 'updated_at', 'episode_duration', 'broadcast_day', 'broadcast_time', 'licensors', 'title_synonyms'])
    df_manga = df_manga.drop(columns=['manga_id', 'main_picture', 'url', 'background', 'created_at_before', 'updated_at', 'title_synonyms'])

    # remove rows that are null in 'synopsis' and 'title', which are crucial for our project
    df_anime.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)
    df_manga.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)

    # remove '(Sources:...)' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))

    # remove '[Written by ...]' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))

    print('cleaned anime shape: ', df_anime.shape)
    print('cleaned manga shape: ', df_manga.shape)

    return df_anime, df_manga

df_anime_cleaned, df_manga_cleaned = data_cleaning(df_anime, df_manga)

cleaned anime shape:  (9506, 27)
cleaned manga shape:  (15668, 23)


In [7]:
print("anime extra columns: \n", [col for col in df_anime_cleaned.columns if not col in df_manga_cleaned.columns])

anime extra columns: 
 ['episodes', 'source', 'total_duration', 'rating', 'start_year', 'start_season', 'studios', 'producers']


In [8]:
print("manga extra columns: \n", [col for col in df_manga_cleaned.columns if not col in df_anime_cleaned.columns])

manga extra columns: 
 ['volumes', 'chapters', 'authors', 'serializations']


In [9]:
print("common columns: \n", [col for col in df_anime_cleaned.columns if col in df_manga_cleaned.columns])

common columns: 
 ['title', 'type', 'score', 'scored_by', 'status', 'start_date', 'end_date', 'members', 'favorites', 'sfw', 'approved', 'real_start_date', 'real_end_date', 'genres', 'themes', 'demographics', 'synopsis', 'title_english', 'title_japanese']


In [10]:
df_manga['type'].unique()

array(['manga', 'manhwa', 'light_novel', 'one_shot', 'manhua', 'novel',
       'doujinshi'], dtype=object)

In [11]:
df_anime['source'].unique()

array(['manga', 'visual_novel', 'original', 'web_manga', 'light_novel',
       'novel', 'game', '4_koma_manga', 'music', 'other', 'web_novel',
       'card_game', 'book', 'mixed_media', nan, 'picture_book', 'radio'],
      dtype=object)

In [12]:
df_anime['type'].unique()

array(['tv', 'movie', 'ona', 'ova', 'special', 'music', nan], dtype=object)

**Extra columns alignment**:
| anime columns | manga columns | combine |
| --- | --- | --- |
| episodes | chapters | episodes/chapters |
| NULL | volumes |
| source | type | original_source |
| type | NULL |
| total_duration | NULL |
| rating | NULL |
| start_year | NULL |
| start_season | NULL |
| studios | NULL |
| producers | NULL |
| NULL | authors |
| NULL | serializations |

In [13]:
def columns_alignment(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # treat 'episodes' and 'chapters' the same, create null value for 'volume' in anime
    df_anime.rename(columns={'episodes': 'episodes/chapters'}, inplace=True)
    df_manga.rename(columns={'chapters': 'episodes/chapters'}, inplace=True)

    # 'source' in anime is corresponding to the 'type' in manga, so combine them
    df_anime.rename(columns={'source': 'original_source'}, inplace=True)
    df_manga.rename(columns={'type': 'original_source'}, inplace=True)

    return df_anime, df_manga

df_anime_aligned, df_manga_aligned = columns_alignment(df_anime_cleaned, df_manga_cleaned)


In [14]:
df_full = pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)
df_full

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,original_source,members,favorites,total_duration,rating,sfw,approved,start_year,start_season,real_start_date,real_end_date,genres,themes,demographics,studios,producers,synopsis,title_english,title_japanese,volumes,authors,serializations
0,Fullmetal Alchemist: Brotherhood,tv,9.10,2037075,finished_airing,64.0,2009-04-05,2010-07-04,manga,3206028,219036,1 days 01:57:20,r,True,True,2009.0,spring,2009-04-05,2010-07-04,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,,,
1,Hunter x Hunter (2011),tv,9.04,1671587,finished_airing,148.0,2011-10-02,2014-09-24,manga,2688079,202109,2 days 10:15:16,pg_13,True,True,2011.0,fall,2011-10-02,2014-09-24,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",Hunters devote themselves to accomplishing haz...,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,,,
2,Shingeki no Kyojin Season 3 Part 2,tv,9.05,1491491,finished_airing,10.0,2019-04-29,2019-07-01,manga,2133927,55644,0 days 03:59:10,r,True,True,2019.0,spring,2019-04-29,2019-07-01,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,,,
3,Steins;Gate,tv,9.07,1348232,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2463954,184312,0 days 09:44:00,pg_13,True,True,2011.0,spring,2011-04-06,2011-09-14,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...,Steins;Gate,STEINS;GATE,,,
4,Koe no Katachi,movie,8.94,1540277,finished_airing,1.0,2016-09-17,2016-09-17,manga,2218467,84124,0 days 02:10:03,pg_13,True,True,2016.0,summer,2016-09-17,2016-09-17,"['Award Winning', 'Drama']",['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","As a wild youth, elementary school student Sho...",A Silent Voice,聲の形,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25169,Bijo to Yajuu?,,,1,finished,,2015-07-01,2015-07-01,manga,4,0,,,True,True,,,2015-07-01,2015-07-01,['Romance'],[],[],,,Born to a noble politician in a prominent poli...,Beauty and the Beastmaster,美女と野獣?,1.0,"[{'id': 12314, 'first_name': 'Carol', 'last_na...",[]
25170,Madarame Shunin wa Ecchi de Zurui!,,,1,finished,16.0,2019-04-12,2020-07-15,manga,4,0,,,False,True,,,2019-04-12,2020-07-15,['Erotica'],[],['Josei'],,,"""If you can get me excited, then I'll give you...",No Fair! Chief Madarame Is a Pervert! I'll Do ...,班目主任はエッチでずるい!,2.0,"[{'id': 52199, 'first_name': 'Roca', 'last_nam...",[]
25171,PSO2 New Genesis: Central!,,,1,currently_publishing,,2021-09-15,,manga,4,0,,,True,True,,,2021-09-15,,"['Comedy', 'Sci-Fi']",[],[],,,Story following the exploits and updates in Ph...,Central!,PSO2 ニュージェネシス せんとらるっ!,,"[{'id': 66980, 'first_name': '', 'last_name': ...",[]
25172,Itazura na Ai no Shisha,,,1,finished,,2017-09-30,2017-09-30,manga,4,0,,,True,True,,,2017-09-30,2017-09-30,['Romance'],[],['Josei'],,,"Ellie, who works at a law firm, is at her wits...",Expecting the Fellani Heir,いたずらな愛の使者,1.0,"[{'id': 13921, 'first_name': 'Lucy', 'last_nam...",['Bessatsu Harlequin']


In [15]:
# train test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)
print('Number of rows in train set: ', len(train))

Number of rows in train set:  20139


### Text processing

Tokenization and lemmatization by keeping `{'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}` and named entity.

In [None]:
#!python -m spacy download en_core_web_sm

In [32]:
# tokenization and lemmatization
import spacy

POS_TO_KEEP = {'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}

nlp = spacy.load('en_core_web_sm')

def tokenization(text):
    doc = nlp(text)
    
    # extract named entity spans
    entity_spans = {i for ent in doc.ents for i in range(ent.start, ent.end)}
    
    # filter tokens if it is specified POS or is part of the name entity
    filtered_tokens = [
        token.lemma_ for token in doc 
        if token.pos_ in POS_TO_KEEP or token.i in entity_spans
    ]
    
    return filtered_tokens

train['title_en_token'] = train['title_english'].apply(tokenization)
train['synopsis_token'] = train['synopsis'].apply(tokenization) # this may take around 5 mins

# filter out empty token
train = train[train.title_en_token.apply(lambda x: len(x) > 0) & train.synopsis_token.apply(lambda x: len(x) > 0)] # both columns cannot be empty

print('Number of rows after filtering empty token: ', len(train))

Number of rows after filtering empty token:  20080


In [25]:
# review tokenization
with pd.option_context('display.max_colwidth', None):
    display(train[['title_english', 'title_en_token', 'synopsis', 'synopsis_token']].sample(1))

Unnamed: 0,title_english,title_en_token,synopsis,synopsis_token
2885,Mars Daybreak,"[Mars, Daybreak]","Mars is now almost entirely covered in water. Humanity exists in large city-ships that float through the open seas. But life is hard for those who live on Mars—the economy is in bad shape, work is scarce, and food is expensive and highly prized. Gram and his friends try to do the best they can, but the work keeps drying up. Some have taken to a life of piracy to combat the corruption in the government; one such group is the pirates of the feared Ship of Aurora. And the Earth government, which rules Mars, has dispatched a new team of military pilots to combat them. In their specialized mecha called Round Bucklers, they must make the seas of Mars safe for humanity. Caught in the wrong place at the wrong time, Gram finds himself on the run with the most notorious pirates on Mars. But here's the thing—he's starting to like them!\n\n","[Mars, now, almost, entirely, cover, water, humanity, exist, large, city, ship, float, open, sea, life, hard, live, Mars, economy, bad, shape, work, scarce, food, expensive, highly, prized, Gram, friend, try, do, good, work, keep, dry, take, life, piracy, combat, corruption, government, one, such, group, pirate, fear, Ship, Aurora, Earth, government, rule, Mars, dispatch, new, team, military, pilot, combat, specialized, mecha, call, Round, Bucklers, make, sea, Mars, safe, humanity, catch, wrong, place, wrong, time, Gram, find, run, most, notorious, pirate, Mars, here, thing, start, like]"


Since titles and synopses play different roles (titles are short and often genre-indicative, while synopses provide detailed content descriptions), we use two separate vectorizers.

In [26]:
# vectorization

from sklearn.feature_extraction.text import TfidfVectorizer

# initialize title tfidf
tfidf_title = TfidfVectorizer(
    ngram_range=(1,1),  # uni-gram
    max_features=500,   # title are shorter, use lower dim
    min_df=1,           # titles may contain rare but important words
    max_df=0.8          # filter out very common words
)

# initialize synopsis tfidf
tfidf_synopsis = TfidfVectorizer(
    ngram_range=(1,2),  # uni-gram or bi-gram
    max_features=2000,  # synopsis are longer, use higher dim
    min_df=2,           # filter out extremely rare words
    max_df=0.8          # filter out very common words
)

# train tfidf
title_en_tfidf_matrix = tfidf_title.fit_transform(train['title_en_token'].apply(lambda x: " ".join(x)))
synopsis_tfidf_matrix = tfidf_synopsis.fit_transform(train['synopsis_token'].apply(lambda x: " ".join(x)))

# add result to train df
train['title_en_tfidf'] = [title_en_tfidf_matrix[i] for i in range(title_en_tfidf_matrix.shape[0])]
train['synopsis_tfidf'] = [synopsis_tfidf_matrix[i] for i in range(synopsis_tfidf_matrix.shape[0])]

For word embedding, we choose **Word2Vec - Skip-gram** because Tends to capture rare words more effectively (e.g. niche anime/manga-specific vocabulary), comparing to Word2Vec - CBOW and GloVe.

In [27]:
# word embedding
import multiprocessing
from gensim.models import Word2Vec

# dynamically determine the number of CPU cores
num_workers = multiprocessing.cpu_count()

# train title skipgram model
skipgram_model_title = Word2Vec(
    train['title_en_token'].tolist(),
    sg=1,               # skip-gram
    vector_size=50,     # title are shorter, use lower dim
    window=2,           # title are shorter, use smaller window size
    min_count=1,        # titles may contain rare but important words
    epochs=30,          # title are shorter, need more epochs to train
    workers=num_workers,
    seed=RANDOM_SEED
)

# train synopsis skipgram model
skipgram_model_synopsis = Word2Vec(
    train['synopsis_token'].tolist(),
    sg=1,               # skip-gram
    vector_size=150,    # synopsis are longer, use higher dim
    window=5,           # synopsis are longer, use larger window size
    min_count=2,        # filter out extremely rare words
    epochs=15,
    workers=num_workers,
    seed=RANDOM_SEED
)

# clean tokens that does not exist in the skipgram vocab (because of `min_count`)
model_vocab = set(skipgram_model_synopsis.wv.index_to_key)
train['synopsis_token'] = train['synopsis_token'].apply(lambda x: [token for token in x if token in model_vocab])
train = train[train['synopsis_token'].apply(lambda x: len(x) > 0)]  # filter out empty entry after clean tokens

# apply skipgram model
train['title_en_skipgram'] = train['title_en_token'].apply(lambda x: skipgram_model_title.wv[x])
train['synopsis_skipgram'] = train['synopsis_token'].apply(lambda x: skipgram_model_synopsis.wv[x])

In [28]:
train[['title_japanese', 'title_english', 'synopsis', 'title_en_tfidf', 'title_en_skipgram', 'synopsis_tfidf', 'synopsis_skipgram']]

Unnamed: 0,title_japanese,title_english,synopsis,title_en_tfidf,title_en_skipgram,synopsis_tfidf,synopsis_skipgram
6972,落下生,Life of Falling,An original concept movie directed by banishme...,"(0, 242)\t0.6377445340938734\n (0, 134)\t0....","[[-0.66485, -1.4011656, -0.54034376, -0.339818...","(0, 1242)\t0.14569673008382766\n (0, 1126)\...","[[-0.33755174, 0.6038259, -0.14578481, -0.2066..."
8659,空色花色,Sky Colour Flower Colour,A woman and a dog go on a spiritual journey an...,"(0, 389)\t0.7185533571750159\n (0, 146)\t0....","[[0.6963319, -0.69409543, -0.07727285, -0.6316...","(0, 1545)\t0.16974059323722257\n (0, 1955)\...","[[0.0110145565, 0.061159577, -0.17982647, -0.2..."
13144,どうも、好きな人に惚れ薬を依頼された魔女です。,"Hi, I'm a Witch, and My Crush Wants Me to Make...","""I want you to make a love potion.""\n\nThe Goo...","(0, 484)\t0.47310031371762395\n (0, 286)\t0...","[[0.014632595, -0.045632605, -0.06875107, -0.3...","(0, 1789)\t0.08472398359994046\n (0, 1667)\...","[[0.05459396, 0.2120287, -0.76196116, 0.403660..."
11686,おかえりアリス,"Welcome Back, Alice","Childhood friends Youhei, Kei, and Yui are reu...","(0, 476)\t0.7112103426953927\n (0, 11)\t0.7...","[[0.09484273, -0.07955534, -0.4588565, -0.8927...","(0, 805)\t0.12752718190515108\n (0, 1509)\t...","[[0.3446322, -0.06830638, -0.19729792, 0.16929..."
24369,うそつき教師と死にたがり,Liar Teacher & Suicidal Boy,"""Sensei, if you lie to me, I'll jump off right...","(0, 431)\t0.7595321283103508\n (0, 46)\t0.6...","[[-0.038570467, -0.11628736, -0.35151264, -0.6...","(0, 1509)\t0.06048166608467285\n (0, 1667)\...","[[0.30980316, 0.47167444, -0.37498358, 0.74289..."
...,...,...,...,...,...,...,...
15377,데자부,"Deja-vu: Spring, Summer, Fall, Winter","A collection of four oneshots (Spring, Summer,...","(0, 134)\t0.4636483477884106\n (0, 402)\t0....","[[-0.030162197, -0.018888334, -0.023518674, -0...","(0, 1664)\t0.28970119380727755\n (0, 1116)\...","[[0.16226849, -0.13105954, 0.021690583, 0.2938..."
21602,シーツの波間でみる夢みたいな,A Dream Between the Sheets,Tender caresses and kisses exchanged in a hote...,"(0, 116)\t1.0","[[-0.0856288, -0.328528, 0.124021746, -0.76370...","(0, 986)\t0.08047698770124137\n (0, 1917)\t...","[[0.20996769, 0.013712674, -0.101780675, 0.000..."
17730,七人のナナ,Seven of Seven,The fiasco started one ordinary night when Nan...,"(0, 379)\t1.0","[[-0.13962914, 0.004627935, -0.17478319, -0.21...","(0, 1242)\t0.16858197111774514\n (0, 1133)\...","[[0.09954806, 0.055672526, -0.066426694, 0.059..."
15725,学園ポリーチェ,Gakuen Polizi,Ever since Sasami was a little girl she's admi...,,"[[-0.06084153, -0.2533829, -0.17746073, -0.555...","(0, 805)\t0.10347026164845212\n (0, 1509)\t...","[[-0.037250537, 0.11637662, -0.25541684, -0.26..."
