In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Seed for the stochastic steps of this notebook (e.g. the train/test split).
RANDOM_SEED = 123

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [3]:
# Load the raw anime and manga tables (paths are relative to the working directory).
df_anime = pd.read_csv("assets/anime.csv")
df_manga = pd.read_csv("assets/manga.csv")

In [4]:
# (rows, columns) of each raw table
df_anime.shape, df_manga.shape

((24985, 39), (64833, 30))

## Data Preprocessing

In [5]:
def data_cleaning(input_anime, input_manga):
    """Return cleaned copies of the raw anime and manga frames.

    The same steps are applied to both frames; the inputs are not modified:

    1. Drop identifier, URL and bookkeeping columns that are not used later.
    2. Drop rows with a missing 'title', 'synopsis', 'title_english' or
       'title_japanese'.
    3. Remove '(Source: ...)' and '[Written by ...]' attributions from
       'synopsis' and trim the whitespace left behind.

    The shape of each cleaned frame is printed.

    Args:
        input_anime: raw anime DataFrame.
        input_manga: raw manga DataFrame.

    Returns:
        Tuple ``(df_anime, df_manga)`` of cleaned DataFrames.

    Raises:
        KeyError: if a column listed below is missing from its frame.
    """
    anime_unused = ['anime_id', 'main_picture', 'url', 'trailer_url', 'background', 'created_at', 'updated_at', 'episode_duration', 'broadcast_day', 'broadcast_time', 'licensors', 'title_synonyms']
    manga_unused = ['manga_id', 'main_picture', 'url', 'background', 'created_at_before', 'updated_at', 'title_synonyms']
    required_text = ['title', 'synopsis', 'title_english', 'title_japanese']

    # Match only up to the bracket that closes the attribution itself. A greedy
    # '.*' would run to the LAST ')' / ']' on the line and delete any synopsis
    # text in between. One level of nested parentheses is tolerated, e.g.
    # '(Source: Foo (bar))'. Matches never cross a line break.
    source_note = re.compile(r'\(Source:(?:[^()\n]|\([^()\n]*\))*\)')
    written_by_note = re.compile(r'\[Written by[^\]\n]*\]')

    def strip_attributions(text):
        # Remove both attribution styles, then the whitespace they leave behind.
        text = source_note.sub('', text)
        text = written_by_note.sub('', text)
        return text.strip()

    def clean(frame, unused_columns):
        # drop / dropna / assign each return a new frame, so the caller's
        # DataFrame is never mutated.
        kept = frame.drop(columns=unused_columns).dropna(subset=required_text)
        return kept.assign(synopsis=kept['synopsis'].apply(strip_attributions))

    df_anime = clean(input_anime, anime_unused)
    df_manga = clean(input_manga, manga_unused)

    print('cleaned anime shape: ', df_anime.shape)
    print('cleaned manga shape: ', df_manga.shape)

    return df_anime, df_manga

# Clean both raw tables; data_cleaning works on copies, so df_anime / df_manga stay as loaded.
df_anime_cleaned, df_manga_cleaned = data_cleaning(df_anime, df_manga)

cleaned anime shape:  (9506, 27)
cleaned manga shape:  (15668, 23)


In [6]:
# Columns that exist only in the cleaned anime frame.
print(
    "anime extra columns: \n",
    [col for col in df_anime_cleaned.columns if col not in df_manga_cleaned.columns],
)

anime extra columns: 
 ['episodes', 'source', 'total_duration', 'rating', 'start_year', 'start_season', 'studios', 'producers']


In [7]:
# Columns that exist only in the cleaned manga frame.
print(
    "manga extra columns: \n",
    [col for col in df_manga_cleaned.columns if col not in df_anime_cleaned.columns],
)

manga extra columns: 
 ['volumes', 'chapters', 'authors', 'serializations']


In [8]:
# Columns shared by both cleaned frames, in anime column order.
print(
    "common columns: \n",
    [col for col in df_anime_cleaned.columns if col in df_manga_cleaned.columns],
)

common columns: 
 ['title', 'type', 'score', 'scored_by', 'status', 'start_date', 'end_date', 'members', 'favorites', 'sfw', 'approved', 'real_start_date', 'real_end_date', 'genres', 'themes', 'demographics', 'synopsis', 'title_english', 'title_japanese']


In [9]:
# Distinct values of the raw manga 'type' column.
df_manga["type"].unique()

array(['manga', 'manhwa', 'light_novel', 'one_shot', 'manhua', 'novel',
       'doujinshi'], dtype=object)

In [10]:
# Distinct values of the raw anime 'source' column.
df_anime["source"].unique()

array(['manga', 'visual_novel', 'original', 'web_manga', 'light_novel',
       'novel', 'game', '4_koma_manga', 'music', 'other', 'web_novel',
       'card_game', 'book', 'mixed_media', nan, 'picture_book', 'radio'],
      dtype=object)

In [11]:
# Distinct values of the raw anime 'type' column.
df_anime["type"].unique()

array(['tv', 'movie', 'ona', 'ova', 'special', 'music', nan], dtype=object)

**Extra columns alignment**:
| anime columns | manga columns | combine |
| --- | --- | --- |
| episodes | chapters | episodes/chapters |
| NULL | volumes |
| source | type | original_source |
| type | NULL |
| total_duration | NULL |
| rating | NULL |
| start_year | NULL |
| start_season | NULL |
| studios | NULL |
| producers | NULL |
| NULL | authors |
| NULL | serializations |

In [12]:
def columns_alignment(input_anime, input_manga):
    """Rename columns so anime and manga share names for comparable fields.

    * anime 'episodes' and manga 'chapters' both become 'episodes/chapters'.
    * anime 'source' and manga 'type' both become 'original_source'.

    All other columns are left as they are, and the inputs are not modified.

    Returns:
        Tuple ``(df_anime, df_manga)`` of renamed DataFrames.
    """
    anime_renames = {
        'episodes': 'episodes/chapters',
        'source': 'original_source',
    }
    manga_renames = {
        'chapters': 'episodes/chapters',
        'type': 'original_source',
    }

    # rename() without inplace returns a new frame, so no explicit copy is needed.
    aligned_anime = input_anime.rename(columns=anime_renames)
    aligned_manga = input_manga.rename(columns=manga_renames)
    return aligned_anime, aligned_manga

# Give comparable anime/manga columns the same names before the two frames are combined.
df_anime_aligned, df_manga_aligned = columns_alignment(df_anime_cleaned, df_manga_cleaned)


In [13]:
# Stack anime rows on top of manga rows and renumber the index from 0.
# Columns that exist in only one of the two frames are NaN for rows of the other.
df_full = pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)
df_full

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,original_source,members,favorites,total_duration,rating,sfw,approved,start_year,start_season,real_start_date,real_end_date,genres,themes,demographics,studios,producers,synopsis,title_english,title_japanese,volumes,authors,serializations
0,Fullmetal Alchemist: Brotherhood,tv,9.10,2037075,finished_airing,64.0,2009-04-05,2010-07-04,manga,3206028,219036,1 days 01:57:20,r,True,True,2009.0,spring,2009-04-05,2010-07-04,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,,,
1,Hunter x Hunter (2011),tv,9.04,1671587,finished_airing,148.0,2011-10-02,2014-09-24,manga,2688079,202109,2 days 10:15:16,pg_13,True,True,2011.0,fall,2011-10-02,2014-09-24,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",Hunters devote themselves to accomplishing haz...,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,,,
2,Shingeki no Kyojin Season 3 Part 2,tv,9.05,1491491,finished_airing,10.0,2019-04-29,2019-07-01,manga,2133927,55644,0 days 03:59:10,r,True,True,2019.0,spring,2019-04-29,2019-07-01,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,,,
3,Steins;Gate,tv,9.07,1348232,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2463954,184312,0 days 09:44:00,pg_13,True,True,2011.0,spring,2011-04-06,2011-09-14,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...,Steins;Gate,STEINS;GATE,,,
4,Koe no Katachi,movie,8.94,1540277,finished_airing,1.0,2016-09-17,2016-09-17,manga,2218467,84124,0 days 02:10:03,pg_13,True,True,2016.0,summer,2016-09-17,2016-09-17,"['Award Winning', 'Drama']",['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","As a wild youth, elementary school student Sho...",A Silent Voice,聲の形,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25169,Bijo to Yajuu?,,,1,finished,,2015-07-01,2015-07-01,manga,4,0,,,True,True,,,2015-07-01,2015-07-01,['Romance'],[],[],,,Born to a noble politician in a prominent poli...,Beauty and the Beastmaster,美女と野獣?,1.0,"[{'id': 12314, 'first_name': 'Carol', 'last_na...",[]
25170,Madarame Shunin wa Ecchi de Zurui!,,,1,finished,16.0,2019-04-12,2020-07-15,manga,4,0,,,False,True,,,2019-04-12,2020-07-15,['Erotica'],[],['Josei'],,,"""If you can get me excited, then I'll give you...",No Fair! Chief Madarame Is a Pervert! I'll Do ...,班目主任はエッチでずるい!,2.0,"[{'id': 52199, 'first_name': 'Roca', 'last_nam...",[]
25171,PSO2 New Genesis: Central!,,,1,currently_publishing,,2021-09-15,,manga,4,0,,,True,True,,,2021-09-15,,"['Comedy', 'Sci-Fi']",[],[],,,Story following the exploits and updates in Ph...,Central!,PSO2 ニュージェネシス せんとらるっ!,,"[{'id': 66980, 'first_name': '', 'last_name': ...",[]
25172,Itazura na Ai no Shisha,,,1,finished,,2017-09-30,2017-09-30,manga,4,0,,,True,True,,,2017-09-30,2017-09-30,['Romance'],[],['Josei'],,,"Ellie, who works at a law firm, is at her wits...",Expecting the Fellani Heir,いたずらな愛の使者,1.0,"[{'id': 13921, 'first_name': 'Lucy', 'last_nam...",['Bessatsu Harlequin']


In [14]:
# Train/test split: hold out 20% of the combined rows; RANDOM_SEED makes the split reproducible.
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)

### Text processing

In [15]:
# Text fields of the training split used for tokenization.
title_en = train["title_english"]
synopsis = train["synopsis"]

We tokenize the title and synopsis, keeping only tokens whose part-of-speech tag is in `{'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}` or that are part of a named entity.

In [16]:
# Tokenization setup
import spacy

# Part-of-speech tags (compared against spaCy's `token.pos_`) whose tokens are kept.
POS_TO_KEEP = {'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}

# English spaCy pipeline; requires the `en_core_web_sm` model package to be installed.
nlp = spacy.load('en_core_web_sm')

def tokenization(text):
    """Return the tokens of ``text`` worth keeping, as a list of strings.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept when its part-of-speech tag is in ``POS_TO_KEEP`` or when it is
    part of a named entity. Each kept token appears once, in document order,
    with its original surface form (``token.text``).

    Args:
        text: the string to tokenize.

    Returns:
        list[str] of kept token texts.
    """
    doc = nlp(text)
    # `token.ent_type_` holds the label of the entity the token belongs to and
    # is empty for tokens outside every entity, so entity membership can be
    # read off the token instead of re-scanning `doc.ents` for each token.
    return [
        token.text
        for token in doc
        if token.pos_ in POS_TO_KEEP or token.ent_type_
    ]

# Tokenize every training title and synopsis; tokenization() is called once per row.
title_en_tok = title_en.apply(tokenization)
synopsis_tok = synopsis.apply(tokenization)  # This may take around 5 mins

In [18]:
# Review tokenization: print one training example next to its tokens.
import random

# The sampled row label gets its own name: calling it `test` would overwrite
# the held-out `test` split returned by train_test_split.
# A locally seeded generator makes the shown example reproducible on re-run.
sample_idx = random.Random(RANDOM_SEED).choice(title_en.index.tolist())

# .loc makes it explicit that sample_idx is an index label, not a position.
print(title_en.loc[sample_idx])
print(title_en_tok.loc[sample_idx])
print('\n---\n')
print(synopsis.loc[sample_idx])
print(synopsis_tok.loc[sample_idx])

My Androgynous Boyfriend
['Androgynous', 'Boyfriend']

---

Wako Machida wants nothing more than to show the world how cute her boyfriend is. Meguru Soma is a stylish boy who loves the "genderless" fashion subculture and can make himself stand out in any crowd. Though their relationship may appear unconventional, Wako and Meguru's shared obsession with makeup and the latest trends is more than enough to bring them together.


['Wako', 'Machida', 'wants', 'more', 'show', 'world', 'cute', 'boyfriend', 'Meguru', 'Soma', 'stylish', 'boy', 'loves', 'genderless', 'fashion', 'subculture', 'make', 'stand', 'crowd', 'relationship', 'appear', 'unconventional', 'Wako', 'Meguru', 'shared', 'obsession', 'makeup', 'latest', 'trends', 'more', 'enough', 'bring', 'together']
