In [3]:
!where python

c:\Users\user\Desktop\MADS\696\696\env696\Scripts\python.exe
C:\Users\user\AppData\Local\Programs\Python\Python312\python.exe
C:\Users\user\AppData\Local\Microsoft\WindowsApps\python.exe


In [4]:
import pandas as pd
import numpy as np
import re

In [5]:
pd.set_option("display.max_columns", None)
RANDOM_SEED = 123

In [6]:
df_anime = pd.read_csv("assets/anime.csv")
df_manga = pd.read_csv("assets/manga.csv")

In [7]:
df_anime.shape, df_manga.shape

((24985, 39), (64833, 30))

## Data Prepocessing

In [8]:
def data_cleaning(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # remove unnecessary columns
    df_anime = df_anime.drop(columns=['anime_id', 'main_picture', 'url', 'trailer_url', 'background', 'created_at', 'updated_at', 'episode_duration', 'broadcast_day', 'broadcast_time', 'licensors', 'title_synonyms'])
    df_manga = df_manga.drop(columns=['manga_id', 'main_picture', 'url', 'background', 'created_at_before', 'updated_at', 'title_synonyms'])

    # remove rows that are null in 'synopsis' and 'title', which are crucial for our project
    df_anime.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)
    df_manga.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)

    # remove '(Sources:...)' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))

    # remove '[Written by ...]' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))

    print('cleaned anime shape: ', df_anime.shape)
    print('cleaned manga shape: ', df_manga.shape)

    return df_anime, df_manga

df_anime_cleaned, df_manga_cleaned = data_cleaning(df_anime, df_manga)

cleaned anime shape:  (9506, 27)
cleaned manga shape:  (15668, 23)


In [9]:
print("anime extra columns: \n", [col for col in df_anime_cleaned.columns if not col in df_manga_cleaned.columns])

anime extra columns: 
 ['episodes', 'source', 'total_duration', 'rating', 'start_year', 'start_season', 'studios', 'producers']


In [10]:
print("manga extra columns: \n", [col for col in df_manga_cleaned.columns if not col in df_anime_cleaned.columns])

manga extra columns: 
 ['volumes', 'chapters', 'authors', 'serializations']


In [11]:
print("common columns: \n", [col for col in df_anime_cleaned.columns if col in df_manga_cleaned.columns])

common columns: 
 ['title', 'type', 'score', 'scored_by', 'status', 'start_date', 'end_date', 'members', 'favorites', 'sfw', 'approved', 'real_start_date', 'real_end_date', 'genres', 'themes', 'demographics', 'synopsis', 'title_english', 'title_japanese']


In [12]:
df_manga['type'].unique()

array(['manga', 'manhwa', 'light_novel', 'one_shot', 'manhua', 'novel',
       'doujinshi'], dtype=object)

In [13]:
df_anime['source'].unique()

array(['manga', 'visual_novel', 'original', 'web_manga', 'light_novel',
       'novel', 'game', '4_koma_manga', 'music', 'other', 'web_novel',
       'card_game', 'book', 'mixed_media', nan, 'picture_book', 'radio'],
      dtype=object)

In [14]:
df_anime['type'].unique()

array(['tv', 'movie', 'ona', 'ova', 'special', 'music', nan], dtype=object)

**Extra columns alignment**:
| anime columns | manga columns | combine |
| --- | --- | --- |
| episodes | chapters | episodes/chapters |
| NULL | volumes |
| source | type | original_source |
| type | NULL |
| total_duration | NULL |
| rating | NULL |
| start_year | NULL |
| start_season | NULL |
| studios | NULL |
| producers | NULL |
| NULL | authors |
| NULL | serializations |

In [15]:
def columns_alignment(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # treat 'episodes' and 'chapters' the same, create null value for 'volume' in anime
    df_anime.rename(columns={'episodes': 'episodes/chapters'}, inplace=True)
    df_manga.rename(columns={'chapters': 'episodes/chapters'}, inplace=True)

    # 'source' in anime is corresponding to the 'type' in manga, so combine them
    df_anime.rename(columns={'source': 'original_source'}, inplace=True)
    df_manga.rename(columns={'type': 'original_source'}, inplace=True)

    return df_anime, df_manga

df_anime_aligned, df_manga_aligned = columns_alignment(df_anime_cleaned, df_manga_cleaned)


In [38]:
df_full = pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)
df_full.head()

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,original_source,members,favorites,total_duration,rating,sfw,approved,start_year,start_season,real_start_date,real_end_date,genres,themes,demographics,studios,producers,synopsis,title_english,title_japanese,volumes,authors,serializations
0,Fullmetal Alchemist: Brotherhood,tv,9.1,2037075,finished_airing,64.0,2009-04-05,2010-07-04,manga,3206028,219036,1 days 01:57:20,r,True,True,2009.0,spring,2009-04-05,2010-07-04,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,,,
1,Hunter x Hunter (2011),tv,9.04,1671587,finished_airing,148.0,2011-10-02,2014-09-24,manga,2688079,202109,2 days 10:15:16,pg_13,True,True,2011.0,fall,2011-10-02,2014-09-24,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",Hunters devote themselves to accomplishing haz...,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,,,
2,Shingeki no Kyojin Season 3 Part 2,tv,9.05,1491491,finished_airing,10.0,2019-04-29,2019-07-01,manga,2133927,55644,0 days 03:59:10,r,True,True,2019.0,spring,2019-04-29,2019-07-01,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,,,
3,Steins;Gate,tv,9.07,1348232,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2463954,184312,0 days 09:44:00,pg_13,True,True,2011.0,spring,2011-04-06,2011-09-14,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...,Steins;Gate,STEINS;GATE,,,
4,Koe no Katachi,movie,8.94,1540277,finished_airing,1.0,2016-09-17,2016-09-17,manga,2218467,84124,0 days 02:10:03,pg_13,True,True,2016.0,summer,2016-09-17,2016-09-17,"['Award Winning', 'Drama']",['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","As a wild youth, elementary school student Sho...",A Silent Voice,聲の形,,,


### Handle Missing data

In [37]:
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# Non-numerical columns
Non_Numerical = ['type', 'original_source', 'rating', 'studios', 'producers', 'authors' ,'serializations']

# Create an imputer for non-numerical data filling with 'Missing'
categorical_imputer = SimpleImputer(strategy='constant', fill_value='Missing')

# Impute non-numerical columns
df_full[Non_Numerical] = categorical_imputer.fit_transform(df_full[Non_Numerical])

# Numerical columns
Numerical = ['score', 'scored_by', 'episodes/chapters', 'members', 'favorites']

# Create an imputer for numerical data using KNNImputer
knn_imputer = KNNImputer(n_neighbors=3)

# Impute numerical columns
df_full[Numerical] = knn_imputer.fit_transform(df_full[Numerical])

# Display the final result with both numerical and categorical columns imputed
df_full.isnull().sum()


title                    0
type                     0
score                    0
scored_by                0
status                   0
episodes/chapters        0
start_date             454
end_date              4455
original_source          0
members                  0
favorites                0
total_duration       15880
rating                   0
sfw                      0
approved                 0
start_year           16367
start_season         16367
real_start_date        454
real_end_date         4455
genres                   0
themes                   0
demographics             0
studios                  0
producers                0
synopsis                 0
title_english            0
title_japanese           0
volumes              13922
authors                  0
serializations           0
dtype: int64

In [17]:
# train test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)
print('Number of rows in train set: ', len(train))

Number of rows in train set:  20139


### Data Normalization

In [27]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Selecting columns to normalize
columns_to_normalize = ["scored_by", "episodes/chapters", "members", "favorites"]

# Normalizing the selected columns
train_normalized = train.copy()
train_normalized[columns_to_normalize] = scaler.fit_transform(train[columns_to_normalize])
train_normalized

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,original_source,members,favorites,total_duration,rating,sfw,approved,start_year,start_season,real_start_date,real_end_date,genres,themes,demographics,studios,producers,synopsis,title_english,title_japanese,volumes,authors,serializations,title_en_token,synopsis_token
6972,Rakka-sei,ona,6.17,-0.195634,finished_airing,-0.403343,2021-01-09,2021-01-09,original,-0.229947,-0.112044,0 days 00:01:35,pg_13,True,True,2021.0,winter,2021-01-09,2021-01-09,['Supernatural'],['Music'],[],['Flat Studio'],[],An original concept movie directed by banishme...,Life of Falling,落下生,,,,"[life, fall]","[original, concept, movie, direct, banishment,..."
8659,Sora Iro Hana Iro,movie,,-0.196606,finished_airing,-0.403343,2005-01-01,2005-01-01,,-0.230403,-0.112044,0 days 00:06:40,pg,True,True,,,2005,2005,"['Drama', 'Slice of Life']",[],[],[],['Tomoyasu Murata Company'],A woman and a dog go on a spiritual journey an...,Sky Colour Flower Colour,空色花色,,,,"[Sky, Colour, Flower, Colour]","[woman, dog, go, spiritual, journey, undergo, ..."
13144,"Doumo, Suki na Hito ni Horegusuri wo Irai sare...",,7.33,-0.192360,currently_publishing,,2020-08-28,,manga,-0.219369,-0.109208,,,True,True,,,2020-08-28,,"['Fantasy', 'Romance']",[],[],,,"""I want you to make a love potion.""\r\n\r\nThe...","Hi, I'm a Witch, and My Crush Wants Me to Make...",どうも、好きな人に惚れ薬を依頼された魔女です。,,"[{'id': 26905, 'first_name': 'Misato', 'last_n...",['Flos Comic'],"[Hi, witch, my, Crush, want, I, make, Love, Po...","[want, make, love, potion, the, Good, Witch, o..."
11686,Okaeri Alice,,7.13,-0.149860,currently_publishing,,2020-04-09,,manga,-0.127496,-0.068903,,,True,True,,,2020-04-09,,"['Drama', 'Romance']","['Love Polygon', 'School']",['Shounen'],,,"Childhood friends Youhei, Kei, and Yui are reu...","Welcome Back, Alice",おかえりアリス,,"[{'id': 6579, 'first_name': 'Shuuzou', 'last_n...",['Bessatsu Shounen Magazine'],"[welcome, Alice]","[childhood, friend, Youhei, Kei, Yui, reunite,..."
24369,Usotsuki Kyoushi to Shinitagari,,,-0.197007,finished,-0.295353,2019-04-24,2019-08-01,manga,-0.231739,-0.111639,,,False,True,,,2019-04-24,2019-08,"['Boys Love', 'Erotica']",['School'],[],,,"""Sensei, if you lie to me, I'll jump off right...",Liar Teacher & Suicidal Boy,うそつき教師と死にたがり,1.0,"[{'id': 43084, 'first_name': '', 'last_name': ...",[],"[Liar, Teacher, Suicidal, Boy]","[Sensei, lie, jump, right, now, school, roofto..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15377,Deja-vu,,7.05,-0.194599,finished,-0.313351,2004-01-01,,manhwa,-0.227711,-0.111234,,,True,True,,,2004,,"['Drama', 'Fantasy', 'Romance', 'Sci-Fi']",['Historical'],[],,,"A collection of four oneshots (Spring, Summer,...","Deja-vu: Spring, Summer, Fall, Winter",데자부,1.0,"[{'id': 2049, 'first_name': 'In-Wan', 'last_na...",[],"[Deja, vu, Spring, ,, Summer, ,, Fall, Winter]","[collection, four, oneshot, Spring, ,, Summer,..."
21602,Sheets no Namima de Miru Yume Mitai na,,,-0.196595,finished,-0.241358,2019-03-01,2020-10-19,manga,-0.231100,-0.111639,,,False,True,,,2019-03-01,2020-10-19,['Erotica'],[],['Josei'],,,Tender caresses and kisses exchanged in a hote...,A Dream Between the Sheets,シーツの波間でみる夢みたいな,2.0,"[{'id': 59120, 'first_name': 'Ruka', 'last_nam...",[],"[dream, sheet]","[tender, caress, kiss, exchange, hotel, suite,..."
17730,Shichinin no Nana,,6.54,-0.194758,finished,0.064614,2001-11-08,2002-05-25,manga,-0.228655,-0.111436,,,True,True,,,2001-11-08,2002-05-25,"['Comedy', 'Romance']",[],[],,,The fiasco started one ordinary night when Nan...,Seven of Seven,七人のナナ,3.0,"[{'id': 3710, 'first_name': 'Yasuhiro', 'last_...",['Shounen Champion (Weekly)'],"[seven, Seven]","[fiasco, start, one, ordinary, night, Nana, Su..."
15725,Gakuen Police,,6.59,-0.190195,finished,-0.133368,2012-05-22,2014-06-21,manga,-0.218982,-0.111639,,,True,True,,,2012-05-22,2014-06-21,"['Comedy', 'Girls Love', 'Romance']",['School'],['Seinen'],,,Ever since Sasami was a little girl she's admi...,Gakuen Polizi,学園ポリーチェ,2.0,"[{'id': 3182, 'first_name': 'Milk', 'last_name...",['Comic High!'],"[Gakuen, Polizi]","[ever, Sasami, little, girl, admire, defend, j..."


### Text processing

Tokenization and lemmatization by keeping `{'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}` and named entity.

In [23]:
#!python -m spacy download en_core_web_sm

In [22]:
# tokenization and lemmatization
import spacy

POS_TO_KEEP = {'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}

nlp = spacy.load('en_core_web_sm')

def tokenization(doc):
    # filter tokens if it is specified POS or is part of the name entity, return its lemma
    return [
        token.lemma_ for token in doc if token.pos_ in POS_TO_KEEP or token.ent_type_
    ]

# use nlp.pipe for batch processing
train['title_en_token'] = [
    tokenization(doc) for doc in nlp.pipe(train['title_english'], batch_size=100, n_process=-1)
]
train['synopsis_token'] = [
    tokenization(doc) for doc in nlp.pipe(train['synopsis'], batch_size=100, n_process=-1)
]

# filter out empty token
train = train[train.title_en_token.apply(lambda x: len(x) > 0) & train.synopsis_token.apply(lambda x: len(x) > 0)] # both columns cannot be empty

print('Number of rows after filtering empty token: ', len(train))

Number of rows after filtering empty token:  20081


In [57]:
# review tokenization
with pd.option_context('display.max_colwidth', None):
    display(train[['title_english', 'title_en_token', 'synopsis', 'synopsis_token']].sample(1))

Unnamed: 0,title_english,title_en_token,synopsis,synopsis_token
22310,I'm Not a Bitch,[bitch],"Mr. Yamano, a physical education teacher working at a small elementary school in the countryside, lusted for Mr. Kiyono, a beautiful math teacher who had been transferred from the city. The two of them were staying alone in the night watch room, and found out that their hearts and crotch were swelling up with anticipation. This chastity slut has his seductive ways despite his love affair.\n\n\n\nIncluded one-shot: Sake to Tobacco to Kiss","[Mr., Yamano, physical, education, teacher, work, small, elementary, school, countryside, lust, Mr., Kiyono, beautiful, math, teacher, transfer, city, two, stay, alone, night, watch, room, find, heart, crotch, swell, anticipation, chastity, slut, have, seductive, way, love, affair, include, one, shot, sake, Tobacco, kiss]"


Since titles and synopses play different roles (titles are short and often genre-indicative, while synopses provide detailed content descriptions), we use two separate vectorizers.

Titles are shorter and often contain rare, context-rich words that are crucial for capturing unique meaning, while synopses are longer and contain more common words, making them less reliant on capturing rare vocabulary. So, we use a higher `max_features` or `vector_size` for title to ensures it capture these niche terms and their relationships, while use lower for synopsis to focus on the more frequently relevant words.

In [84]:
# vectorization

from sklearn.feature_extraction.text import TfidfVectorizer

# initialize title tfidf
tfidf_title = TfidfVectorizer(
    ngram_range=(1,1),  # uni-gram
    min_df=1,           # don't filter rare words as they are important for title
    max_df=0.8          # filter out very common words
)

# initialize synopsis tfidf
tfidf_synopsis = TfidfVectorizer(
    ngram_range=(1,2),  # uni-gram or bi-gram
    max_features=2000,  # focus on the more frequently relevant words
    min_df=2,           # filter out extremely rare words
    max_df=0.8          # filter out very common words
)

# train tfidf
title_en_tfidf_matrix = tfidf_title.fit_transform(train['title_en_token'].apply(lambda x: " ".join(x)))
synopsis_tfidf_matrix = tfidf_synopsis.fit_transform(train['synopsis_token'].apply(lambda x: " ".join(x)))

# add result to train df
train['title_en_tfidf'] = [title_en_tfidf_matrix[i] for i in range(title_en_tfidf_matrix.shape[0])]
train['synopsis_tfidf'] = [synopsis_tfidf_matrix[i] for i in range(synopsis_tfidf_matrix.shape[0])]

For word embedding, we choose **Word2Vec - Skip-gram** because Tends to capture rare words more effectively (e.g. niche anime/manga-specific vocabulary), comparing to Word2Vec - CBOW and GloVe.

In [85]:
# word embedding
import multiprocessing
from gensim.models import Word2Vec

# dynamically determine the number of CPU cores
num_workers = multiprocessing.cpu_count()

# train title skipgram model
skipgram_model_title = Word2Vec(
    train['title_en_token'].tolist(),
    sg=1,               # skip-gram
    vector_size=300,    # title use higher dim
    window=2,           # title use smaller window size
    min_count=1,        # titles may contain rare but important words
    epochs=30,          # title are shorter, need more epochs to train
    workers=num_workers,
    seed=RANDOM_SEED
)

# train synopsis skipgram model
skipgram_model_synopsis = Word2Vec(
    train['synopsis_token'].tolist(),
    sg=1,               # skip-gram
    vector_size=150,    # synopsis use lower dim
    window=5,           # synopsis use larger window size
    min_count=2,        # filter out extremely rare words
    epochs=15,
    workers=num_workers,
    seed=RANDOM_SEED
)

# clean tokens that does not exist in the skipgram vocab (because of `min_count`)
model_vocab = set(skipgram_model_synopsis.wv.index_to_key)
train['synopsis_token'] = train['synopsis_token'].apply(lambda x: [token for token in x if token in model_vocab])
train = train[train['synopsis_token'].apply(lambda x: len(x) > 0)]  # filter out empty entry after clean tokens

# apply skipgram model
train['title_en_skipgram'] = train['title_en_token'].apply(lambda x: skipgram_model_title.wv[x])
train['synopsis_skipgram'] = train['synopsis_token'].apply(lambda x: skipgram_model_synopsis.wv[x])

In [88]:
train[['title_japanese', 'title_english', 'synopsis', 'title_en_tfidf', 'title_en_skipgram', 'synopsis_tfidf', 'synopsis_skipgram']].sample(5)

Unnamed: 0,title_japanese,title_english,synopsis,title_en_tfidf,title_en_skipgram,synopsis_tfidf,synopsis_skipgram
389,マギ シンドバッドの冒険,Magi: Adventure of Sinbad,"In the small, impoverished Tison Village of th...","(0, 273)\t0.45261397809160764\n (0, 6551)\t...","[[-0.28387263, -0.3996428, 0.23688924, -0.2031...","(0, 471)\t0.10543107622974691\n (0, 803)\t0...","[[-0.42436963, -0.29327273, 0.2623991, -0.2764..."
2858,逮捕しちゃうぞ SECOND SEASON,You're Under Arrest: Fast & Furious,"AA! Megami Sama creator, Kosuke Fujishima, is ...","(0, 4055)\t0.6048475503419088\n (0, 740)\t0...","[[-0.08216671, -0.075498156, 0.03741265, -0.05...","(0, 1756)\t0.08728442659380917\n (0, 779)\t...","[[-0.20247231, -0.10974851, -0.16478583, 0.077..."
17895,セックスのあと男の子の汗はハチミツのにおいがする,Sweat and Honey,"A unique, unforgettable collection of stories ...","(0, 4969)\t0.5739562486160855\n (0, 10797)\...","[[-0.07085394, -0.0408827, 0.002115316, -0.054...","(0, 994)\t0.1175816603755288\n (0, 1523)\t0...","[[-0.08538792, 0.24516971, -0.24691118, -0.263..."
19589,闇芝居,Horror Theatre Yamishibai,A sinister storyteller always appears at dusk ...,"(0, 5003)\t0.5554956849124894\n (0, 11090)\...","[[-0.15762852, -0.07580402, 0.038750034, -0.11...","(0, 319)\t0.1913921025293429\n (0, 906)\t0....","[[0.0891665, -0.21489303, -0.23714808, 0.07771..."
5772,ブラック・ジャック Dr. ピノコの森の冒険,Black Jack: Dr. Pinoco's Adventure,"Dr. Black Jack forgot his doctor's bag, and hi...","(0, 273)\t0.3465742691796254\n (0, 1287)\t0...","[[-0.21829343, -0.21362858, -0.29476508, -0.06...","(0, 1533)\t0.17512417570769043\n (0, 677)\t...","[[-0.13861196, 0.34843922, 0.26259136, 0.05886..."
