In [1]:
!where python

c:\Users\HK-Laptop-V639\Documents\GitHub\696\env696\Scripts\python.exe
C:\Users\HK-Laptop-V639\AppData\Local\Programs\Python\Python312\python.exe
C:\Users\HK-Laptop-V639\AppData\Local\Microsoft\WindowsApps\python.exe


In [2]:
# run this code to make sure you install all the required libraries
# be sure you are in virtual environment before install, otherwise it will overwrite your local environment

#!pip install -r requirements.txt

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
# fix random state
import os
RANDOM_SEED = 123
os.environ["OMP_NUM_THREADS"] = "1"
np.random.seed(42)
import torch
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
import ast
df_anime = pd.read_csv(
    "assets/anime.csv", 
    converters={
        'genres': ast.literal_eval,
        'themes': ast.literal_eval,
        'demographics': ast.literal_eval,
        'studios': ast.literal_eval,
        'licensors': ast.literal_eval
    }
)

df_manga = pd.read_csv(
    "assets/manga.csv", 
    converters={
        'genres': ast.literal_eval,
        'themes': ast.literal_eval,
        'demographics': ast.literal_eval,
        'serializations': ast.literal_eval
    }
)

In [6]:
df_anime.shape, df_manga.shape

((24985, 39), (64833, 30))

## Data Prepocessing

Some short synopsis contain no information about the story of manga/title. This will introduce noise to our model. Therefore, we decide to remove those rows with extremely short synopsis.
Example:
- Second season of Mao Zhi Ming.
- The second season of Shen Lan Qi Yu Wushuang Zhu.
- Recap episode of Hakyuu Houshin Engi.
- Fifth Season of Bungou Stray Dogs
- 1-3. Ba_ku\n4-5. Mephisto
- An absurd film by Kuri Youji.
- Included one-shot:\nBougainvillea
- A collection of oneshots by Nishida Higashi.
- A movie adaptation of the TV series.
- Short film by Kurosaka Keita.
- Special episodes added to DVDs and Blu-rays.
- Movie based on the 1996 TV anime with an original plot.
- Third season of Yuan Long

In [7]:
def data_cleaning(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # remove unnecessary columns
    df_anime = df_anime.drop(columns=['anime_id', 'total_duration', 'start_year', 'start_season', 'rating', 'main_picture', 'url', 'trailer_url', 'background', 'created_at', 'updated_at', 'episode_duration', 'broadcast_day', 'broadcast_time', 'licensors', 'title_synonyms', 'real_start_date', 'real_end_date', 'approved', 'themes'])
    df_manga = df_manga.drop(columns=['manga_id', 'main_picture', 'url', 'background', 'created_at_before', 'updated_at', 'title_synonyms', 'volumes', 'real_start_date', 'real_end_date', 'approved', 'themes'])

    # remove rows that are null in 'synopsis' and 'title', which are crucial for our project
    df_anime.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)
    df_manga.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)

    # remove '(Sources:...)' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))

    # remove '[Written by ...]' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))

    # remove rows that have extreme short synopsis
    df_anime = df_anime[df_anime['synopsis'].apply(lambda x: len(x) > 50)]
    df_manga = df_manga[df_manga['synopsis'].apply(lambda x: len(x) > 50)]

    print('cleaned anime shape: ', df_anime.shape)
    print('cleaned manga shape: ', df_manga.shape)

    return df_anime, df_manga

df_anime_cleaned, df_manga_cleaned = data_cleaning(df_anime, df_manga)

cleaned anime shape:  (8862, 19)
cleaned manga shape:  (15447, 18)


In [8]:
print("anime extra columns: \n", [col for col in df_anime_cleaned.columns if not col in df_manga_cleaned.columns])

anime extra columns: 
 ['episodes', 'source', 'studios', 'producers']


In [9]:
print("manga extra columns: \n", [col for col in df_manga_cleaned.columns if not col in df_anime_cleaned.columns])

manga extra columns: 
 ['chapters', 'authors', 'serializations']


In [10]:
print("common columns: \n", [col for col in df_anime_cleaned.columns if col in df_manga_cleaned.columns])

common columns: 
 ['title', 'type', 'score', 'scored_by', 'status', 'start_date', 'end_date', 'members', 'favorites', 'sfw', 'genres', 'demographics', 'synopsis', 'title_english', 'title_japanese']


In [11]:
import ast

def extract_author_name(author_list):
    try:
        author_list = ast.literal_eval(author_list)
        
        # Extract first and last names of authors, ignoring the others
        author_names = [f"{author['first_name']} {author['last_name']}".strip() for author in author_list if author['first_name'] or author['last_name']]
        
        # Join names for multiple authors
        return author_names
    
    except (ValueError, SyntaxError, KeyError, TypeError):
        # Handle Missing data
        return "Missing"

# Apply the function to the 'authors' column
def authors_extraction(input_manga):
    df_manga = input_manga.copy()
    df_manga['authors'] = df_manga['authors'].apply(extract_author_name)

    return df_manga

df_manga_extracted = authors_extraction(df_manga_cleaned)

In [12]:
df_manga_cleaned['authors'].head(1).values

array(["[{'id': 1868, 'first_name': 'Kentarou', 'last_name': 'Miura', 'role': 'Story & Art'}, {'id': 49592, 'first_name': '', 'last_name': 'Studio Gaga', 'role': 'Art'}]"],
      dtype=object)

In [13]:
df_manga_extracted['authors'].head(1).values

array([list(['Kentarou Miura', 'Studio Gaga'])], dtype=object)

**Extra columns alignment**:
| anime columns | manga columns | strategy |
| --- | --- | --- |
| episodes | chapters | episodes/chapters |
| source | NULL | impute const 'Missing' |
| studios | authors | creators |
| producers | serializations | production_source |

In [14]:
def columns_alignment(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # treat 'episodes' and 'chapters' the same, create null value for 'volume' in anime
    df_anime.rename(columns={'episodes': 'episodes/chapters'}, inplace=True)
    df_manga.rename(columns={'chapters': 'episodes/chapters'}, inplace=True)

    # Combine studios and authors together to get creators columns
    df_anime.rename(columns={'studios': 'creators'}, inplace=True)
    df_manga.rename(columns={'authors': 'creators'}, inplace=True)

    # Also for producers and serialization
    df_anime.rename(columns={'producers': 'production_source'}, inplace=True)
    df_manga.rename(columns={'serializations': 'production_source'}, inplace=True)

    # To distinguish where the data from
    df_anime['is_anime'] = 1
    df_manga['is_anime'] = 0 
    
    return df_anime, df_manga

df_anime_aligned, df_manga_aligned = columns_alignment(df_anime_cleaned, df_manga_extracted)

In [15]:
df_full = pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)

df_full.sample(5)

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,source,members,favorites,sfw,genres,demographics,creators,production_source,synopsis,title_english,title_japanese,is_anime
1227,Ansatsu Kyoushitsu: Deai no Jikan,special,7.43,60884,finished_airing,1.0,2014-11-09,2014-11-09,manga,132427,230,True,"[Action, Comedy]",[Shounen],[Lerche],[],Koro-sensei reminisces about his first meeting...,Assassination Classroom: Meeting Time,暗殺教室 episode:0 出会いの時間,1
2842,Meitantei Holmes,tv,7.38,3894,finished_airing,26.0,1984-11-06,1985-05-21,novel,11675,74,True,"[Action, Adventure, Comedy, Mystery]",[],"[Gallop, TMS Entertainment]",[],"Loosely based on the ""Sherlock Holmes"" series ...",Sherlock Hound,名探偵ホームズ,1
19085,Kimi wo Shinryaku Seyo!,manga,,29,finished,31.0,2018-05-21,2018-09-10,,268,0,True,"[Comedy, Romance, Supernatural]",[Shounen],[Kazusa Inaoka],[Shounen Jump (Weekly)],Hajime Sorajima's got a terrible hairstyle! It...,Invade You!,キミを侵略せよ!,0
23154,Dokoka no Heya de,manga,,9,finished,11.0,2018-11-29,2021-04-20,,45,0,False,[Hentai],[],[Iori Nishi],[Comic Shitsuraku-ten],1. Shoujikimono no Kuzu ga Suki: Prologue\n2. ...,In A Dark Room Somewhere,どこかの部屋で,0
3455,Kakushi Dere,ova,6.92,4856,finished_airing,3.0,2013-04-19,2013-11-29,manga,13022,45,False,[Hentai],[],[Peak Hunt],['Pashmina'],Nonoka goes out of her way to make a written c...,Lust-Struck Trilogy,かくしデレ,1


### Date Transformation

In [16]:
# Function to extract year and month
def extract_year_and_month(date):
    try:
        # Convert the date string to a datetime object
        datetime_date = pd.to_datetime(date, errors='raise')
        # Extract year and month
        return datetime_date.year, datetime_date.month
    except:
        # return NaN if fails
        return np.nan, np.nan
    
# Apply the function to the 'start_date' column
df_full[['start_year', 'start_month']] = df_full['start_date'].apply(lambda x: pd.Series(extract_year_and_month(x)))

# Apply the function to the 'end_date' column
df_full[['end_year', 'end_month']] = df_full['end_date'].apply(lambda x: pd.Series(extract_year_and_month(x)))

#remove useless columns
df_full = df_full.drop(columns=['start_date', 'end_date'])

In [17]:
# Function to transform Start or End month into season refer to Events of Anime
def month_to_season(month):
    # If the data is in range, return corresponding Season of events of Anime
    if month in [1, 2, 3]:
        return 'Winter'
    elif month in [4, 5, 6]:
        return 'Spring'
    elif month in [7, 8, 9]:
        return 'Summer'
    elif month in [10, 11, 12]:
        return 'Autumn'
    else:
        return np.nan  # Handle unexpected cases, though this shouldn't occur with valid months

# Apply the function to transform the month value to season categories
df_full['start_season'] = df_full['start_month'].apply(month_to_season)
df_full['end_season'] = df_full['end_month'].apply(month_to_season)

# convert year to int
df_full['start_year'] = df_full['start_year'].astype('Int64')
df_full['end_year'] = df_full['end_year'].astype('Int64')

# remove useless columns
df_full = df_full.drop(columns=['start_month', 'end_month'])

# Check for Year and Season feature
df_full[['start_year', 'end_year', 'start_season', 'end_season']].head()

Unnamed: 0,start_year,end_year,start_season,end_season
0,2009,2010,Spring,Summer
1,2011,2014,Autumn,Summer
2,2019,2019,Spring,Summer
3,2011,2011,Spring,Summer
4,2016,2016,Summer,Summer


### Train Test data split

In [18]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)
print('Number of rows in train set: ', len(train))

Number of rows in train set:  19447


### Normalization, one-hot encoding, and imputation

We will perform full preprocessing specifically for traditional machine learning models, while leaving these preprocessing steps within the deep learning model pipeline itself using PyTorch. So, we will end up with 2 sets of data:
- pre-processed train and test data
- non-processed train and test data

In [19]:
# check for missing data
print("Number of missing data in Training Set:\n")
print(train.isnull().sum())

Number of missing data in Training Set:

title                    0
type                     6
score                 5962
scored_by                0
status                   0
episodes/chapters     3898
source               12984
members                  0
favorites                0
sfw                      0
genres                   0
demographics             0
creators                 0
production_source        0
synopsis                 0
title_english            0
title_japanese           0
is_anime                 0
start_year             324
end_year              3490
start_season           324
end_season            3490
dtype: int64


In [20]:
# check for missing data 
print("Number of missing data in Testing Set:\n")
print(test.isnull().sum())

Number of missing data in Testing Set:

title                   0
type                    2
score                1496
scored_by               0
status                  0
episodes/chapters     959
source               3237
members                 0
favorites               0
sfw                     0
genres                  0
demographics            0
creators                0
production_source       0
synopsis                0
title_english           0
title_japanese          0
is_anime                0
start_year             79
end_year              863
start_season           79
end_season            863
dtype: int64


Approach:
1. Multi-Label Encoding Outside the Pipeline
    - MultiLabelBinarizer does not integrate smoothly within pipelines.
2. Fixed SVD Dimensions for Multi-Label Columns
    - Using a constant 5 dimensions to ensure consistent contribution from each multi-label column
    - avoid bias towards columns with more labels
3. Separate Pipelines for Single-Category and Numerical Columns
    - One-hot encoding the Single-Category Columns with `OneHotEncoder`
    - Scale numerical features using `StandardScaler`
4. KNN Imputer for Missing Values
    - fills missing values by looking at nearest neighbors

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD

# Single-category columns
single_cat_columns = ['type', 'status', 'source', 'start_season', 'end_season']
# Multi-category columns  
multi_cat_columns = ['genres', 'demographics', 'creators', 'production_source']
# Numerical columns
num_col = ['score', 'scored_by', 'episodes/chapters', 'members', 'favorites', 'start_year', 'end_year']
# Text columns
text_col = ['title', 'synopsis', 'title_english', 'title_japanese']

# As MultiLabelBinarizer can't work well within sklearn pipeline, we will do it outside pipeline
# Initialize MultiLabelBinarizer and fit on the training data
mlb = MultiLabelBinarizer()
mlb.fit(pd.concat([train[col] for col in multi_cat_columns]))

# Function to apply MultiLabelBinarizer and reduce dimension for each multi-category column
def apply_mlb_and_reduce_dim(df, columns, svd_models=None, train=True, svd_components=5):
    result_dfs = []
    if train:
        svd_models = {}  # Store SVD models if training

    for col in columns:
        # Transform the column using the fitted MultiLabelBinarizer
        transformed = mlb.transform(df[col])
        transformed_df = pd.DataFrame(
            transformed, columns=[f"{col}_{cls}" for cls in mlb.classes_], index=df.index
        )

        if train:
            # Fit SVD on training data
            svd = TruncatedSVD(n_components=min(svd_components, transformed_df.shape[1] - 1), random_state=RANDOM_SEED)
            reduced = svd.fit_transform(transformed_df)
            svd_models[col] = svd  # Save the SVD model
        else:
            # Apply the same SVD model to the test data
            svd = svd_models[col]
            reduced = svd.transform(transformed_df)

        # Store the reduced DataFrame
        reduced_df = pd.DataFrame(reduced, columns=[f"{col}_svd_{i}" for i in range(reduced.shape[1])], index=df.index)
        result_dfs.append(reduced_df)

    return pd.concat(result_dfs, axis=1), svd_models if train else None

# Apply to train set
train_multi_cat, svd_models = apply_mlb_and_reduce_dim(train, multi_cat_columns, train=True)

# Apply the same transformation to test set
test_multi_cat, _ = apply_mlb_and_reduce_dim(test, multi_cat_columns, svd_models=svd_models, train=False)

# Drop the original multi-category columns from train and test sets
train_dropped = train.drop(columns=multi_cat_columns)
test_dropped = test.drop(columns=multi_cat_columns)

# Define the pipeline for single-category columns
single_cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Define the pipeline for numerical columns
num_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine the categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat', single_cat_pipeline, single_cat_columns),
    ('numerical', num_pipeline, num_col)
], remainder='drop')  # Drop text columns explicitly

# Create a pipeline for preprocessing only
full_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),           # Apply preprocessing
    ('imputer', KNNImputer(n_neighbors=5))     # Apply KNN imputation on all processed features
])

# Fit the pipeline to the transformed training set and transform it
train_processed = full_pipeline.fit_transform(train_dropped)
test_processed = full_pipeline.transform(test_dropped)

# Get feature names from the pipeline and create DataFrames
train_processed_df = pd.DataFrame(train_processed, columns=full_pipeline.named_steps['preprocessing'].get_feature_names_out(), index=train.index)
test_processed_df = pd.DataFrame(test_processed, columns=full_pipeline.named_steps['preprocessing'].get_feature_names_out(), index=test.index)

# Concatenate the processed DataFrame with the multi-label and untransformed text columns
train_final_df = pd.concat([train_processed_df, train_multi_cat, train[text_col]], axis=1)
test_final_df = pd.concat([test_processed_df, test_multi_cat, test[text_col]], axis=1)



In [22]:
with pd.option_context("display.max_columns", None):
    display(train_final_df.sample(5))

Unnamed: 0,cat__type_Missing,cat__type_doujinshi,cat__type_light_novel,cat__type_manga,cat__type_manhua,cat__type_manhwa,cat__type_movie,cat__type_music,cat__type_novel,cat__type_ona,cat__type_one_shot,cat__type_ova,cat__type_special,cat__type_tv,cat__status_currently_airing,cat__status_currently_publishing,cat__status_discontinued,cat__status_finished,cat__status_finished_airing,cat__status_not_yet_aired,cat__status_on_hiatus,cat__source_4_koma_manga,cat__source_Missing,cat__source_book,cat__source_card_game,cat__source_game,cat__source_light_novel,cat__source_manga,cat__source_mixed_media,cat__source_music,cat__source_novel,cat__source_original,cat__source_other,cat__source_picture_book,cat__source_radio,cat__source_visual_novel,cat__source_web_manga,cat__source_web_novel,cat__start_season_Autumn,cat__start_season_Missing,cat__start_season_Spring,cat__start_season_Summer,cat__start_season_Winter,cat__end_season_Autumn,cat__end_season_Missing,cat__end_season_Spring,cat__end_season_Summer,cat__end_season_Winter,numerical__score,numerical__scored_by,numerical__episodes/chapters,numerical__members,numerical__favorites,numerical__start_year,numerical__end_year,genres_svd_0,genres_svd_1,genres_svd_2,genres_svd_3,genres_svd_4,demographics_svd_0,demographics_svd_1,demographics_svd_2,demographics_svd_3,demographics_svd_4,creators_svd_0,creators_svd_1,creators_svd_2,creators_svd_3,creators_svd_4,production_source_svd_0,production_source_svd_1,production_source_svd_2,production_source_svd_3,production_source_svd_4,title,synopsis,title_english,title_japanese
1303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.770576,0.217512,-0.226139,0.424297,0.080306,1.097994,1.064982,0.8696,0.124613,-0.279083,-0.306637,0.965727,0.0,0.0,0.0,0.0,0.0,-1.096007e-11,1.104345e-10,-1.811216e-10,8.631009e-10,-1.944595e-09,5.69734,-1.491238,-1.708944,0.825864,0.239026,Kanojo ga Koushaku-tei ni Itta Riyuu,"When, after her sudden death, Rinko Hanasaki i...",Why Raeliana Ended up at the Duke's Mansion,彼女が公爵邸に行った理由
480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.389338,1.037806,0.001053,1.710585,0.7867,-0.376114,-0.377525,0.945048,0.533133,-0.213625,-0.041915,-0.888106,0.005124,0.020801,0.999771,4.468922e-07,4.2e-05,0.00707543,5.045732e-06,0.110171,0.001766085,0.9895353,3.763486,-0.156028,-0.363578,-0.164652,-0.439872,xxxHOLiC,Kimihiro Watanuki can see spirits and other as...,xxxHOLiC,×××HOLiC（ホリック）
6406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.776329,-0.20042,-0.036812,-0.231542,-0.114716,0.837857,0.895275,0.904567,-0.753358,0.32995,-0.044423,0.262828,0.0,0.0,0.0,0.0,0.0,-8.816652e-16,4.477609e-15,-5.234832e-15,1.419115e-15,2.853685e-14,0.568576,1.225659,-0.360193,0.077797,-0.063345,An Jie Shen Shi,"In order to find the missing friend Xiao Yuan,...",Divine Envoy of the Dark World,暗界神使
19279,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.285879,-0.201788,0.232032,-0.23473,-0.115123,0.491008,0.521921,0.659051,-0.279484,0.146411,-0.032432,0.430242,0.0,0.0,0.0,0.0,0.0,-1.7279e-15,-3.669807e-12,4.225665e-12,-2.970843e-11,8.470688e-11,0.0,0.0,0.0,0.0,0.0,"Yamizokusei no Mahoutsukai da ga, Naze ka Yuus...",After years of honing his magic deep in the mo...,Busy Wizard: This Warlock Just Wants to Provid...,闇属性の魔法使いだが、なぜか勇者になってしまった ~それはともかく嫁にいい暮らしをさせるため...
1676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.396741,0.428029,-0.434398,0.493501,-0.052627,0.057447,0.046742,0.065261,-0.002604,0.020664,-0.066132,-0.10727,0.0,0.0,0.0,0.0,0.0,1.808405e-06,-2.491965e-05,2.9366e-05,-0.0001197456,0.000174764,0.568576,1.225659,-0.360193,0.077797,-0.063345,Kara no Kyoukai Movie 8: Shuushou,"While walking home, Mikiya Kokutou comes acros...",The Garden of Sinners Chapter 8: Epilogue,劇場版 空の境界 the Garden of sinners 終章


In [23]:
train_final_df.columns.tolist()

['cat__type_Missing',
 'cat__type_doujinshi',
 'cat__type_light_novel',
 'cat__type_manga',
 'cat__type_manhua',
 'cat__type_manhwa',
 'cat__type_movie',
 'cat__type_music',
 'cat__type_novel',
 'cat__type_ona',
 'cat__type_one_shot',
 'cat__type_ova',
 'cat__type_special',
 'cat__type_tv',
 'cat__status_currently_airing',
 'cat__status_currently_publishing',
 'cat__status_discontinued',
 'cat__status_finished',
 'cat__status_finished_airing',
 'cat__status_not_yet_aired',
 'cat__status_on_hiatus',
 'cat__source_4_koma_manga',
 'cat__source_Missing',
 'cat__source_book',
 'cat__source_card_game',
 'cat__source_game',
 'cat__source_light_novel',
 'cat__source_manga',
 'cat__source_mixed_media',
 'cat__source_music',
 'cat__source_novel',
 'cat__source_original',
 'cat__source_other',
 'cat__source_picture_book',
 'cat__source_radio',
 'cat__source_visual_novel',
 'cat__source_web_manga',
 'cat__source_web_novel',
 'cat__start_season_Autumn',
 'cat__start_season_Missing',
 'cat__start_se

In [24]:
# check for missing data 
print("Number of missing data in Training Set:\n")
print(train_final_df.isnull().sum().to_string())

Number of missing data in Training Set:

cat__type_Missing                   0
cat__type_doujinshi                 0
cat__type_light_novel               0
cat__type_manga                     0
cat__type_manhua                    0
cat__type_manhwa                    0
cat__type_movie                     0
cat__type_music                     0
cat__type_novel                     0
cat__type_ona                       0
cat__type_one_shot                  0
cat__type_ova                       0
cat__type_special                   0
cat__type_tv                        0
cat__status_currently_airing        0
cat__status_currently_publishing    0
cat__status_discontinued            0
cat__status_finished                0
cat__status_finished_airing         0
cat__status_not_yet_aired           0
cat__status_on_hiatus               0
cat__source_4_koma_manga            0
cat__source_Missing                 0
cat__source_book                    0
cat__source_card_game               0
cat__sour

In [25]:
# check for missing data 
print("Number of missing data in Testing Set:\n")
print(test_final_df.isnull().sum().to_string())

Number of missing data in Testing Set:

cat__type_Missing                   0
cat__type_doujinshi                 0
cat__type_light_novel               0
cat__type_manga                     0
cat__type_manhua                    0
cat__type_manhwa                    0
cat__type_movie                     0
cat__type_music                     0
cat__type_novel                     0
cat__type_ona                       0
cat__type_one_shot                  0
cat__type_ova                       0
cat__type_special                   0
cat__type_tv                        0
cat__status_currently_airing        0
cat__status_currently_publishing    0
cat__status_discontinued            0
cat__status_finished                0
cat__status_finished_airing         0
cat__status_not_yet_aired           0
cat__status_on_hiatus               0
cat__source_4_koma_manga            0
cat__source_Missing                 0
cat__source_book                    0
cat__source_card_game               0
cat__sourc

## Export necessary assets

In [26]:
import joblib

# train and test df for deep learning
joblib.dump(train, 'assets/train_deep_learning.joblib')
joblib.dump(test, 'assets/test_deep_learning.joblib')

# train and test df for traditional ML
joblib.dump(train_final_df, 'assets/train_traditional_ml.joblib')
joblib.dump(test_final_df, 'assets/test_traditional_ml.joblib')

['assets/test_traditional_ml.joblib']

In [27]:
# store library version
# run every time before you commit
!pip freeze > requirements.txt