In [1]:
!where python

c:\Users\HK-Laptop-V639\Documents\GitHub\696\env696\Scripts\python.exe
C:\Users\HK-Laptop-V639\AppData\Local\Programs\Python\Python312\python.exe
C:\Users\HK-Laptop-V639\AppData\Local\Microsoft\WindowsApps\python.exe


In [2]:
# run this code to make sure you install all the required libraries
# be sure you are in virtual environment before install, otherwise it will overwrite your local environment

# !pip install -r requirements.txt

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
pd.set_option("display.max_columns", None)
RANDOM_SEED = 123

In [5]:
df_anime = pd.read_csv("assets/anime.csv")
df_manga = pd.read_csv("assets/manga.csv")

In [6]:
df_anime.shape, df_manga.shape

((24985, 39), (64833, 30))

## Data Prepocessing

Some short synopsis contain no information about the story of manga/title. This will introduce noise to our model. Therefore, we decide to remove those rows with extremely short synopsis.
Example:
- Second season of Mao Zhi Ming.
- The second season of Shen Lan Qi Yu Wushuang Zhu.
- Recap episode of Hakyuu Houshin Engi.
- Fifth Season of Bungou Stray Dogs
- 1-3. Ba_ku\n4-5. Mephisto
- An absurd film by Kuri Youji.
- Included one-shot:\nBougainvillea
- A collection of oneshots by Nishida Higashi.
- A movie adaptation of the TV series.
- Short film by Kurosaka Keita.
- Special episodes added to DVDs and Blu-rays.
- Movie based on the 1996 TV anime with an original plot.
- Third season of Yuan Long

In [7]:
def data_cleaning(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # remove unnecessary columns
    df_anime = df_anime.drop(columns=['anime_id', 'total_duration', 'start_year', 'start_season', 'rating', 'main_picture', 'url', 'trailer_url', 'background', 'created_at', 'updated_at', 'episode_duration', 'broadcast_day', 'broadcast_time', 'licensors', 'title_synonyms', 'real_start_date', 'real_end_date', 'approved', 'themes'])
    df_manga = df_manga.drop(columns=['manga_id', 'main_picture', 'url', 'background', 'created_at_before', 'updated_at', 'title_synonyms', 'volumes', 'real_start_date', 'real_end_date', 'approved', 'themes'])

    # remove rows that are null in 'synopsis' and 'title', which are crucial for our project
    df_anime.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)
    df_manga.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)

    # remove '(Sources:...)' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))

    # remove '[Written by ...]' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))

    # remove rows that have extreme short synopsis
    df_anime = df_anime[df_anime['synopsis'].apply(lambda x: len(x) > 50)]
    df_manga = df_manga[df_manga['synopsis'].apply(lambda x: len(x) > 50)]

    print('cleaned anime shape: ', df_anime.shape)
    print('cleaned manga shape: ', df_manga.shape)

    return df_anime, df_manga

df_anime_cleaned, df_manga_cleaned = data_cleaning(df_anime, df_manga)

cleaned anime shape:  (8862, 19)
cleaned manga shape:  (15447, 18)


In [8]:
print("anime extra columns: \n", [col for col in df_anime_cleaned.columns if not col in df_manga_cleaned.columns])

anime extra columns: 
 ['episodes', 'source', 'studios', 'producers']


In [9]:
print("manga extra columns: \n", [col for col in df_manga_cleaned.columns if not col in df_anime_cleaned.columns])

manga extra columns: 
 ['chapters', 'authors', 'serializations']


In [10]:
print("common columns: \n", [col for col in df_anime_cleaned.columns if col in df_manga_cleaned.columns])

common columns: 
 ['title', 'type', 'score', 'scored_by', 'status', 'start_date', 'end_date', 'members', 'favorites', 'sfw', 'genres', 'demographics', 'synopsis', 'title_english', 'title_japanese']


In [11]:
import ast

def extract_author_name(author_list):
    try:
        author_list = ast.literal_eval(author_list)
        
        # Extract first and last names of authors, ignoring the others
        author_names = [f"{author['first_name']} {author['last_name']}".strip() for author in author_list if author['first_name'] or author['last_name']]
        
        # Join names for multiple authors
        return ', '.join(author_names) if author_names else "Missing"
    
    except (ValueError, SyntaxError, KeyError, TypeError):
        # Handle Missing data
        return "Missing"

# Apply the function to the 'authors' column
def authors_extraction(input_manga):
    df_manga = input_manga.copy()
    df_manga['authors'] = df_manga['authors'].apply(extract_author_name)

    return df_manga

df_manga_extracted = authors_extraction(df_manga_cleaned)

In [12]:
df_manga_cleaned['authors'].head(1).values

array(["[{'id': 1868, 'first_name': 'Kentarou', 'last_name': 'Miura', 'role': 'Story & Art'}, {'id': 49592, 'first_name': '', 'last_name': 'Studio Gaga', 'role': 'Art'}]"],
      dtype=object)

In [13]:
df_manga_extracted['authors'].head(1).values

array(['Kentarou Miura, Studio Gaga'], dtype=object)

**Extra columns alignment**:
| anime columns | manga columns | strategy |
| --- | --- | --- |
| episodes | chapters | episodes/chapters |
| source | NULL | impute const 'Missing' |
| studios | authors | creators |
| producers | serializations | production_source |

In [14]:
def columns_alignment(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # treat 'episodes' and 'chapters' the same, create null value for 'volume' in anime
    df_anime.rename(columns={'episodes': 'episodes/chapters'}, inplace=True)
    df_manga.rename(columns={'chapters': 'episodes/chapters'}, inplace=True)

    # Combine studios and authors together to get creators columns
    df_anime.rename(columns={'studios': 'creators'}, inplace=True)
    df_manga.rename(columns={'authors': 'creators'}, inplace=True)

    # Also for producers and serialization
    df_anime.rename(columns={'producers': 'production_source'}, inplace=True)
    df_manga.rename(columns={'serializations': 'production_source'}, inplace=True)

    # To distinguish where the data from
    df_anime['is_anime'] = 1
    df_manga['is_anime'] = 0 
    
    return df_anime, df_manga

df_anime_aligned, df_manga_aligned = columns_alignment(df_anime_cleaned, df_manga_extracted)

In [15]:
df_full = pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)

df_full.head()

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,source,members,favorites,sfw,genres,demographics,creators,production_source,synopsis,title_english,title_japanese,is_anime
0,Fullmetal Alchemist: Brotherhood,tv,9.1,2037075,finished_airing,64.0,2009-04-05,2010-07-04,manga,3206028,219036,True,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,1
1,Hunter x Hunter (2011),tv,9.04,1671587,finished_airing,148.0,2011-10-02,2014-09-24,manga,2688079,202109,True,"['Action', 'Adventure', 'Fantasy']",['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",Hunters devote themselves to accomplishing haz...,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,1
2,Shingeki no Kyojin Season 3 Part 2,tv,9.05,1491491,finished_airing,10.0,2019-04-29,2019-07-01,manga,2133927,55644,True,"['Action', 'Drama']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,1
3,Steins;Gate,tv,9.07,1348232,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2463954,184312,True,"['Drama', 'Sci-Fi', 'Suspense']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...,Steins;Gate,STEINS;GATE,1
4,Koe no Katachi,movie,8.94,1540277,finished_airing,1.0,2016-09-17,2016-09-17,manga,2218467,84124,True,"['Award Winning', 'Drama']",['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","As a wild youth, elementary school student Sho...",A Silent Voice,聲の形,1


In [16]:
df_full = pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)

df_full.head()

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,source,members,favorites,sfw,genres,demographics,creators,production_source,synopsis,title_english,title_japanese,is_anime
0,Fullmetal Alchemist: Brotherhood,tv,9.1,2037075,finished_airing,64.0,2009-04-05,2010-07-04,manga,3206028,219036,True,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,1
1,Hunter x Hunter (2011),tv,9.04,1671587,finished_airing,148.0,2011-10-02,2014-09-24,manga,2688079,202109,True,"['Action', 'Adventure', 'Fantasy']",['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",Hunters devote themselves to accomplishing haz...,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,1
2,Shingeki no Kyojin Season 3 Part 2,tv,9.05,1491491,finished_airing,10.0,2019-04-29,2019-07-01,manga,2133927,55644,True,"['Action', 'Drama']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,1
3,Steins;Gate,tv,9.07,1348232,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2463954,184312,True,"['Drama', 'Sci-Fi', 'Suspense']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...,Steins;Gate,STEINS;GATE,1
4,Koe no Katachi,movie,8.94,1540277,finished_airing,1.0,2016-09-17,2016-09-17,manga,2218467,84124,True,"['Award Winning', 'Drama']",['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","As a wild youth, elementary school student Sho...",A Silent Voice,聲の形,1


### Date Transformation

In [17]:
# Function to extract year and month
def extract_year_and_month(date):
    try:
        # Convert the date string to a datetime object
        datetime_date = pd.to_datetime(date, errors='raise')
        # Extract year and month
        return datetime_date.year, datetime_date.month
    except:
        # return NaN if fails
        return np.nan, np.nan
    
# Apply the function to the 'start_date' column
df_full[['start_year', 'start_month']] = df_full['start_date'].apply(lambda x: pd.Series(extract_year_and_month(x)))

# Apply the function to the 'end_date' column
df_full[['end_year', 'end_month']] = df_full['end_date'].apply(lambda x: pd.Series(extract_year_and_month(x)))

#remove useless columns
df_full = df_full.drop(columns=['start_date', 'end_date'])

In [18]:
# Function to transform Start or End month into season refer to Events of Anime
def month_to_season(month):
    # If the data is in range, return corresponding Season of events of Anime
    if month in [1, 2, 3]:
        return 'Winter'
    elif month in [4, 5, 6]:
        return 'Spring'
    elif month in [7, 8, 9]:
        return 'Summer'
    elif month in [10, 11, 12]:
        return 'Autumn'
    else:
        return np.nan  # Handle unexpected cases, though this shouldn't occur with valid months

# Apply the function to transform the month value to season categories
df_full['start_season'] = df_full['start_month'].apply(month_to_season)
df_full['end_season'] = df_full['end_month'].apply(month_to_season)

# convert year to int
df_full['start_year'] = df_full['start_year'].astype('Int64')
df_full['end_year'] = df_full['end_year'].astype('Int64')

# remove useless columns
df_full = df_full.drop(columns=['start_month', 'end_month'])

# Check for Year and Season feature
df_full[['start_year', 'end_year', 'start_season', 'end_season']].head()

Unnamed: 0,start_year,end_year,start_season,end_season
0,2009,2010,Spring,Summer
1,2011,2014,Autumn,Summer
2,2019,2019,Spring,Summer
3,2011,2011,Spring,Summer
4,2016,2016,Summer,Summer


### Train Test data split

In [19]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)
print('Number of rows in train set: ', len(train))

Number of rows in train set:  19447


### Normalization, one-hot encoding, and imputation

We will perform full preprocessing specifically for traditional machine learning models, while leaving these preprocessing steps within the deep learning model pipeline itself using PyTorch. So, we will end up with 2 sets of data:
- pre-processed train and test data
- non-processed train and test data

In [20]:
# check for missing data 
print("Number of missing data in Training Set:\n")
print(train.isnull().sum())

Number of missing data in Training Set:

title                    0
type                     6
score                 5962
scored_by                0
status                   0
episodes/chapters     3898
source               12984
members                  0
favorites                0
sfw                      0
genres                   0
demographics             0
creators                 0
production_source        0
synopsis                 0
title_english            0
title_japanese           0
is_anime                 0
start_year             324
end_year              3490
start_season           324
end_season            3490
dtype: int64


In [21]:
# check for missing data 
print("Number of missing data in Testing Set:\n")
print(test.isnull().sum())

Number of missing data in Testing Set:

title                   0
type                    2
score                1496
scored_by               0
status                  0
episodes/chapters     959
source               3237
members                 0
favorites               0
sfw                     0
genres                  0
demographics            0
creators                0
production_source       0
synopsis                0
title_english           0
title_japanese          0
is_anime                0
start_year             79
end_year              863
start_season           79
end_season            863
dtype: int64


In [52]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Single-category columns
single_cat_columns = ['type', 'status', 'source', 'start_season', 'end_season']
# Multi-category columns  
multi_cat_columns = ['genres', 'demographics', 'creators', 'production_source']
# Numerical columns
num_col = ['score', 'scored_by', 'episodes/chapters', 'members', 'favorites', 'start_year', 'end_year']
# Text columns
text_col = ['title', 'synopsis', 'title_english', 'title_japanese']

# As MultiLabelBinarizer can't work well within sklearn pipeline, we will do it outside pipeline
mlb = MultiLabelBinarizer()

# Function to apply MultiLabelBinarizer to each multi-category column
def apply_mlb(df, columns):
    transformed_dfs = []
    for col in columns:
        transformed = mlb.fit_transform(df[col])
        transformed_df = pd.DataFrame(transformed, columns=[f"{col}_{cls}" for cls in mlb.classes_], index=df.index)
        transformed_dfs.append(transformed_df)
    return pd.concat(transformed_dfs, axis=1)

# Transform the multi-category columns for both train and test sets
train_multi_cat = apply_mlb(train, multi_cat_columns)
test_multi_cat = apply_mlb(test, multi_cat_columns)

# Drop the original multi-category columns from train and test sets
train_dropped = train.drop(columns=multi_cat_columns)
test_dropped = test.drop(columns=multi_cat_columns)

# Concatenate the transformed multi-category columns back to the datasets
train_with_mlb = pd.concat([train_dropped, train_multi_cat], axis=1)
test_with_mlb = pd.concat([test_dropped, test_multi_cat], axis=1)

# Define the pipeline for single-category columns
single_cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(sparse_output=False))
])

# Define the pipeline for numerical columns
num_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine the categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat', single_cat_pipeline, single_cat_columns),
    ('numerical', num_pipeline, num_col)
], remainder='drop')  # Drop text columns explicitly

# Create a pipeline for preprocessing only
full_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),           # Apply preprocessing
    ('imputer', KNNImputer(n_neighbors=5))     # Apply KNN imputation on all processed features
])

# Fit the pipeline to the transformed training set and transform it
train_processed = full_pipeline.fit_transform(train_with_mlb)
test_processed = full_pipeline.transform(test_with_mlb)

# Get feature names from the pipeline and create DataFrames
train_processed_df = pd.DataFrame(train_processed, columns=full_pipeline.named_steps['preprocessing'].get_feature_names_out(), index=train.index)
test_processed_df = pd.DataFrame(test_processed, columns=full_pipeline.named_steps['preprocessing'].get_feature_names_out(), index=test.index)

# Concatenate the processed DataFrame with the untransformed text columns
train_final_df = pd.concat([train_processed_df, train[text_col]], axis=1)
test_final_df = pd.concat([test_processed_df, test[text_col]], axis=1)

In [55]:
train_final_df.sample(3)

Unnamed: 0,cat__type_Missing,cat__type_doujinshi,cat__type_light_novel,cat__type_manga,cat__type_manhua,cat__type_manhwa,cat__type_movie,cat__type_music,cat__type_novel,cat__type_ona,cat__type_one_shot,cat__type_ova,cat__type_special,cat__type_tv,cat__status_currently_airing,cat__status_currently_publishing,cat__status_discontinued,cat__status_finished,cat__status_finished_airing,cat__status_not_yet_aired,cat__status_on_hiatus,cat__source_4_koma_manga,cat__source_Missing,cat__source_book,cat__source_card_game,cat__source_game,cat__source_light_novel,cat__source_manga,cat__source_mixed_media,cat__source_music,cat__source_novel,cat__source_original,cat__source_other,cat__source_picture_book,cat__source_radio,cat__source_visual_novel,cat__source_web_manga,cat__source_web_novel,cat__start_season_Autumn,cat__start_season_Missing,cat__start_season_Spring,cat__start_season_Summer,cat__start_season_Winter,cat__end_season_Autumn,cat__end_season_Missing,cat__end_season_Spring,cat__end_season_Summer,cat__end_season_Winter,numerical__score,numerical__scored_by,numerical__episodes/chapters,numerical__members,numerical__favorites,numerical__start_year,numerical__end_year,title,synopsis,title_english,title_japanese
23499,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.526246,-0.20208,-0.241285,-0.236061,-0.115327,0.664433,0.640715,Itoshigo no Shouzou,"When she was seventeen, Angela fell pregnant—b...",A Natural Mother,愛し子の肖像
8715,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.047038,-0.201694,-0.434398,-0.235842,-0.115327,0.924569,0.895275,Yomoyama Tanpenshuu,"Set in a certain cafe, the unusual daily life ...",Yomoyama Short Stories,よもやま短編集
3373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.577213,-0.180339,-0.3776,-0.199231,-0.111866,0.317584,0.301302,Mekakucity V's,Mekakucity V's compiles more songs of the Kage...,Mekakucity V's,Mekakucity V's


In [54]:
train_final_df.columns

Index(['cat__type_Missing', 'cat__type_doujinshi', 'cat__type_light_novel',
       'cat__type_manga', 'cat__type_manhua', 'cat__type_manhwa',
       'cat__type_movie', 'cat__type_music', 'cat__type_novel',
       'cat__type_ona', 'cat__type_one_shot', 'cat__type_ova',
       'cat__type_special', 'cat__type_tv', 'cat__status_currently_airing',
       'cat__status_currently_publishing', 'cat__status_discontinued',
       'cat__status_finished', 'cat__status_finished_airing',
       'cat__status_not_yet_aired', 'cat__status_on_hiatus',
       'cat__source_4_koma_manga', 'cat__source_Missing', 'cat__source_book',
       'cat__source_card_game', 'cat__source_game', 'cat__source_light_novel',
       'cat__source_manga', 'cat__source_mixed_media', 'cat__source_music',
       'cat__source_novel', 'cat__source_original', 'cat__source_other',
       'cat__source_picture_book', 'cat__source_radio',
       'cat__source_visual_novel', 'cat__source_web_manga',
       'cat__source_web_novel', 'cat__s

In [56]:
# check for missing data 
print("Number of missing data in Training Set:\n")
print(train_final_df.isnull().sum())

Number of missing data in Training Set:

cat__type_Missing                   0
cat__type_doujinshi                 0
cat__type_light_novel               0
cat__type_manga                     0
cat__type_manhua                    0
cat__type_manhwa                    0
cat__type_movie                     0
cat__type_music                     0
cat__type_novel                     0
cat__type_ona                       0
cat__type_one_shot                  0
cat__type_ova                       0
cat__type_special                   0
cat__type_tv                        0
cat__status_currently_airing        0
cat__status_currently_publishing    0
cat__status_discontinued            0
cat__status_finished                0
cat__status_finished_airing         0
cat__status_not_yet_aired           0
cat__status_on_hiatus               0
cat__source_4_koma_manga            0
cat__source_Missing                 0
cat__source_book                    0
cat__source_card_game               0
cat__sour

In [57]:
# check for missing data 
print("Number of missing data in Testing Set:\n")
print(test_final_df.isnull().sum())

Number of missing data in Testing Set:

cat__type_Missing                   0
cat__type_doujinshi                 0
cat__type_light_novel               0
cat__type_manga                     0
cat__type_manhua                    0
cat__type_manhwa                    0
cat__type_movie                     0
cat__type_music                     0
cat__type_novel                     0
cat__type_ona                       0
cat__type_one_shot                  0
cat__type_ova                       0
cat__type_special                   0
cat__type_tv                        0
cat__status_currently_airing        0
cat__status_currently_publishing    0
cat__status_discontinued            0
cat__status_finished                0
cat__status_finished_airing         0
cat__status_not_yet_aired           0
cat__status_on_hiatus               0
cat__source_4_koma_manga            0
cat__source_Missing                 0
cat__source_book                    0
cat__source_card_game               0
cat__sourc

## Export necessary assets

In [61]:
import joblib

# train and test df for deep learning
joblib.dump(train, 'assets/train_deep_learning.joblib')
joblib.dump(test, 'assets/test_deep_learning.joblib')

# train and test df for traditional ML
joblib.dump(train_final_df, 'assets/train_traditional_ml.joblib')
joblib.dump(test_final_df, 'assets/test_traditional_ml.joblib')

['assets/test_deep_learning.joblib']

In [59]:
# store library version
# run every time before you commit
!pip freeze > requirements.txt