In [70]:
!where python

c:\Users\user\Desktop\MADS\696\696\env696\Scripts\python.exe
C:\Users\user\AppData\Local\Programs\Python\Python312\python.exe
C:\Users\user\AppData\Local\Microsoft\WindowsApps\python.exe


In [71]:
# run this code to make sure you install all the required libraries
# be sure you are in virtual environment before install, otherwise it will overwrite your local environment

# !pip install -r requirements.txt

In [72]:
import pandas as pd
import numpy as np
import re

In [73]:
pd.set_option("display.max_columns", None)
RANDOM_SEED = 123

In [74]:
df_anime = pd.read_csv("assets/anime.csv")
df_manga = pd.read_csv("assets/manga.csv")

In [75]:
df_anime.shape, df_manga.shape

((24985, 39), (64833, 30))

## Data Prepocessing

Some short synopsis contain no information about the story of manga/title. This will introduce noise to our model. Therefore, we decide to remove those rows with extremely short synopsis.
Example:
- Second season of Mao Zhi Ming.
- The second season of Shen Lan Qi Yu Wushuang Zhu.
- Recap episode of Hakyuu Houshin Engi.
- Fifth Season of Bungou Stray Dogs
- 1-3. Ba_ku\n4-5. Mephisto
- An absurd film by Kuri Youji.
- Included one-shot:\nBougainvillea
- A collection of oneshots by Nishida Higashi.
- A movie adaptation of the TV series.
- Short film by Kurosaka Keita.
- Special episodes added to DVDs and Blu-rays.
- Movie based on the 1996 TV anime with an original plot.
- Third season of Yuan Long

In [76]:
def data_cleaning(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # remove unnecessary columns
    df_anime = df_anime.drop(columns=['anime_id', 'total_duration', 'start_year', 'start_season', 'rating', 'main_picture', 'url', 'trailer_url', 'background', 'created_at', 'updated_at', 'episode_duration', 'broadcast_day', 'broadcast_time', 'licensors', 'title_synonyms', 'real_start_date', 'real_end_date'])
    df_manga = df_manga.drop(columns=['manga_id', 'main_picture', 'url', 'background', 'created_at_before', 'updated_at', 'title_synonyms', 'volumes', 'real_start_date', 'real_end_date'])

    # remove rows that are null in 'synopsis' and 'title', which are crucial for our project
    df_anime.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)
    df_manga.dropna(subset=['title', 'synopsis', 'title_english', 'title_japanese'], inplace=True)

    # remove '(Sources:...)' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\(Source:.*\)', '', x))

    # remove '[Written by ...]' from synopsis
    df_anime['synopsis'] = df_anime['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))
    df_manga['synopsis'] = df_manga['synopsis'].apply(lambda x: re.sub(r'\[Written by.*\]', '', x))

    # remove rows that have extreme short synopsis
    df_anime = df_anime[df_anime['synopsis'].apply(lambda x: len(x) > 50)]
    df_manga = df_manga[df_manga['synopsis'].apply(lambda x: len(x) > 50)]

    print('cleaned anime shape: ', df_anime.shape)
    print('cleaned manga shape: ', df_manga.shape)

    return df_anime, df_manga

df_anime_cleaned, df_manga_cleaned = data_cleaning(df_anime, df_manga)

cleaned anime shape:  (8863, 21)
cleaned manga shape:  (15454, 20)


In [77]:
print("anime extra columns: \n", [col for col in df_anime_cleaned.columns if not col in df_manga_cleaned.columns])

anime extra columns: 
 ['episodes', 'source', 'studios', 'producers']


In [78]:
print("manga extra columns: \n", [col for col in df_manga_cleaned.columns if not col in df_anime_cleaned.columns])

manga extra columns: 
 ['chapters', 'authors', 'serializations']


In [79]:
print("common columns: \n", [col for col in df_anime_cleaned.columns if col in df_manga_cleaned.columns])

common columns: 
 ['title', 'type', 'score', 'scored_by', 'status', 'start_date', 'end_date', 'members', 'favorites', 'sfw', 'approved', 'genres', 'themes', 'demographics', 'synopsis', 'title_english', 'title_japanese']


In [80]:
import ast

def extract_author_name(author_list):
    try:
        author_list = ast.literal_eval(author_list)
        
        # Extract first and last names of authors, ignoring the others
        author_names = [f"{author['first_name']} {author['last_name']}".strip() for author in author_list if author['first_name'] or author['last_name']]
        
        # Join names for multiple authors
        return ', '.join(author_names) if author_names else "Missing"
    
    except (ValueError, SyntaxError, KeyError, TypeError):
        # Handle Missing data
        return "Missing"

# Apply the function to the 'authors' column
def authors_extraction(input_manga):
    df_manga = input_manga.copy()
    df_manga['authors'] = df_manga['authors'].apply(extract_author_name)

    return df_manga

df_manga_extracted = authors_extraction(df_manga_cleaned)

In [81]:
df_manga_cleaned['authors'].head(1).values

array(["[{'id': 1868, 'first_name': 'Kentarou', 'last_name': 'Miura', 'role': 'Story & Art'}, {'id': 49592, 'first_name': '', 'last_name': 'Studio Gaga', 'role': 'Art'}]"],
      dtype=object)

In [82]:
df_manga_extracted['authors'].head(1).values

array(['Kentarou Miura, Studio Gaga'], dtype=object)

**Extra columns alignment**:
| anime columns | manga columns | strategy |
| --- | --- | --- |
| episodes | chapters | episodes/chapters |
| source | NULL | impute const 'Missing' |
| studios | authors | creators |
| producers | serializations | production_source |

In [83]:
def columns_alignment(input_anime, input_manga):
    df_anime = input_anime.copy()
    df_manga = input_manga.copy()

    # treat 'episodes' and 'chapters' the same, create null value for 'volume' in anime
    df_anime.rename(columns={'episodes': 'episodes/chapters'}, inplace=True)
    df_manga.rename(columns={'chapters': 'episodes/chapters'}, inplace=True)

    # Combine studios and authors together to get creators columns
    df_anime.rename(columns={'studios': 'creators'}, inplace=True)
    df_manga.rename(columns={'authors': 'creators'}, inplace=True)

    # Also for producers and serialization
    df_anime.rename(columns={'producers': 'production_source'}, inplace=True)
    df_manga.rename(columns={'serializations': 'production_source'}, inplace=True)

    # To distinguish where the data from
    df_anime['is_anime'] = 1
    df_manga['is_anime'] = 0 
    
    return df_anime, df_manga

df_anime_aligned, df_manga_aligned = columns_alignment(df_anime_cleaned, df_manga_extracted)

<span style="color:red"><b><<<<<<< local</b></span>

In [84]:
df_full = pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)

df_full.head()

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,source,members,favorites,sfw,approved,genres,themes,demographics,creators,production_source,synopsis,title_english,title_japanese,is_anime
0,Fullmetal Alchemist: Brotherhood,tv,9.1,2037075,finished_airing,64.0,2009-04-05,2010-07-04,manga,3206028,219036,True,True,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,1
1,Hunter x Hunter (2011),tv,9.04,1671587,finished_airing,148.0,2011-10-02,2014-09-24,manga,2688079,202109,True,True,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",Hunters devote themselves to accomplishing haz...,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,1
2,Shingeki no Kyojin Season 3 Part 2,tv,9.05,1491491,finished_airing,10.0,2019-04-29,2019-07-01,manga,2133927,55644,True,True,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,1
3,Steins;Gate,tv,9.07,1348232,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2463954,184312,True,True,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...,Steins;Gate,STEINS;GATE,1
4,Koe no Katachi,movie,8.94,1540277,finished_airing,1.0,2016-09-17,2016-09-17,manga,2218467,84124,True,True,"['Award Winning', 'Drama']",['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","As a wild youth, elementary school student Sho...",A Silent Voice,聲の形,1


In [85]:
df_full = pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)

df_full.head()

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,source,members,favorites,sfw,approved,genres,themes,demographics,creators,production_source,synopsis,title_english,title_japanese,is_anime
0,Fullmetal Alchemist: Brotherhood,tv,9.1,2037075,finished_airing,64.0,2009-04-05,2010-07-04,manga,3206028,219036,True,True,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,1
1,Hunter x Hunter (2011),tv,9.04,1671587,finished_airing,148.0,2011-10-02,2014-09-24,manga,2688079,202109,True,True,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",Hunters devote themselves to accomplishing haz...,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,1
2,Shingeki no Kyojin Season 3 Part 2,tv,9.05,1491491,finished_airing,10.0,2019-04-29,2019-07-01,manga,2133927,55644,True,True,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,1
3,Steins;Gate,tv,9.07,1348232,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2463954,184312,True,True,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...,Steins;Gate,STEINS;GATE,1
4,Koe no Katachi,movie,8.94,1540277,finished_airing,1.0,2016-09-17,2016-09-17,manga,2218467,84124,True,True,"['Award Winning', 'Drama']",['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","As a wild youth, elementary school student Sho...",A Silent Voice,聲の形,1


### Date Transformation

In [86]:
# Function to extract year and month
def extract_year_and_month(date):
    try:
        # Convert the date string to a datetime object
        datetime_date = pd.to_datetime(date, errors='raise')
        # Extract year and month
        return datetime_date.year, datetime_date.month
    except:
        # return NaN if fails
        return np.nan, np.nan
    
# Apply the function to the 'start_date' column
df_full[['start_year', 'start_month']] = df_full['start_date'].apply(lambda x: pd.Series(extract_year_and_month(x)))

# Apply the function to the 'end_date' column
df_full[['end_year', 'end_month']] = df_full['end_date'].apply(lambda x: pd.Series(extract_year_and_month(x)))

#remove useless columns
df_full = df_full.drop(columns=['start_date', 'end_date'])

In [87]:
# Function to transform Start or End month into season refer to Events of Anime
def month_to_season(month):
    # If the data is in range, return corresponding Season of events of Anime
    if month in [1, 2, 3]:
        return 'Winter'
    elif month in [4, 5, 6]:
        return 'Spring'
    elif month in [7, 8, 9]:
        return 'Summer'
    elif month in [10, 11, 12]:
        return 'Autumn'
    else:
        return np.nan  # Handle unexpected cases, though this shouldn't occur with valid months

# Apply the function to transform the month value to season categories
df_full['start_season'] = df_full['start_month'].apply(month_to_season)
df_full['end_season'] = df_full['end_month'].apply(month_to_season)

# convert year to int
df_full['start_year'] = df_full['start_year'].astype('Int64')
df_full['end_year'] = df_full['end_year'].astype('Int64')

# remove useless columns
df_full = df_full.drop(columns=['start_month', 'end_month'])

# Check for Year and Season feature
df_full[['start_year', 'end_year', 'start_season', 'end_season']].head()

Unnamed: 0,start_year,end_year,start_season,end_season
0,2009,2010,Spring,Summer
1,2011,2014,Autumn,Summer
2,2019,2019,Spring,Summer
3,2011,2011,Spring,Summer
4,2016,2016,Summer,Summer


### Train Test data split

In [88]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)
print('Number of rows in train set: ', len(train))

Number of rows in train set:  19453


In [89]:
# check for missing data 
print("Number of missing data in Training Set:\n")
print(train.isnull().sum())

Number of missing data in Training Set:

title                    0
type                     7
score                 5969
scored_by                0
status                   0
episodes/chapters     3887
source               12989
members                  0
favorites                0
sfw                      0
approved                 0
genres                   0
themes                   0
demographics             0
creators                 0
production_source        0
synopsis                 0
title_english            0
title_japanese           0
is_anime                 0
start_year             328
end_year              3478
start_season           328
end_season            3478
dtype: int64


In [90]:
# check for missing data 
print("Number of missing data in Testing Set:\n")
print(test.isnull().sum())

Number of missing data in Testing Set:

title                   0
type                    1
score                1494
scored_by               0
status                  0
episodes/chapters     970
source               3239
members                 0
favorites               0
sfw                     0
approved                0
genres                  0
themes                  0
demographics            0
creators                0
production_source       0
synopsis                0
title_english           0
title_japanese          0
is_anime                0
start_year             75
end_year              875
start_season           75
end_season            875
dtype: int64


### Handle Missing data

In [96]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_selector

# Single-category columns
string_columns = ['type', 'status', 'source', 'sfw', 'approved', 'start_season', 'end_season']
# Multi-category columns  
list_columns = ['genres', 'themes', 'demographics', 'creators', 'production_source']  

single_category_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(sparse_output=False))
])

# Define the pipeline for multi-category columns
multi_category_pipeline = Pipeline(steps=[
    ('mlb', MultiLabelBinarizer())
])

# Combine both pipelines in a ColumnTransformer
pipeline = ColumnTransformer([
    ('single_category', single_category_pipeline, string_columns),
    ('multi_category', multi_category_pipeline, list_columns)
], remainder='passthrough')  # Pass through all other columns

# Fit the pipeline to the training set and transform it
train_processed = pipeline.fit_transform(train)
# Transform the test data using the same pipeline
test_processed = pipeline.transform(test)

TypeError: MultiLabelBinarizer.fit_transform() takes 2 positional arguments but 3 were given

In [278]:
"""from sklearn.impute import SimpleImputer

# Non-numerical columns
Non_Numerical = ['type', 'source', 'start_season', 'end_season']

# Create an imputer for non-numerical data filling with 'Missing'
categorical_imputer = SimpleImputer(strategy='constant', fill_value='Missing')

# Impute non-numerical columns
train[Non_Numerical] = categorical_imputer.fit_transform(train[Non_Numerical])"""


In [284]:
"""from sklearn.preprocessing import OneHotEncoder

# Columns need for one-hot encoding
# contain single category
string_columns = ['type', 'status', 'source', 'sfw', 'approved']
# contain multi categories
list_columns = ['genres', 'themes', 'demographics', 'creators', 'production_source']

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first') 

# Fit and transform single-category columns
encoded_single = onehot_encoder.fit_transform(train[string_columns])

# Create DataFrame for one-hot encoded columns pf string columns
encoded_single_category_df = pd.DataFrame(encoded_single, columns = onehot_encoder.get_feature_names_out(string_columns))"""

In [None]:
"""# Function to convert list columns into one-hot encoded columns
def one_hot_encode_list_column(df, column_name):
    # Create a DataFrame with each list exploded into individual rows
    df_exploded = df[column_name].explode()
    
    # Create dummy variables for the exploded column (one-hot encoding)
    dummies = pd.get_dummies(df_exploded, prefix=column_name)
    
    # Re-group the dummies back into the original rows by summing up the dummy columns
    one_hot_encoded = dummies.groupby(df.index).sum()
    
    return one_hot_encoded
"""

In [None]:
"""from sklearn.impute import KNNImputer

# Numerical columns
Numerical = ['score', 'scored_by', 'episodes/chapters', 'members', 'favorites', 'start_year' , 'end_year']

# Create an imputer for numerical data using KNNImputer
knn_imputer = KNNImputer(n_neighbors=3)

# Impute numerical columns
train[Numerical] = knn_imputer.fit_transform(train[Numerical])

# Check for result
print("Number of missing data in Training Set:\n")
print(train.isnull().sum())"""

### Data Normalization

In [23]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Selecting columns to normalize
columns_to_normalize = ["scored_by", "episodes/chapters", "members", "favorites"]

earliest_start_year = train['start_year'].min()

# function for Normalizing the selected columns
def Normalizing(data):
    data_normalized = data.copy()
    data_normalized[columns_to_normalize] = scaler.fit_transform(train[columns_to_normalize])

    # Calculate the duration by subtracting the earliest start year from all years
    data_normalized['elapsed_start_time '] = data_normalized['start_year'] - earliest_start_year
    data_normalized['elapsed_end_time '] = data_normalized['end_year'] - earliest_start_year

    data_normalized = data_normalized.drop(columns=['start_year', 'end_year'])

    return data_normalized

train_normalized = Normalizing(train)
train_normalized.head()

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,source,members,favorites,sfw,approved,genres,themes,demographics,creators,production_source,synopsis,title_english,title_japanese,is_anime,start_season,end_season,elapsed_start_time,elapsed_end_time
7048,Toyama Kankou Anime Project,special,5.25,-0.198164,finished_airing,-0.394637,original,-0.22816,-0.11492,True,True,['Slice of Life'],['Historical'],[],['P.A. Works'],['The Berich'],The Toyama Kankou Anime Project is a visualiza...,Toyama Tourism Anime Project,富山観光アニメプロジェクト,1,Spring,Spring,92.0,92.0
2307,Slayers: The Motion Picture,movie,7.27,-0.065672,finished_airing,-0.496178,light_novel,-0.08636,-0.10983,True,True,"['Adventure', 'Comedy', 'Fantasy']",[],[],['J.C.Staff'],"['Kadokawa Shoten', 'Marubeni']",In this prequel movie to the Slayers televison...,Slayers: The Motion Picture,劇場版スレイヤーズ,1,Summer,Summer,78.0,78.0
9709,Arte,manga,7.91,-0.170972,currently_publishing,0.810314,Missing,-0.159651,-0.06993,True,True,[],"['Historical', 'Visual Arts']",['Seinen'],Kei Ookubo,['Comic Zenon'],"It is early 16th-century Italy, and the city o...",Arte,アルテ,0,Autumn,Missing,96.0,101.0
23892,Maou to Yuusha no Tatakai no Ura de: Game Seka...,light_novel,6.556667,-0.202049,currently_publishing,-0.083245,Missing,-0.236136,-0.115327,True,True,"['Action', 'Fantasy']","['Isekai', 'Reincarnation', 'Video Game']",[],"Sanshouuo, Yuuki Suzuki",[],A modern man is reincarnated into the world of...,Reincarnated into a Game as the Hero's Friend:...,魔王と勇者の戦いの裏で ～ゲーム世界に転生したけど友人の勇者が魔王討伐に旅立ったあとの国内お...,0,Winter,Missing,105.0,104.666667
15217,The Devil's Temptation,manhwa,6.98,-0.199772,finished,1.73772,Missing,-0.232042,-0.11492,False,True,"['Boys Love', 'Erotica', 'Supernatural']",[],[],Youn,['Lezhin Comics Webtoon'],Hyun was supposed to spend Christmas snuggling...,The Devil's Temptation,악마의 유혹,0,Spring,Winter,101.0,105.0


## Export necessary assets

In [29]:
import joblib

# train and test df
joblib.dump(train, 'assets/train.joblib')
joblib.dump(test, 'assets/test.joblib')

['assets/skipgram_model_synopsis.joblib']

In [30]:
# store library version
# run every time before you commit
!pip freeze > requirements.txt