In [204]:
!where python

c:\Users\user\Desktop\MADS\696\696\env696\Scripts\python.exe
C:\Users\user\AppData\Local\Programs\Python\Python312\python.exe
C:\Users\user\AppData\Local\Microsoft\WindowsApps\python.exe


In [205]:
# Run this cell to install all the required libraries.
# Make sure your virtual environment is active before installing; otherwise the packages will be installed into (and may overwrite) your local environment.

# !pip install -r requirements.txt

In [206]:
import pandas as pd
import numpy as np
import re

In [207]:
# Notebook-wide configuration.
# Single seed reused by the train/test split and both Word2Vec models.
RANDOM_SEED = 123
# Show every column when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [208]:
# Load the two raw tables from the local assets folder.
df_anime, df_manga = (
    pd.read_csv("assets/anime.csv"),
    pd.read_csv("assets/manga.csv"),
)

In [209]:
# (rows, columns) of the raw anime and manga tables, before any cleaning.
df_anime.shape, df_manga.shape

((24985, 39), (64833, 30))

## Data Preprocessing

Some very short synopses contain no information about the story of the anime/manga title, which would introduce noise into our model. Therefore, we remove the rows whose synopsis is extremely short.
Example:
- Second season of Mao Zhi Ming.
- The second season of Shen Lan Qi Yu Wushuang Zhu.
- Recap episode of Hakyuu Houshin Engi.
- Fifth Season of Bungou Stray Dogs
- 1-3. Ba_ku\n4-5. Mephisto
- An absurd film by Kuri Youji.
- Included one-shot:\nBougainvillea
- A collection of oneshots by Nishida Higashi.
- A movie adaptation of the TV series.
- Short film by Kurosaka Keita.
- Special episodes added to DVDs and Blu-rays.
- Movie based on the 1996 TV anime with an original plot.
- Third season of Yuan Long

In [210]:
def data_cleaning(input_anime, input_manga, min_synopsis_length=50):
    """Drop unused columns, incomplete rows and uninformative synopses.

    The inputs are never modified; cleaned copies are returned.

    Steps applied to each table:
      1. drop the columns that are not used by the project;
      2. drop rows with a null 'title', 'synopsis', 'title_english' or
         'title_japanese';
      3. strip '(Source: ...)' and '[Written by ...]' credits from the synopsis;
      4. drop rows whose remaining synopsis has `min_synopsis_length`
         characters or fewer.

    Parameters
    ----------
    input_anime, input_manga : pandas.DataFrame
        Raw tables. They must contain every column listed in the drop lists
        below, otherwise `DataFrame.drop` raises KeyError.
    min_synopsis_length : int, default 50
        A synopsis is kept only if its length is strictly greater than this.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Cleaned anime table and cleaned manga table (original index kept).
    """
    anime_unused = ['anime_id', 'total_duration', 'start_year', 'start_season', 'rating', 'main_picture', 'url', 'trailer_url', 'background', 'created_at', 'updated_at', 'episode_duration', 'broadcast_day', 'broadcast_time', 'licensors', 'title_synonyms']
    manga_unused = ['manga_id', 'main_picture', 'url', 'background', 'created_at_before', 'updated_at', 'title_synonyms']
    required = ['title', 'synopsis', 'title_english', 'title_japanese']

    # The credit patterns stop at the FIRST closing delimiter on the line.
    # The previous greedy `.*` ran to the LAST ')' / ']' on the line and
    # therefore deleted real synopsis text that followed the credit.
    # One level of nested parentheses inside '(Source: ...)' is tolerated.
    credit_patterns = (
        re.compile(r'\(Source:(?:[^()\n]|\([^()\n]*\))*\)'),
        re.compile(r'\[Written by[^\[\]\n]*\]'),
    )

    def _clean(frame, unused_columns):
        # drop/dropna return new frames, so the caller's frame is untouched
        cleaned = frame.drop(columns=unused_columns).dropna(subset=required)

        synopsis = cleaned['synopsis']
        for pattern in credit_patterns:
            synopsis = synopsis.apply(lambda text, p=pattern: p.sub('', text))
        cleaned = cleaned.assign(synopsis=synopsis)

        # remove rows that have an extremely short synopsis
        long_enough = cleaned['synopsis'].apply(lambda text: len(text) > min_synopsis_length)
        return cleaned[long_enough]

    df_anime = _clean(input_anime, anime_unused)
    df_manga = _clean(input_manga, manga_unused)

    print('cleaned anime shape: ', df_anime.shape)
    print('cleaned manga shape: ', df_manga.shape)

    return df_anime, df_manga

# Clean both raw tables; data_cleaning works on copies, so df_anime / df_manga stay untouched.
df_anime_cleaned, df_manga_cleaned = data_cleaning(df_anime, df_manga)

cleaned anime shape:  (8863, 23)
cleaned manga shape:  (15454, 23)


In [211]:
# Columns that exist only in the cleaned anime table.
print("anime extra columns: \n", [col for col in df_anime_cleaned.columns if col not in df_manga_cleaned.columns])

anime extra columns: 
 ['episodes', 'source', 'studios', 'producers']


In [212]:
# Columns that exist only in the cleaned manga table.
print("manga extra columns: \n", [col for col in df_manga_cleaned.columns if col not in df_anime_cleaned.columns])

manga extra columns: 
 ['volumes', 'chapters', 'authors', 'serializations']


In [213]:
# Columns shared by both cleaned tables, listed in anime column order.
shared_with_manga = set(df_manga_cleaned.columns)
print("common columns: \n", [col for col in df_anime_cleaned.columns if col in shared_with_manga])

common columns: 
 ['title', 'type', 'score', 'scored_by', 'status', 'start_date', 'end_date', 'members', 'favorites', 'sfw', 'approved', 'real_start_date', 'real_end_date', 'genres', 'themes', 'demographics', 'synopsis', 'title_english', 'title_japanese']


In [214]:
# Distinct values of 'type' in the raw (uncleaned) manga table.
df_manga['type'].unique()

array(['manga', 'manhwa', 'light_novel', 'one_shot', 'manhua', 'novel',
       'doujinshi'], dtype=object)

In [215]:
# Distinct values of 'source' in the raw (uncleaned) anime table.
df_anime['source'].unique()

array(['manga', 'visual_novel', 'original', 'web_manga', 'light_novel',
       'novel', 'game', '4_koma_manga', 'music', 'other', 'web_novel',
       'card_game', 'book', 'mixed_media', nan, 'picture_book', 'radio'],
      dtype=object)

In [216]:
# Distinct values of 'type' in the raw (uncleaned) anime table.
df_anime['type'].unique()

array(['tv', 'movie', 'ona', 'ova', 'special', 'music', nan], dtype=object)

**Extra columns alignment**:
| anime columns | manga columns | combine |
| --- | --- | --- |
| episodes | chapters | episodes/chapters |
| NULL | volumes |
| source | type | original_source |
| type | NULL |
| total_duration | NULL |
| rating | NULL |
| start_year | NULL |
| start_season | NULL |
| studios | NULL |
| producers | NULL |
| NULL | authors |
| NULL | serializations |

In [217]:
import ast

def extract_author_name(author_list):
    """Turn a list of author records into a readable, comma-separated string.

    Parameters
    ----------
    author_list : str or list of dict
        Either the Python-literal text of a list of dicts (as stored in the
        CSV) or the already-parsed list. Every dict must have the keys
        'first_name' and 'last_name'; other keys are ignored.

    Returns
    -------
    str
        "First Last" for each author that has at least one non-empty name
        part, joined with ', '. Returns "Missing" when no name can be
        extracted: unparseable text, NaN/None, an empty list, or a record
        without the two name keys.
    """
    try:
        # Only text needs parsing; an already-parsed list is used as is
        # (previously it was rejected by literal_eval and became "Missing").
        if isinstance(author_list, str):
            author_list = ast.literal_eval(author_list)

        author_names = []
        for author in author_list:
            # Keep only the name parts that are actually present, so a None
            # part is not rendered as the literal text "None".
            parts = [
                str(part).strip()
                for part in (author['first_name'], author['last_name'])
                if part
            ]
            full_name = ' '.join(part for part in parts if part)
            if full_name:
                author_names.append(full_name)

        # Join names for multiple authors
        return ', '.join(author_names) if author_names else "Missing"

    except (ValueError, SyntaxError, KeyError, TypeError):
        # Malformed or missing data
        return "Missing"

# Apply the function to the 'authors' column
def authors_extraction(input_manga):
    """Return a copy of `input_manga` whose 'authors' column holds readable names.

    Every value of 'authors' is passed through `extract_author_name`; the
    input frame itself is left unchanged.
    """
    readable_authors = input_manga['authors'].apply(extract_author_name)
    return input_manga.assign(authors=readable_authors)

# Replace the stringified author records of the cleaned manga table with readable names.
df_manga_extracted = authors_extraction(df_manga_cleaned)

In [218]:
def columns_alignment(input_anime, input_manga):
    """Give the anime and manga tables a shared set of column names.

    Equivalent columns are renamed to a common name so the two tables can be
    stacked, and an 'is_anime' flag (1 for anime, 0 for manga) records where
    each row came from. The inputs are not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The aligned anime table and the aligned manga table.
    """
    anime_renames = {
        'episodes': 'episodes/chapters',    # episodes and chapters are treated alike
        'studios': 'creators',              # studios and authors both become creators
        'producers': 'production_source',   # producers and serializations
    }
    manga_renames = {
        'chapters': 'episodes/chapters',
        'authors': 'creators',
        'serializations': 'production_source',
    }

    # rename() and assign() both return new frames, so no explicit copy is needed
    aligned_anime = input_anime.rename(columns=anime_renames).assign(is_anime=1)
    aligned_manga = input_manga.rename(columns=manga_renames).assign(is_anime=0)

    return aligned_anime, aligned_manga

# Align the cleaned anime table with the manga table whose authors were already extracted.
df_anime_aligned, df_manga_aligned = columns_alignment(df_anime_cleaned, df_manga_extracted)

In [219]:
# Stack anime rows on top of manga rows with a fresh 0..n-1 index, then
# drop the common columns that are not needed.
df_full = (
    pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)
    .drop(columns=['real_start_date', 'real_end_date', 'volumes'])
)

df_full.head()

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,source,members,favorites,sfw,approved,genres,themes,demographics,creators,production_source,synopsis,title_english,title_japanese,is_anime
0,Fullmetal Alchemist: Brotherhood,tv,9.1,2037075,finished_airing,64.0,2009-04-05,2010-07-04,manga,3206028,219036,True,True,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,1
1,Hunter x Hunter (2011),tv,9.04,1671587,finished_airing,148.0,2011-10-02,2014-09-24,manga,2688079,202109,True,True,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",Hunters devote themselves to accomplishing haz...,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,1
2,Shingeki no Kyojin Season 3 Part 2,tv,9.05,1491491,finished_airing,10.0,2019-04-29,2019-07-01,manga,2133927,55644,True,True,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,1
3,Steins;Gate,tv,9.07,1348232,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2463954,184312,True,True,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...,Steins;Gate,STEINS;GATE,1
4,Koe no Katachi,movie,8.94,1540277,finished_airing,1.0,2016-09-17,2016-09-17,manga,2218467,84124,True,True,"['Award Winning', 'Drama']",['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","As a wild youth, elementary school student Sho...",A Silent Voice,聲の形,1


### Date Transformation

In [220]:
# Function to extract year and month
def extract_year_and_month(date):
    """Return the (year, month) of a single date value.

    Parameters
    ----------
    date : str or date-like scalar
        Value to parse with `pd.to_datetime`.

    Returns
    -------
    tuple
        (year, month) as integers, or (np.nan, np.nan) when the value is
        missing (None / NaN / NaT) or cannot be parsed as a date.
    """
    try:
        # Convert the date string to a datetime object
        datetime_date = pd.to_datetime(date, errors='raise')
    except (ValueError, TypeError, OverflowError):
        # Only parsing failures mean "no date". A bare `except:` would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        return np.nan, np.nan

    # pd.to_datetime maps None to None and NaN to NaT
    if datetime_date is None or datetime_date is pd.NaT:
        return np.nan, np.nan

    # Extract year and month
    return datetime_date.year, datetime_date.month
    
# Split 'start_date' into numeric year and month columns. Each value is parsed
# on its own by extract_year_and_month, so a missing or unparseable date becomes NaN.
df_full[['start_year', 'start_month']] = df_full['start_date'].apply(lambda x: pd.Series(extract_year_and_month(x)))

# Same decomposition for the 'end_date' column
df_full[['end_year', 'end_month']] = df_full['end_date'].apply(lambda x: pd.Series(extract_year_and_month(x)))

# The raw date columns are no longer needed once year and month are extracted.
# NOTE(review): df_full is rebound without the date columns, so re-running this
# cell without rebuilding df_full first raises KeyError on 'start_date'.
df_full = df_full.drop(columns=['start_date', 'end_date'])

In [221]:
# Function to transform Start or End month into season refer to Events of Anime
def month_to_season(month):
    """Map a month number (1-12) to its anime season name.

    Returns 'Winter' for months 1-3, 'Spring' for 4-6, 'Summer' for 7-9 and
    'Autumn' for 10-12. Any other value, including NaN, yields np.nan.
    """
    season_months = (
        ('Winter', (1, 2, 3)),
        ('Spring', (4, 5, 6)),
        ('Summer', (7, 8, 9)),
        ('Autumn', (10, 11, 12)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Not a valid month (e.g. NaN for a missing date)
    return np.nan

# Derive the season categories from the month columns, then drop the months,
# which are no longer needed.
df_full = df_full.assign(
    start_season=df_full['start_month'].apply(month_to_season),
    end_season=df_full['end_month'].apply(month_to_season),
).drop(columns=['start_month', 'end_month'])

# Check the year and season features
df_full[['start_year', 'end_year', 'start_season', 'end_season']].head()

Unnamed: 0,start_year,end_year,start_season,end_season
0,2009.0,2010.0,Spring,Summer
1,2011.0,2014.0,Autumn,Summer
2,2019.0,2019.0,Spring,Summer
3,2011.0,2011.0,Spring,Summer
4,2016.0,2016.0,Summer,Summer


### Train Test data split

In [252]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the seed keeps the split reproducible.
train, test = train_test_split(
    df_full,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
print('Number of rows in train set: ', len(train))

Number of rows in train set:  19453


In [254]:
# Per-column count of missing values in the training split
print("Number of missing data in Training Set:\n")
print(train.isna().sum())

Number of missing data in Training Set:

title                    0
type                     7
score                 5969
scored_by                0
status                   0
episodes/chapters     3887
source               12989
members                  0
favorites                0
sfw                      0
approved                 0
genres                   0
themes                   0
demographics             0
creators                 0
production_source        0
synopsis                 0
title_english            0
title_japanese           0
is_anime                 0
start_year             328
end_year              3478
start_season           328
end_season            3478
dtype: int64


In [245]:
# Per-column count of missing values in the testing split
print("Number of missing data in Testing Set:\n")
print(test.isna().sum())

Number of missing data in Testing Set:

title                   0
type                    1
score                1494
scored_by               0
status                  0
episodes/chapters     970
start_date             75
end_date              875
source               3239
members                 0
favorites               0
sfw                     0
approved                0
genres                  0
themes                  0
demographics            0
creators                0
production_source       0
synopsis                0
title_english           0
title_japanese          0
is_anime                0
start_year             75
start_month            75
end_year              875
end_month             875
dtype: int64


### Handle Missing data

In [255]:
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# Column groups, by the imputation strategy they receive
Non_Numerical = ['type', 'source', 'start_season', 'end_season']
Numerical = ['score', 'scored_by', 'episodes/chapters', 'members', 'favorites', 'start_year' , 'end_year']

# Categorical gaps are filled with the explicit label 'Missing'
categorical_imputer = SimpleImputer(strategy='constant', fill_value='Missing')
# Numerical gaps are estimated from the 3 nearest rows
knn_imputer = KNNImputer(n_neighbors=3)

# NOTE(review): both imputers are fitted on, and applied to, the training split
# only; nothing in this cell transforms `test`.
train[Non_Numerical] = categorical_imputer.fit_transform(train[Non_Numerical])

# NOTE(review): the KNN distances are computed on the raw, unscaled columns
# (scaling happens in a later cell), so large-valued columns such as 'members'
# presumably dominate the neighbour search - confirm this is intended.
train[Numerical] = knn_imputer.fit_transform(train[Numerical])

# Check the result: every count should now be zero
print("Number of missing data in Training Set:\n")
print(train.isnull().sum())

Number of missing data in Training Set:

title                0
type                 0
score                0
scored_by            0
status               0
episodes/chapters    0
source               0
members              0
favorites            0
sfw                  0
approved             0
genres               0
themes               0
demographics         0
creators             0
production_source    0
synopsis             0
title_english        0
title_japanese       0
is_anime             0
start_year           0
end_year             0
start_season         0
end_season           0
dtype: int64


### Data Normalization

In [256]:
from sklearn.preprocessing import StandardScaler

# Selecting columns to normalize
columns_to_normalize = ["scored_by", "episodes/chapters", "members", "favorites"]

# Fit the scaler ONCE, on the training split only. Normalizing() then only
# applies it, so any other split is scaled with the training statistics.
scaler = StandardScaler().fit(train[columns_to_normalize])

# Reference year for the elapsed-time features (taken from the training split)
earliest_start_year = train['start_year'].min()

def Normalizing(data):
    """Return a normalized copy of `data`.

    * The columns in `columns_to_normalize` are standardised with the
      module-level `scaler`, which was fitted on the training split.
    * 'start_year' / 'end_year' are replaced by 'elapsed_start_time' /
      'elapsed_end_time': the number of years since `earliest_start_year`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `columns_to_normalize`, 'start_year' and 'end_year'.

    Returns
    -------
    pandas.DataFrame
        A new frame; `data` itself is not modified.
    """
    data_normalized = data.copy()

    # Transform the frame that was passed in. The previous version refitted the
    # scaler on the global `train` and wrote train's values into `data`, which
    # was only correct when `data` happened to be `train`.
    data_normalized[columns_to_normalize] = scaler.transform(data[columns_to_normalize])

    # Calculate the duration by subtracting the earliest start year from all years.
    # The column names no longer carry the accidental trailing space.
    data_normalized['elapsed_start_time'] = data_normalized['start_year'] - earliest_start_year
    data_normalized['elapsed_end_time'] = data_normalized['end_year'] - earliest_start_year

    return data_normalized.drop(columns=['start_year', 'end_year'])

# NOTE(review): the later cells of this notebook keep working on `train`, not on
# `train_normalized` - confirm where the normalized frame is meant to be used.
train_normalized = Normalizing(train)
train_normalized.head()

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,source,members,favorites,sfw,approved,genres,themes,demographics,creators,production_source,synopsis,title_english,title_japanese,is_anime,start_season,end_season,elapsed_start_time,elapsed_end_time
13870,Souzai Saishuka no Isekai Ryokouki,manga,6.53,-0.178503,currently_publishing,-0.285907,Missing,-0.185451,-0.111569,True,True,"['Adventure', 'Fantasy']","['Isekai', 'Reincarnation']",[],"Tomozo, Masuo Kinoko",['AlphaPolis Web Manga'],Takeru Kamishiro is an ordinary middle aged sa...,A Gatherer's Adventure in Isekai,素材採取家の異世界旅行記,0,Autumn,Missing,100.0,92.333333
8825,Kingyo no Isshou,movie,6.583333,-0.202222,finished_airing,-0.462556,original,-0.236163,-0.114968,True,True,['Slice of Life'],[],[],[],[],An elementary school student named Mitsui won ...,Short and Happy Life of a Goldfish,金魚の一生,1,Winter,Winter,76.0,76.0
6178,Takara-sagashi,movie,6.17,-0.199299,finished_airing,-0.462556,picture_book,-0.224216,-0.114968,True,True,"['Adventure', 'Fantasy']",[],['Kids'],['Studio Ghibli'],[],Short film shown only in the Ghibli Museum in ...,Treasure Hunting,たからさがし,1,Spring,Spring,94.0,94.0
5372,Nakedyouth,ona,5.69,-0.162739,finished_airing,-0.462556,original,-0.187413,-0.112969,True,True,"['Boys Love', 'Romance', 'Slice of Life', 'Spo...",[],[],[],['Kojiro Shishido Animation Works'],"In Nakedyouth, Shishido takes us on a journey ...",Naked Youth,Nakedyouth,1,Winter,Winter,89.0,89.0
15978,Yu☆Gi☆Oh! GX Tokubetsu-hen,one_shot,6.55,-0.197994,finished,-0.462556,Missing,-0.230989,-0.114568,True,True,"['Adventure', 'Comedy', 'Fantasy']","['School', 'Strategy Game']",['Shounen'],"Kazuki Takahashi, Naoyuki Kageyama",['V-Jump'],A special one-shot released in commemoration o...,Yu-Gi-Oh! GX,遊☆戯☆王GX特別編,0,Spring,Spring,97.0,97.0


### Text processing

Tokenize and lemmatize the text, keeping only tokens whose part of speech is in `{'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}` and replacing named entities with a placeholder.

In [57]:
# tokenization and lemmatization
import spacy

# Parts of speech whose lemmas are kept by tokenization(); every other tag is dropped.
POS_TO_KEEP = {'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}

# spaCy pipeline used for tagging, lemmatisation and named-entity recognition
nlp = spacy.load('en_core_web_sm')

# Stop-word set shipped with the loaded pipeline's language defaults
stop_words = nlp.Defaults.stop_words

def tokenization(doc):
    """Convert a parsed spaCy document into a list of cleaned tokens.

    For every token, in order:
      * purely numeric tokens are dropped;
      * tokens inside a PERSON, ORG or GPE entity are replaced by the
        placeholder '<PERSON>', '<ORG>' or '<GPE>' (one placeholder per
        token, so a multi-word entity yields several placeholders);
      * otherwise the lemma is kept if the token's POS is in `POS_TO_KEEP`
        and the lemma is not a stop word.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose `is_digit`, `ent_type_`, `pos_` and `lemma_`.

    Returns
    -------
    list of str
    """
    tokens = []

    for token in doc:
        if token.is_digit:
            # filter out numeric tokens
            continue

        if token.ent_type_ in ('PERSON', 'ORG', 'GPE'):
            # replace person, organization, and location entities
            tokens.append(f'<{token.ent_type_}>')

        elif token.pos_ in POS_TO_KEEP and token.lemma_.lower() not in stop_words:
            # The stop-word check is done on the lower-cased lemma, so a
            # capitalised stop word (e.g. a title-cased "The" whose lemma
            # keeps its case) is filtered too. The emitted token keeps its
            # original case.
            tokens.append(token.lemma_)

    return tokens

# Tokenise titles and synopses; nlp.pipe processes the texts in batches
train['title_en_token'] = list(map(
    tokenization,
    nlp.pipe(train['title_english'], batch_size=100, n_process=-1),
))
train['synopsis_token'] = list(map(
    tokenization,
    nlp.pipe(train['synopsis'], batch_size=100, n_process=-1),
))

# Keep only the rows where BOTH token lists are non-empty
train = train[
    train['title_en_token'].map(len).gt(0)
    & train['synopsis_token'].map(len).gt(0)
]

print('Number of rows after filtering empty token: ', len(train))

Number of rows after filtering empty token:  19309


In [58]:
# Review the tokenization on one example row. The draw is seeded so the
# displayed row is the same on every run.
with pd.option_context('display.max_colwidth', None):
    display(train[['title_english', 'title_en_token', 'synopsis', 'synopsis_token']].sample(1, random_state=RANDOM_SEED))

Unnamed: 0,title_english,title_en_token,synopsis,synopsis_token
16373,Marked By King Bs,"[Marked, King, Bs]","High school is hard enough without a target on your back, but that's exactly the situation Annie finds herself in when she crosses a group of the most popular kids in school. Marked by the king bee himself, the notorious Ashton Griffin, Annie becomes his newest fixation—and he is determined to make her life miserable. Now at his beck and call, Annie must stay on Ashton's good side to maintain her peaceful life and avoid becoming a social pariah. As she navigates her way through alienating social cliques, persistent old crushes, and the hot upstairs neighbor who never puts a shirt on, Annie will soon learn that there's more to being popular than meets the eye. She just wanted to live a normal life, but maybe there's no escaping these king bees.\r\n\r\n","[high, school, hard, target, exactly, situation, <PERSON>, find, cross, group, popular, kid, school, mark, king, bee, notorious, <PERSON>, <PERSON>, <PERSON>, new, fixation, determined, life, miserable, beck, <PERSON>, stay, <ORG>, good, maintain, peaceful, life, avoid, social, pariah, navigate, way, alienate, social, clique, persistent, old, crush, hot, upstairs, neighbor, shirt, <PERSON>, soon, learn, popular, meet, eye, want, live, normal, life, maybe, escape, king, bee]"


Since titles and synopses play different roles (titles are short and often genre-indicative, while synopses provide detailed content descriptions), we use two separate vectorizers.

Titles are shorter and often contain rare, context-rich words that are crucial for capturing unique meaning, while synopses are longer and contain more common words, making them less reliant on rare vocabulary. So, we use a higher `max_features` or `vector_size` for titles to ensure they capture these niche terms and their relationships, and a lower one for synopses to focus on the more frequently relevant words.

In [59]:
# tfidf

from sklearn.feature_extraction.text import TfidfVectorizer

# Title vectorizer: unigrams only, vocabulary size left unlimited
tfidf_title = TfidfVectorizer(
    ngram_range=(1,1),  # uni-gram
    min_df=1,           # don't filter rare words as they are important for title
    max_df=0.8,         # filter out very common words
)

# Synopsis vectorizer: unigrams and bigrams, capped vocabulary
tfidf_synopsis = TfidfVectorizer(
    ngram_range=(1,2),  # uni-gram or bi-gram
    max_features=2000,  # focus on the more frequently relevant words
    min_df=2,           # filter out extremely rare words
    max_df=0.8,         # filter out very common words
)

# Fit each vectorizer on the space-joined token lists of the training split
title_en_tfidf_matrix = tfidf_title.fit_transform(train['title_en_token'].map(" ".join))
synopsis_tfidf_matrix = tfidf_synopsis.fit_transform(train['synopsis_token'].map(" ".join))

# Store one sparse row per record next to the other features in train
train['title_en_tfidf'] = [
    title_en_tfidf_matrix[row] for row in range(title_en_tfidf_matrix.shape[0])
]
train['synopsis_tfidf'] = [
    synopsis_tfidf_matrix[row] for row in range(synopsis_tfidf_matrix.shape[0])
]

For word embeddings, we choose **Word2Vec - Skip-gram** because it tends to capture rare words more effectively (e.g. niche anime/manga-specific vocabulary) compared to Word2Vec - CBOW and GloVe.

In [60]:
# word embedding
import multiprocessing
from gensim.models import Word2Vec

# dynamically determine the number of CPU cores
num_workers = multiprocessing.cpu_count()

# NOTE(review): `seed` alone presumably does not make multi-worker Word2Vec
# training bit-for-bit reproducible - confirm against the gensim documentation
# if exact reproducibility of the embeddings is required.

# train title skipgram model
skipgram_model_title = Word2Vec(
    train['title_en_token'].tolist(),
    sg=1,               # skip-gram
    vector_size=300,    # title use higher dim
    window=2,           # title use smaller window size
    min_count=1,        # titles may contain rare but important words
    epochs=30,          # title are shorter, need more epochs to train
    workers=num_workers,
    seed=RANDOM_SEED
)

# train synopsis skipgram model
skipgram_model_synopsis = Word2Vec(
    train['synopsis_token'].tolist(),
    sg=1,               # skip-gram
    vector_size=150,    # synopsis use lower dim
    window=5,           # synopsis use larger window size
    min_count=2,        # filter out extremely rare words
    epochs=15,
    workers=num_workers,
    seed=RANDOM_SEED
)

# clean tokens that does not exist in the skipgram vocab (because of `min_count`)
model_vocab = set(skipgram_model_synopsis.wv.index_to_key)
train['synopsis_token'] = train['synopsis_token'].apply(lambda x: [token for token in x if token in model_vocab])

# Drop the rows whose synopsis has no in-vocabulary token left. The TF-IDF
# matrices were built row-for-row from `train`, so the SAME rows are removed
# from them; otherwise the exported matrices would no longer line up with the
# exported `train` frame.
has_synopsis_tokens = train['synopsis_token'].apply(len).gt(0).to_numpy()
train = train[has_synopsis_tokens].copy()  # copy: new columns are assigned just below
title_en_tfidf_matrix = title_en_tfidf_matrix[has_synopsis_tokens]
synopsis_tfidf_matrix = synopsis_tfidf_matrix[has_synopsis_tokens]
assert title_en_tfidf_matrix.shape[0] == synopsis_tfidf_matrix.shape[0] == len(train)

# apply skipgram model: each row gets one embedding vector per token
train['title_en_skipgram'] = train['title_en_token'].apply(lambda x: skipgram_model_title.wv[x])
train['synopsis_skipgram'] = train['synopsis_token'].apply(lambda x: skipgram_model_synopsis.wv[x])

In [61]:
# Preview the engineered text features on a few rows (seeded so the preview is reproducible)
train[['title_japanese', 'title_english', 'synopsis', 'title_en_tfidf', 'title_en_skipgram', 'synopsis_tfidf', 'synopsis_skipgram']].sample(5, random_state=RANDOM_SEED)

Unnamed: 0,title_japanese,title_english,synopsis,title_en_tfidf,title_en_skipgram,synopsis_tfidf,synopsis_skipgram
23943,愛を知った一週間,Chosen As The Frenchman's Bride,"After losing her father at a young age, Jane g...","(0, 5365)\t0.20308269236008522\n (0, 1298)\...","[[-0.050672267, -0.07246053, 0.14060606, -0.11...","(0, 1990)\t0.12745880457602696\n (0, 969)\t...","[[0.37020886, 0.024892427, -0.081325725, -0.54..."
20488,ふたりぽっち,The Two Daughters,"Kaoru has to stay with Reiko's family, a girl ...","(0, 1793)\t1.0","[[-0.03223555, -0.11463614, 0.18719757, -0.009...","(0, 1224)\t0.26511441322114115\n (0, 523)\t...","[[-0.05985926, 0.18823454, -0.102404416, -0.12..."
12351,奪う者 奪われる者,Bereave or Bereaved,"Seto Yu is a 12-year-old boy, despite being ve...","(0, 731)\t0.7071067811865476\n (0, 732)\t0....","[[-0.009359805, -0.022338165, 0.03214639, -0.0...","(0, 1968)\t0.12348505189125379\n (0, 162)\t...","[[-0.05985926, 0.18823454, -0.102404416, -0.12..."
23146,まんがグリム童話 血と悦楽に溺れた実在姫君たち,Real Princess who Indulge in Blood & Lust,1-2. Last Emperor no Tsuma: Enyou Kougou\r\n3....,"(0, 5365)\t0.5041341482361837\n (0, 5886)\t...","[[-0.02228411, -0.12334833, 0.120474376, -0.08...","(0, 1224)\t0.6939879716446256\n (0, 1290)\t...","[[-0.17345911, 0.00636432, -0.08250168, -0.332..."
2815,王ドロボウ JING in Seventh Heaven,Jing: King of Bandits - Seventh Heaven,"Jing, the infamous King of Bandits, finds hims...","(0, 4020)\t0.31691060798982196\n (0, 3364)\...","[[-0.03958812, -0.035999063, 0.12214554, 4.222...","(0, 1968)\t0.11052129494573408\n (0, 1152)\...","[[0.051790062, 0.025802549, -0.02216094, -0.11..."


## Export necessary assets

In [62]:
import joblib

# train and test df
# NOTE(review): `train` carries the imputed values and the token / TF-IDF /
# skip-gram columns added above, while `test` is saved as produced by the split;
# none of those steps are applied to it in this notebook.
# NOTE(review): `train_normalized`, the two imputers and the scaler are not
# written out here - confirm whether downstream code needs them.
joblib.dump(train, 'assets/train.joblib')
joblib.dump(test, 'assets/test.joblib')

# tfidf_matrix: sparse TF-IDF matrices fitted on the training titles and synopses
joblib.dump(title_en_tfidf_matrix, 'assets/title_en_tfidf_matrix.joblib')
joblib.dump(synopsis_tfidf_matrix, 'assets/synopsis_tfidf_matrix.joblib')

# tfidf vectorizer: the fitted vectorizers, so that new text can be transformed the same way
joblib.dump(tfidf_title, 'assets/tfidf_title_vectorizer.joblib')
joblib.dump(tfidf_synopsis, 'assets/tfidf_synopsis_vectorizer.joblib')

# skipgram: the trained Word2Vec models for titles and synopses
joblib.dump(skipgram_model_title, 'assets/skipgram_model_title.joblib')
joblib.dump(skipgram_model_synopsis, 'assets/skipgram_model_synopsis.joblib')

['assets/skipgram_model_synopsis.joblib']

In [63]:
# store library version
# run every time before you commit
!pip freeze > requirements.txt