In [1]:
!where python

c:\Users\HK-Laptop-V639\Documents\GitHub\696\env696\Scripts\python.exe
C:\Users\HK-Laptop-V639\AppData\Local\Programs\Python\Python312\python.exe
C:\Users\HK-Laptop-V639\AppData\Local\Microsoft\WindowsApps\python.exe


In [2]:
# run this code to make sure you install all the required libraries
# be sure you are in virtual environment before install, otherwise it will overwrite your local environment

# !pip install -r requirements.txt

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option("display.max_columns", None)
# Single seed reused by the train/test split and the Word2Vec models below.
RANDOM_SEED = 123

In [5]:
# Raw anime and manga tables; the paths are relative to the notebook's working directory.
df_anime = pd.read_csv("assets/anime.csv")
df_manga = pd.read_csv("assets/manga.csv")

In [6]:
df_anime.shape, df_manga.shape

((24985, 39), (64833, 30))

## Data Preprocessing

Some very short synopses contain no information about the story of the anime/manga title and would only add noise to our model. Therefore, we remove rows whose synopsis is extremely short.
Example:
- Second season of Mao Zhi Ming.
- The second season of Shen Lan Qi Yu Wushuang Zhu.
- Recap episode of Hakyuu Houshin Engi.
- Fifth Season of Bungou Stray Dogs
- 1-3. Ba_ku\n4-5. Mephisto
- An absurd film by Kuri Youji.
- Included one-shot:\nBougainvillea
- A collection of oneshots by Nishida Higashi.
- A movie adaptation of the TV series.
- Short film by Kurosaka Keita.
- Special episodes added to DVDs and Blu-rays.
- Movie based on the 1996 TV anime with an original plot.
- Third season of Yuan Long

In [7]:
def data_cleaning(input_anime, input_manga, min_synopsis_len=50):
    """
    Drop unused columns, incomplete rows and uninformative synopses.

    Parameters
    ----------
    input_anime, input_manga : pd.DataFrame
        Raw anime / manga tables. Neither frame is modified.
    min_synopsis_len : int, default 50
        A row is kept only when its cleaned synopsis is longer than this
        number of characters.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The cleaned anime frame and the cleaned manga frame.
    """
    # columns that are not used anywhere in the project
    anime_unused = ['anime_id', 'rating', 'main_picture', 'url', 'trailer_url', 'background', 'created_at', 'updated_at', 'episode_duration', 'broadcast_day', 'broadcast_time', 'licensors', 'title_synonyms']
    manga_unused = ['manga_id', 'main_picture', 'url', 'background', 'created_at_before', 'updated_at', 'title_synonyms']

    # a row needs all of these to be usable (titles in every language plus the synopsis)
    required = ['title', 'synopsis', 'title_english', 'title_japanese']

    # The negated character classes stop at the FIRST closing bracket. A greedy
    # `.*` would run to the LAST bracket on the line and delete real synopsis
    # text that happens to sit between the credit and a later parenthesis.
    source_credit = re.compile(r'\(Source:[^)\n]*\)')
    written_by_credit = re.compile(r'\[Written by[^\]\n]*\]')

    def _strip_credits(text):
        """Remove '(Source: ...)' and '[Written by ...]' notes and surrounding whitespace."""
        text = source_credit.sub('', text)
        text = written_by_credit.sub('', text)
        # whitespace left behind by a removed credit must not count as content
        return text.strip()

    def _clean(raw, unused):
        """Apply the column drop, null drop, credit removal and length filter to one frame."""
        df = (
            raw.drop(columns=unused)
            .dropna(subset=required)
            .assign(synopsis=lambda d: d['synopsis'].apply(_strip_credits))
        )
        # remove rows that have an extremely short synopsis
        return df[df['synopsis'].apply(len) > min_synopsis_len]

    df_anime = _clean(input_anime, anime_unused)
    df_manga = _clean(input_manga, manga_unused)

    print('cleaned anime shape: ', df_anime.shape)
    print('cleaned manga shape: ', df_manga.shape)

    return df_anime, df_manga

df_anime_cleaned, df_manga_cleaned = data_cleaning(df_anime, df_manga)

cleaned anime shape:  (8862, 26)
cleaned manga shape:  (15447, 23)


In [8]:
# columns that exist only in the cleaned anime frame, in the anime frame's column order
print("anime extra columns: \n", [col for col in df_anime_cleaned.columns if col not in df_manga_cleaned.columns])

anime extra columns: 
 ['episodes', 'source', 'total_duration', 'start_year', 'start_season', 'studios', 'producers']


In [9]:
# columns that exist only in the cleaned manga frame, in the manga frame's column order
print("manga extra columns: \n", [col for col in df_manga_cleaned.columns if col not in df_anime_cleaned.columns])

manga extra columns: 
 ['volumes', 'chapters', 'authors', 'serializations']


In [10]:
# columns shared by both cleaned frames, in the anime frame's column order
shared_with_manga = set(df_manga_cleaned.columns)
print("common columns: \n", [col for col in df_anime_cleaned.columns if col in shared_with_manga])

common columns: 
 ['title', 'type', 'score', 'scored_by', 'status', 'start_date', 'end_date', 'members', 'favorites', 'sfw', 'approved', 'real_start_date', 'real_end_date', 'genres', 'themes', 'demographics', 'synopsis', 'title_english', 'title_japanese']


In [11]:
df_manga['type'].unique()

array(['manga', 'manhwa', 'light_novel', 'one_shot', 'manhua', 'novel',
       'doujinshi'], dtype=object)

In [12]:
df_anime['source'].unique()

array(['manga', 'visual_novel', 'original', 'web_manga', 'light_novel',
       'novel', 'game', '4_koma_manga', 'music', 'other', 'web_novel',
       'card_game', 'book', 'mixed_media', nan, 'picture_book', 'radio'],
      dtype=object)

In [13]:
df_anime['type'].unique()

array(['tv', 'movie', 'ona', 'ova', 'special', 'music', nan], dtype=object)

**Extra columns alignment**:
| anime columns | manga columns | combine |
| --- | --- | --- |
| episodes | chapters | episodes/chapters |
| NULL | volumes |
| source | type | original_source |
| type | NULL |
| total_duration | NULL |
| rating | NULL |
| start_year | NULL |
| start_season | NULL |
| studios | NULL |
| producers | NULL |
| NULL | authors |
| NULL | serializations |

In [14]:
import ast

def extract_author_name(author_list):
    """
    Turn a raw `authors` cell into a comma-separated string of author names.

    Parameters
    ----------
    author_list : str or list
        Either the string repr of a list of dicts (as read from the CSV) or
        the already-parsed list. Each dict must provide 'first_name' and
        'last_name'; any other keys are ignored.

    Returns
    -------
    str
        "First Last" for each author that has at least one name part, joined
        with ", ". Returns "Missing" when the value cannot be parsed, is not a
        list of such dicts, or contains no usable name.
    """
    try:
        # only strings need parsing; an already-parsed list is used as is
        if isinstance(author_list, str):
            author_list = ast.literal_eval(author_list)

        author_names = []
        for author in author_list:
            # `or ''` keeps a None name part from being rendered as the text "None"
            first_name = author['first_name'] or ''
            last_name = author['last_name'] or ''
            full_name = f"{first_name} {last_name}".strip()
            # skip authors whose name is empty or whitespace only
            if full_name:
                author_names.append(full_name)

        # Join names for multiple authors
        return ', '.join(author_names) if author_names else "Missing"

    except (ValueError, SyntaxError, KeyError, TypeError):
        # unparsable text, NaN, a non-iterable value, or dicts without the name keys
        return "Missing"

# Apply the function to the 'authors' column
def authors_extraction(input_manga):
    """
    Return a copy of `input_manga` whose 'authors' column holds the strings
    produced by `extract_author_name`; the input frame is left unchanged.
    """
    readable_authors = input_manga['authors'].apply(extract_author_name)
    return input_manga.assign(authors=readable_authors)

df_manga_extracted = authors_extraction(df_manga_cleaned)

In [15]:
def columns_alignment(input_anime, input_manga):
    """
    Give the anime and manga frames a common vocabulary of column names.

    Renames (anime column / manga column -> shared name):
      - episodes / chapters        -> 'episodes/chapters'
      - studios / authors          -> 'creators'
      - producers / serializations -> 'production_source'

    An 'is_anime' flag (1 for anime rows, 0 for manga rows) is appended so the
    origin of each row stays known. Both inputs are left unmodified; new frames
    are returned as (anime, manga).
    """
    shared_names = ('episodes/chapters', 'creators', 'production_source')
    anime_renames = dict(zip(('episodes', 'studios', 'producers'), shared_names))
    manga_renames = dict(zip(('chapters', 'authors', 'serializations'), shared_names))

    aligned_anime = input_anime.rename(columns=anime_renames).assign(is_anime=1)
    aligned_manga = input_manga.rename(columns=manga_renames).assign(is_anime=0)

    return aligned_anime, aligned_manga

df_anime_aligned, df_manga_aligned = columns_alignment(df_anime_cleaned, df_manga_extracted)

In [16]:
# Stack the anime rows on top of the manga rows; ignore_index=True renumbers the rows from 0.
# Columns that exist in only one of the two frames are kept and hold NaN for the other frame's rows.
df_full = pd.concat([df_anime_aligned, df_manga_aligned], ignore_index=True)
df_full

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,source,members,favorites,total_duration,sfw,approved,start_year,start_season,real_start_date,real_end_date,genres,themes,demographics,creators,production_source,synopsis,title_english,title_japanese,is_anime,volumes
0,Fullmetal Alchemist: Brotherhood,tv,9.10,2037075,finished_airing,64.0,2009-04-05,2010-07-04,manga,3206028,219036,1 days 01:57:20,True,True,2009.0,spring,2009-04-05,2010-07-04,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Military'],['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,1,
1,Hunter x Hunter (2011),tv,9.04,1671587,finished_airing,148.0,2011-10-02,2014-09-24,manga,2688079,202109,2 days 10:15:16,True,True,2011.0,fall,2011-10-02,2014-09-24,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",Hunters devote themselves to accomplishing haz...,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,1,
2,Shingeki no Kyojin Season 3 Part 2,tv,9.05,1491491,finished_airing,10.0,2019-04-29,2019-07-01,manga,2133927,55644,0 days 03:59:10,True,True,2019.0,spring,2019-04-29,2019-07-01,"['Action', 'Drama']","['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,1,
3,Steins;Gate,tv,9.07,1348232,finished_airing,24.0,2011-04-06,2011-09-14,visual_novel,2463954,184312,0 days 09:44:00,True,True,2011.0,spring,2011-04-06,2011-09-14,"['Drama', 'Sci-Fi', 'Suspense']","['Psychological', 'Time Travel']",[],['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...,Steins;Gate,STEINS;GATE,1,
4,Koe no Katachi,movie,8.94,1540277,finished_airing,1.0,2016-09-17,2016-09-17,manga,2218467,84124,0 days 02:10:03,True,True,2016.0,summer,2016-09-17,2016-09-17,"['Award Winning', 'Drama']",['Romantic Subtext'],['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","As a wild youth, elementary school student Sho...",A Silent Voice,聲の形,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24304,Bijo to Yajuu?,manga,,1,finished,,2015-07-01,2015-07-01,,4,0,,True,True,,,2015-07-01,2015-07-01,['Romance'],[],[],"Carol Devine, JET",[],Born to a noble politician in a prominent poli...,Beauty and the Beastmaster,美女と野獣?,0,1.0
24305,Madarame Shunin wa Ecchi de Zurui!,manga,,1,finished,16.0,2019-04-12,2020-07-15,,4,0,,False,True,,,2019-04-12,2020-07-15,['Erotica'],[],['Josei'],Roca Katou,[],"""If you can get me excited, then I'll give you...",No Fair! Chief Madarame Is a Pervert! I'll Do ...,班目主任はエッチでずるい!,0,2.0
24306,PSO2 New Genesis: Central!,manga,,1,currently_publishing,,2021-09-15,,,4,0,,True,True,,,2021-09-15,,"['Comedy', 'Sci-Fi']",[],[],Fudechin,[],Story following the exploits and updates in Ph...,Central!,PSO2 ニュージェネシス せんとらるっ!,0,
24307,Itazura na Ai no Shisha,manga,,1,finished,,2017-09-30,2017-09-30,,4,0,,True,True,,,2017-09-30,2017-09-30,['Romance'],[],['Josei'],"Lucy Gordon, Shizuku Katsuragi",['Bessatsu Harlequin'],"Ellie, who works at a law firm, is at her wits...",Expecting the Fellani Heir,いたずらな愛の使者,0,1.0


### Handle Missing data

In [17]:
from sklearn.impute import KNNImputer, SimpleImputer

# --- categorical columns: a missing value becomes the literal label 'Missing' ---
Non_Numerical = ['type', 'source']
categorical_imputer = SimpleImputer(strategy='constant', fill_value='Missing')
df_full[Non_Numerical] = categorical_imputer.fit_transform(df_full[Non_Numerical])

# --- numerical columns: a missing value is estimated from the 3 nearest rows ---
# NOTE(review): the imputer is fitted on the whole table, i.e. before the
# train/test split further down, so test rows take part in filling train values.
# NOTE(review): the columns are not scaled before the KNN distance is computed,
# so large-valued columns will dominate the neighbour search - confirm intended.
Numerical = ['score', 'scored_by', 'episodes/chapters', 'members', 'favorites']
knn_imputer = KNNImputer(n_neighbors=3)
df_full[Numerical] = knn_imputer.fit_transform(df_full[Numerical])

# remaining null count per column, to confirm the imputed columns are complete
df_full.isna().sum()


title                    0
type                     0
score                    0
scored_by                0
status                   0
episodes/chapters        0
start_date             403
end_date              4353
source                   0
members                  0
favorites                0
total_duration       15613
sfw                      0
approved                 0
start_year           16019
start_season         16019
real_start_date        403
real_end_date         4353
genres                   0
themes                   0
demographics             0
creators                 0
production_source        0
synopsis                 0
title_english            0
title_japanese           0
is_anime                 0
volumes              13248
dtype: int64

In [18]:
# train test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; RANDOM_SEED makes the partition repeatable.
# NOTE(review): df_full was already KNN-imputed on all rows before this split,
# so imputed values in `train` were computed with the test rows present.
train, test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)
print('Number of rows in train set: ', len(train))

Number of rows in train set:  19447


### Data Normalization

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Selecting columns to normalize
# ('score' is not in this list, so it keeps its original scale)
columns_to_normalize = ["scored_by", "episodes/chapters", "members", "favorites"]

# Normalizing the selected columns
# The scaler is fitted on the training rows only. `train` itself is left untouched;
# the scaled values are written into the `train_normalized` copy.
# NOTE(review): the later cells shown in this notebook keep working on `train`,
# not on `train_normalized` - confirm where the scaled frame is meant to be used.
train_normalized = train.copy()
train_normalized[columns_to_normalize] = scaler.fit_transform(train[columns_to_normalize])
train_normalized

Unnamed: 0,title,type,score,scored_by,status,episodes/chapters,start_date,end_date,source,members,favorites,total_duration,sfw,approved,start_year,start_season,real_start_date,real_end_date,genres,themes,demographics,creators,production_source,synopsis,title_english,title_japanese,is_anime,volumes
7048,Toyama Kankou Anime Project,special,5.250000,-0.198164,finished_airing,-0.393546,2009-06-17,2009-06-25,original,-0.228160,-0.114920,0 days 00:30:00,True,True,2009.0,spring,2009-06-17,2009-06-25,['Slice of Life'],['Historical'],[],['P.A. Works'],['The Berich'],The Toyama Kankou Anime Project is a visualiza...,Toyama Tourism Anime Project,富山観光アニメプロジェクト,1,
2307,Slayers: The Motion Picture,movie,7.270000,-0.065672,finished_airing,-0.494581,1995-07-29,1995-07-29,light_novel,-0.086360,-0.109830,0 days 01:04:44,True,True,1995.0,summer,1995-07-29,1995-07-29,"['Adventure', 'Comedy', 'Fantasy']",[],[],['J.C.Staff'],"['Kadokawa Shoten', 'Marubeni']",In this prequel movie to the Slayers televison...,Slayers: The Motion Picture,劇場版スレイヤーズ,1,
9709,Arte,manga,7.910000,-0.170972,currently_publishing,0.805399,2013-10-25,,Missing,-0.159651,-0.069930,,True,True,,,2013-10-25,,[],"['Historical', 'Visual Arts']",['Seinen'],Kei Ookubo,['Comic Zenon'],"It is early 16th-century Italy, and the city o...",Arte,アルテ,0,
23892,Maou to Yuusha no Tatakai no Ura de: Game Seka...,light_novel,6.556667,-0.202049,currently_publishing,-0.373339,2022-03-25,,Missing,-0.236136,-0.115327,,True,True,,,2022-03-25,,"['Action', 'Fantasy']","['Isekai', 'Reincarnation', 'Video Game']",[],"Sanshouuo, Yuuki Suzuki",[],A modern man is reincarnated into the world of...,Reincarnated into a Game as the Hero's Friend:...,魔王と勇者の戦いの裏で ～ゲーム世界に転生したけど友人の勇者が魔王討伐に旅立ったあとの国内お...,0,
15217,The Devil's Temptation,manhwa,6.980000,-0.199772,finished,1.728183,2018-05-25,2022-01-14,Missing,-0.232042,-0.114920,,False,True,,,2018-05-25,2022-01-14,"['Boys Love', 'Erotica', 'Supernatural']",[],[],Youn,['Lezhin Comics Webtoon'],Hyun was supposed to spend Christmas snuggling...,The Devil's Temptation,악마의 유혹,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15377,Alice in Junk Box,manga,6.990000,-0.199981,finished,-0.252098,2014-03-22,2014-03-22,Missing,-0.231654,-0.113698,,True,True,,,2014-03-22,2014-03-22,['Fantasy'],['Reverse Harem'],['Shoujo'],Mamenosuke Fujimaru,[],"Short stories of the Alice franchise, also fea...",Alice in the Country of Hearts: Junk Box Stories,Alice in Junk Box ～藤丸豆ノ介アリスシリーズ短編集～,0,1.0
21602,Ninshin x5,manga,6.583333,-0.201882,finished,-0.433960,2012-11-22,2015-09-05,Missing,-0.235649,-0.115327,,False,True,,,2012-11-22,2015-09-05,['Hentai'],[],[],Akira Kabashima,['Comic Mugen Tensei'],1. Isekai kara Sayounara\n2. Maid no Gakkou\n3...,Pregnancy x5,妊娠×5,0,1.0
17730,Eclipse,manga,6.620000,-0.200848,finished,-0.312718,2011-11-10,2013-07-17,Missing,-0.233855,-0.115123,,False,True,,,2011-11-10,2013-07-17,['Hentai'],[],[],Menou Kuroiwa,['Manga Bangaichi'],1. Eclipse\n2-4. Tsuki wa Mujihi na Yoru no Jo...,Eclipse,イクリプス,0,1.0
15725,World Apartment Horror,manga,6.490000,-0.196378,finished,-0.433960,1991-01-01,1991-01-01,Missing,-0.217859,-0.114716,,True,True,,,1991,1991,"['Action', 'Comedy', 'Horror', 'Mystery', 'Sli...","['Mythology', 'Samurai']",['Seinen'],"Katsuhiro Otomo, Satoshi Kon",['Young Magazine (Monthly)'],Collection of four stories by Satoshi Kon:\n\n...,World Apartment Horror,ワールドアパートメントホラー,0,1.0


### Text processing

Tokenize and lemmatize the text, keeping only tokens whose part of speech is in `{'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}` and replacing named entities with placeholders.

In [20]:
# tokenization and lemmatization
import spacy

# Universal POS tags whose lemmas are kept by `tokenization`; everything else is discarded.
POS_TO_KEEP = {'NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV'}

# Small English pipeline; it supplies the POS tags, lemmas and entity labels read below.
nlp = spacy.load('en_core_web_sm')

# Default stop-word set of the loaded language, used to drop uninformative lemmas.
stop_words = nlp.Defaults.stop_words

def tokenization(doc):
    """
    Convert a parsed spaCy document into a list of token strings.

    Rules, applied per token in this order:
      1. a token made up only of digits is dropped;
      2. a token labelled PERSON, ORG or GPE is replaced by '<PERSON>', '<ORG>'
         or '<GPE>' (one placeholder per token of the entity);
      3. any other token contributes its lemma when its POS tag is in
         `POS_TO_KEEP` and the lemma is not in `stop_words`.
    """
    placeholder_labels = ('PERSON', 'ORG', 'GPE')
    kept = []

    for tok in doc:
        if tok.is_digit:
            continue

        label = tok.ent_type_
        if label in placeholder_labels:
            kept.append(f'<{label}>')
            continue

        if tok.pos_ not in POS_TO_KEEP:
            continue

        lemma = tok.lemma_
        if lemma not in stop_words:
            kept.append(lemma)

    return kept

# use nlp.pipe for batch processing
# n_process=-1 asks spaCy to parallelise the pipeline over the available CPU cores
train['title_en_token'] = list(
    map(tokenization, nlp.pipe(train['title_english'], batch_size=100, n_process=-1))
)
train['synopsis_token'] = list(
    map(tokenization, nlp.pipe(train['synopsis'], batch_size=100, n_process=-1))
)

# a row is only usable when BOTH its title and its synopsis produced at least one token
train = train[train['title_en_token'].map(len).gt(0) & train['synopsis_token'].map(len).gt(0)]

print('Number of rows after filtering empty token: ', len(train))

Number of rows after filtering empty token:  19301


In [21]:
# review tokenization
# Show one example row with untruncated text. random_state pins the sampled row,
# so the displayed example is the same on every Restart & Run All.
with pd.option_context('display.max_colwidth', None):
    display(train[['title_english', 'title_en_token', 'synopsis', 'synopsis_token']].sample(1, random_state=RANDOM_SEED))

Unnamed: 0,title_english,title_en_token,synopsis,synopsis_token
1990,The Squid Girl OVA,"[Squid, Girl, OVA]","Original anime DVD bundled with the 12th, 14th, and 17th volumes of the manga.","[original, anime, dvd, bundle, 12th, 17th, volume, manga]"


Since titles and synopses play different roles (titles are short and often genre-indicative, while synopses provide detailed content descriptions), we use two separate vectorizers.

Titles are shorter and often contain rare, context-rich words that are crucial for capturing unique meaning, while synopses are longer and contain more common words, making them less reliant on rare vocabulary. So, we use a higher `max_features` or `vector_size` for titles to ensure they capture these niche terms and their relationships, and a lower one for synopses to focus on the more frequently relevant words.

In [22]:
# tfidf

from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): both vectorizers use scikit-learn's default analyzer, which
# lower-cases the text and keeps only word-character runs of two or more
# characters. A placeholder such as '<PERSON>' is therefore presumably indexed
# as 'person' - confirm this is acceptable.

# Titles: unigrams only, unlimited vocabulary. Rare words are kept (min_df=1)
# because they matter for titles; terms in more than 80% of titles are dropped.
tfidf_title = TfidfVectorizer(ngram_range=(1, 1), min_df=1, max_df=0.8)

# Synopses: unigrams and bigrams, capped at the 2000 most frequent terms. Terms
# seen in fewer than 2 documents or in more than 80% of documents are dropped.
tfidf_synopsis = TfidfVectorizer(ngram_range=(1, 2), max_features=2000, min_df=2, max_df=0.8)

# the vectorizers expect plain strings, so every token list is re-joined with spaces
title_en_tfidf_matrix = tfidf_title.fit_transform(train['title_en_token'].map(" ".join))
synopsis_tfidf_matrix = tfidf_synopsis.fit_transform(train['synopsis_token'].map(" ".join))

# keep each row's own 1 x n_features sparse vector next to its record
train['title_en_tfidf'] = [title_en_tfidf_matrix[row] for row in range(title_en_tfidf_matrix.shape[0])]
train['synopsis_tfidf'] = [synopsis_tfidf_matrix[row] for row in range(synopsis_tfidf_matrix.shape[0])]

For word embeddings, we choose **Word2Vec - Skip-gram** because it tends to capture rare words (e.g. niche anime/manga-specific vocabulary) more effectively than Word2Vec - CBOW and GloVe.

In [23]:
# word embedding
import multiprocessing
from gensim.models import Word2Vec

# dynamically determine the number of CPU cores
num_workers = multiprocessing.cpu_count()

# NOTE(review): per the gensim documentation, `seed` alone only gives fully
# reproducible vectors with a single worker thread (and a fixed PYTHONHASHSEED).
# With `workers=num_workers` the vectors may differ slightly between runs.

# train title skipgram model
skipgram_model_title = Word2Vec(
    train['title_en_token'].tolist(),
    sg=1,               # skip-gram
    vector_size=300,    # title use higher dim
    window=2,           # title use smaller window size
    min_count=1,        # titles may contain rare but important words
    epochs=30,          # title are shorter, need more epochs to train
    workers=num_workers,
    seed=RANDOM_SEED
)

# train synopsis skipgram model
skipgram_model_synopsis = Word2Vec(
    train['synopsis_token'].tolist(),
    sg=1,               # skip-gram
    vector_size=150,    # synopsis use lower dim
    window=5,           # synopsis use larger window size
    min_count=2,        # filter out extremely rare words
    epochs=15,
    workers=num_workers,
    seed=RANDOM_SEED
)

# drop synopsis tokens that did not make it into the skip-gram vocabulary (because of `min_count`)
model_vocab = set(skipgram_model_synopsis.wv.index_to_key)
train = train.assign(
    synopsis_token=train['synopsis_token'].apply(lambda x: [token for token in x if token in model_vocab])
)

# rows whose synopsis has no in-vocabulary token left cannot be embedded
keep_rows = train['synopsis_token'].apply(len).gt(0).to_numpy()

# .copy() makes `train` an independent frame, so the column assignments below
# do not raise SettingWithCopyWarning
train = train[keep_rows].copy()

# The TF-IDF matrices were built from `train` before this filter. Dropping the
# same rows keeps row i of each matrix describing train.iloc[i].
title_en_tfidf_matrix = title_en_tfidf_matrix[keep_rows]
synopsis_tfidf_matrix = synopsis_tfidf_matrix[keep_rows]

# apply skipgram model: each cell holds an (n_tokens x vector_size) array, one vector per token
train['title_en_skipgram'] = train['title_en_token'].apply(lambda x: skipgram_model_title.wv[x])
train['synopsis_skipgram'] = train['synopsis_token'].apply(lambda x: skipgram_model_synopsis.wv[x])

In [24]:
# random_state pins the sampled rows, so the preview is the same on every Restart & Run All
train[['title_japanese', 'title_english', 'synopsis', 'title_en_tfidf', 'title_en_skipgram', 'synopsis_tfidf', 'synopsis_skipgram']].sample(5, random_state=RANDOM_SEED)

Unnamed: 0,title_japanese,title_english,synopsis,title_en_tfidf,title_en_skipgram,synopsis_tfidf,synopsis_skipgram
15563,大人になっても,Becoming the adult.,1-2. Otona ni Nattemo\n3. Kimi to Fumou na Lov...,"(0, 112)\t1.0","[[-0.09924946, -0.07319166, 0.1075789, -0.0808...","(0, 1149)\t0.12195531882731071\n (0, 712)\t...","[[0.17589037, -0.19478656, 0.22825679, -0.0996..."
14624,킬 더 드래곤,Kill the Dragon,The human race plunged into a war against drag...,"(0, 2128)\t0.6581688308428325\n (0, 4022)\t...","[[-0.06586574, -0.28722557, 0.095829286, -0.22...","(0, 712)\t0.04413061168050446\n (0, 284)\t0...","[[0.34266984, -0.8500777, 0.05640744, -0.30880..."
9966,BM ネクタール,BioMeat: Nectar,Japan was in need of food. Bio-engineers had t...,"(0, 5387)\t0.2568160053943383\n (0, 767)\t0...","[[0.0033345725, 0.007895966, -0.0041998257, 0....","(0, 1149)\t0.0684607796686462\n (0, 712)\t0...","[[0.17589037, -0.19478656, 0.22825679, -0.0996..."
4541,女子かう生,Joshi Kausei,Momoko Futo is an average high-school girl goi...,"(0, 5631)\t1.0","[[-0.054694336, -0.3946445, 0.0029606859, -0.2...","(0, 1149)\t0.1265606622906114\n (0, 1216)\t...","[[0.07261991, -0.13106517, 0.23346084, 0.12240..."
21077,弟達の追跡事情,Brothers Chasing Situation,A boy gets angry when discover that his brothe...,"(0, 978)\t0.5481339475583463\n (0, 6979)\t0...","[[-0.100997485, -0.05481459, 0.119421735, -0.0...","(0, 1920)\t0.18208233017206646\n (0, 644)\t...","[[-0.16524214, -0.18854012, -0.06676363, -0.14..."


## Export necessary assets

In [27]:
import joblib

# train and test df
# `train` carries the token, TF-IDF and skip-gram columns added above; `test` is
# saved as it came out of the split (none of those columns were added to it).
# NOTE(review): `train_normalized` and the fitted `scaler` are not written by
# this cell - confirm whether a downstream notebook needs them.
joblib.dump(train, 'assets/train.joblib')
joblib.dump(test, 'assets/test.joblib')

# tfidf_matrix
joblib.dump(title_en_tfidf_matrix, 'assets/title_en_tfidf_matrix.joblib')
joblib.dump(synopsis_tfidf_matrix, 'assets/synopsis_tfidf_matrix.joblib')

# tfidf vectorizer
# fitted vectorizers, so new text can be transformed with the same vocabulary
joblib.dump(tfidf_title, 'assets/tfidf_title_vectorizer.joblib')
joblib.dump(tfidf_synopsis, 'assets/tfidf_synopsis_vectorizer.joblib')

# skipgram
joblib.dump(skipgram_model_title, 'assets/skipgram_model_title.joblib')
joblib.dump(skipgram_model_synopsis, 'assets/skipgram_model_synopsis.joblib')

['assets/skipgram_model_synopsis.joblib']

In [26]:
# store library version
# run every time before you commit
!pip freeze > requirements.txt