In [2]:
# importing packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# loading the datasets

titles = pd.read_csv('../assets/titles_200p_cleaned.csv')
tags = pd.read_csv('../assets/tags_200p.csv')

In [4]:
titles.head(2)

Unnamed: 0,title_id,title_english,title_romaji,type,duration,start_year,chapters,volume,publishing_status,country,...,Sci-Fi,Slice of Life,Sports,Supernatural,Thriller,title_romaji_type,synopsis_cleaned,synopsis_source,synopsis_wc,synopsis_cleaned_token
0,30002,Berserk,Berserk,MANGA,,1989.0,,,RELEASING,JP,...,0,0,0,0,0,Berserk_MANGA,His name Guts Black Swordsman feared warrior s...,Dark Horse,425,"['name', 'feared', 'warrior', 'spoken', 'whisp..."
1,31706,,JoJo no Kimyou na Bouken: Steel Ball Run,MANGA,,2004.0,95.0,24.0,FINISHED,JP,...,0,0,1,1,0,JoJo no Kimyou na Bouken: Steel Ball Run_MANGA,Originally presented unrelated story series la...,Wikipedia,346,"['presented', 'unrelated', 'story', 'series', ..."


In [6]:
tags_groupby = tags.groupby('title_id').agg({'tag_name':list}).reset_index()
tags_groupby.head(2)

Unnamed: 0,title_id,tag_name
0,1,"[Space, Crime, Episodic, Ensemble Cast, Primar..."
1,5,"[Terrorism, Primarily Adult Cast, Martial Arts..."


# Basic content-based filtering algorithm
We have used two basic approaches for building basic content-based filtering systems
- cosine similarity matrix of title-feature matrix
- cosine similarity matrix of title-latent factor matrix (using truncated SVD)

Features used are following:
- genres of titles
- synopsis
- tags

In [6]:
from sentence_transformers import SentenceTransformer

bert_base = 'bert-base-nli-mean-tokens'
all_mini = 'all-MiniLM-L6-v2'
sentence_t5 = 'Sentence-T5'

model = SentenceTransformer(all_mini)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [7]:
# Create a sentence embedding matrix with synopsis using all-MiniLM-L6-v2 model
sentence_embeddings = model.encode(titles.synopsis.fillna('').reset_index(drop= True))
sentence_embeddings.shape

(8786, 384)

In [9]:
# check sanity
wv = model.encode(['genius protagonists who are undefeatable'])
rec_df = pd.Series(cosine_similarity(wv, sentence_embeddings).flatten(), 
                   index = titles.title_id, 
                   name = 'similarity'
                  ).sort_values(ascending = False).to_frame()
rec_df.reset_index().merge(titles[['title_id','title_english','title_romaji','synopsis']], 
                           how = 'left', 
                           on = 'title_id').head(20)

Unnamed: 0,title_id,similarity,title_english,title_romaji,synopsis
0,102905,0.517188,,Tenseisha no Watashi ni Idonde Kuru Mubou de Y...,"“A prodigy at ten, a genius at fifteen, a comm..."
1,117343,0.501518,Talentless Nana,Munou na Nana,It is the year 20XX. Earth has been assaulted ...
2,124786,0.501246,Godzilla Singular Point,Godzilla: Singular Point,"This series features an original story, which ..."
3,132093,0.473711,Hero Killer,Hero Killer,"The world is in constant turmoil, with constan..."
4,136673,0.468811,,Wo Zai Yijie De Shi Shen Zhi Lu,The genius top student played games in an atte...
5,101132,0.467269,,Mi Yu Xingzhe,"A magician, a female doctor, a gangster, an ar..."
6,21459,0.458755,My Hero Academia,Boku no Hero Academia,What would the world be like if 80 percent of ...
7,120288,0.455014,"Spare Me, Great Lord!",Dawang Raoming,"This is the story of an orphan, Lü Shu. He is ..."
8,120220,0.455014,,Dawang Raoming,"This is the story of an orphan, Lü Shu. He is ..."
9,137757,0.453392,,Wanmei Shijie 2,Born into a unique world where villages fight ...


In [26]:
# now let's move on to creating a one-hot encoding for tags of the titles

tags_dummy = pd.get_dummies(tags.tag_name)
tags_dummy.index = tags.title_id
tags_dummy = tags_dummy.groupby(tags_dummy.index).sum()
tag_feature = tags_dummy.loc[lambda x : x.index.isin(titles.title_id)]
tag_feature.head()

Unnamed: 0_level_0,4-koma,Achromatic,Achronological Order,Acting,Adoption,Advertisement,Afterlife,Age Gap,Age Regression,Agender,...,Witch,Work,Wrestling,Writing,Wuxia,Yakuza,Yandere,Youkai,Yuri,Zombie
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# checking out the tags for the title 'Berserk'
tag_feature.loc[30002].loc[lambda x : x == 1].index

Index(['Achronological Order', 'Adoption', 'Age Regression', 'Amnesia',
       'Anti-Hero', 'Body Horror', 'Coming of Age', 'Cosmic Horror', 'Demons',
       'Disability', 'Elf', 'Ensemble Cast', 'Gods', 'Gore', 'Heterosexual',
       'Love Triangle', 'Magic', 'Male Protagonist', 'Mermaid', 'Military',
       'Nudity', 'Philosophy', 'Politics', 'Primarily Adult Cast', 'Rape',
       'Religion', 'Revenge', 'Sadism', 'Seinen', 'Ships', 'Slavery',
       'Swordplay', 'Tomboy', 'Torture', 'Tragedy', 'War', 'Witch'],
      dtype='object')

In [29]:
# combine genre, synopsis, tag to create title-feature matrix
title_features = pd.concat([titles.set_index('title_id')[genres], pd.DataFrame(sentence_embeddings, index = titles.title_id), tag_feature], axis = 1)
title_features.index = title_features.index.map(titles.set_index('title_id')['title_romaji'])
title_features.head()

Unnamed: 0_level_0,Action,Adventure,Comedy,Drama,Ecchi,Fantasy,Hentai,Horror,Mahou Shoujo,Mecha,...,Witch,Work,Wrestling,Writing,Wuxia,Yakuza,Yandere,Youkai,Yuri,Zombie
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cowboy Bebop,1,1,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Cowboy Bebop: Tengoku no Tobira,1,0,0,1,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRIGUN,1,1,1,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Witch Hunter ROBIN,1,0,0,1,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Eyeshield 21,1,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# create a simple cosine similarity matrix
title_sim = pd.DataFrame(cosine_similarity(title_features.fillna(0)), 
                        columns = title_features.index, index = title_features.index)

In [32]:
title_sim.head()

title_id,Cowboy Bebop,Cowboy Bebop: Tengoku no Tobira,TRIGUN,Witch Hunter ROBIN,Eyeshield 21,Hachimitsu to Clover,Hungry Heart: Wild Striker,Initial D FOURTH STAGE,MONSTER,NARUTO,...,Kamiya,Batsuichi de Nakimushi na Otonari-san Aratame,Geomsulmyeongga Mangnaeadeul,Ki ni Natteru Hito ga Otoko Janakatta,Ranma 1/2: Nettou-hen,Bad Thinking Diary,Ending Maker,Musingwihwallok,Academy Wijangchwieopdanghaetda,Sinhwageup Gwisok Item-eul Sone Neoeotda
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cowboy Bebop,1.0,0.663294,0.483569,0.253026,0.142706,0.139778,0.004105,0.217611,0.361715,0.272599,...,0.219949,0.13208,0.199388,0.070768,0.204892,0.064001,0.218649,0.13657,0.20087,0.166039
Cowboy Bebop: Tengoku no Tobira,0.663294,1.0,0.438579,0.294553,0.120147,0.166824,0.001664,0.252522,0.397407,0.19085,...,0.069257,0.157819,0.163026,0.086473,0.150901,0.076549,0.196495,0.091642,0.226323,0.13553
TRIGUN,0.483569,0.438579,1.0,0.189207,0.347738,0.226129,0.1634,0.257591,0.334297,0.290315,...,0.134442,0.16339,0.255567,0.002813,0.301951,0.0776,0.263223,0.176778,0.182582,0.202442
Witch Hunter ROBIN,0.253026,0.294553,0.189207,1.0,0.091361,0.211748,0.011318,0.248859,0.273017,0.070243,...,0.168908,0.205384,0.113509,0.103945,0.135536,0.180061,0.248083,0.111761,0.229075,0.249932
Eyeshield 21,0.142706,0.120147,0.347738,0.091361,1.0,0.268977,0.48327,0.322019,0.094167,0.510397,...,0.082923,0.107052,0.200625,0.095339,0.301731,0.013233,0.235648,0.107158,0.141863,0.1571


In [33]:
# to check sanity, filter out less popular titles
popular_title = titles.loc[lambda x : x.popularity > 10000].title_romaji.tolist()

In [34]:
# Let's check the performance of the first content-based filtering system

title_sim_popular = title_sim[popular_title]
title_sim_popular.loc['Berserk'].drop_duplicates().sort_values(ascending = False)[1:21].to_frame()
# For the title 'Berserk', the system pushed out some titles that give off 'dark' vibes, 
# such as 'Shingeki no Kyojin' or 'Devilman'

Unnamed: 0_level_0,Berserk
title_id,Unnamed: 1_level_1
Kenpuu Denki Berserk,0.651021
Berserk: Ougon Jidaihen III - Kourin,0.579788
Berserk: Ougon Jidaihen II - Doldrey Kouryaku,0.54823
Berserk: Ougon Jidaihen I - Haou no Tamago,0.524136
Hagane no Renkinjutsushi: FULLMETAL ALCHEMIST,0.521745
Vinland Saga,0.517599
Vinland Saga,0.517308
Shingeki no Kyojin,0.498971
Ousama Ranking,0.493657
Eden: It's an Endless World!,0.491587


In [35]:
title_sim_popular.loc['JoJo no Kimyou na Bouken: Steel Ball Run'].drop_duplicates().sort_values(ascending = False)[1:21].to_frame()

Unnamed: 0_level_0,JoJo no Kimyou na Bouken: Steel Ball Run
title_id,Unnamed: 1_level_1
JoJo no Kimyou na Bouken: Stardust Crusaders,0.58694
JoJo no Kimyou na Bouken: Ougon no Kaze,0.566295
JoJo no Kimyou na Bouken,0.563809
JoJo no Kimyou na Bouken: Ougon no Kaze,0.53882
JoJo no Kimyou na Bouken: Stardust Crusaders,0.538793
JoJo no Kimyou na Bouken: Diamond wa Kudakenai,0.519777
JoJo no Kimyou na Bouken: Stardust Crusaders - Egypt-hen,0.517349
JoJo no Kimyou na Bouken: JoJolion,0.50995
Baccano!,0.50546
JoJo no Kimyou na Bouken: Diamond wa Kudakenai,0.486964


In [36]:
title_sim_popular.loc['Boku no Hero Academia'].drop_duplicates().iloc[0].sort_values(ascending = False)[1:21].to_frame()

Unnamed: 0_level_0,Boku no Hero Academia
title_id,Unnamed: 1_level_1
Boku no Hero Academia,1.0
Boku no Hero Academia 2,0.77019
Boku no Hero Academia 3,0.736622
Boku no Hero Academia: Sukue! Kyuujo Kunren!,0.662832
Boku no Hero Academia 4,0.623499
Boku no Hero Academia 5,0.615133
Yuu☆Yuu☆Hakusho,0.575707
Yuu☆Yuu☆Hakusho,0.575707
Haikyuu!! 2nd Season,0.568966
Boku no Hero Academia: Ikinokore! Kesshi no Survival Kunren,0.568336


In [37]:
title_sim_popular.loc['Kidou Senshi Gundam Thunderbolt'].drop_duplicates().iloc[0].sort_values(ascending = False)[1:21].to_frame()

Unnamed: 0_level_0,Kidou Senshi Gundam Thunderbolt
title_id,Unnamed: 1_level_1
Kidou Senshi Gundam Thunderbolt,0.680371
Kidou Senshi Zeta Gundam,0.621149
Kidou Senshi Gundam SEED,0.603953
Kidou Senshi Gundam: THE ORIGIN,0.584826
Aldnoah.Zero 2,0.581852
Sidonia no Kishi,0.567957
Kidou Senshi Gundam: Tekketsu no Orphans,0.562216
Kidou Senshi Gundam 0080: Pocket no Naka no Sensou,0.559689
Choujikuu Yousai Macross,0.556502
All You Need Is Kill,0.555146


In [38]:
title_sim_popular.loc['Re:Zero kara Hajimeru Isekai Seikatsu'].drop_duplicates().iloc[0].sort_values(ascending = False)[1:21].to_frame()

Unnamed: 0_level_0,Re:Zero kara Hajimeru Isekai Seikatsu
title_id,Unnamed: 1_level_1
Re:Zero kara Hajimeru Isekai Seikatsu,1.0
Re:Zero kara Hajimeru Isekai Seikatsu,0.732957
Re:Zero kara Hajimeru Isekai Seikatsu,0.732957
Re:Zero kara Hajimeru Isekai Seikatsu 2nd Season,0.723427
Re:Zero kara Hajimeru Isekai Seikatsu 2nd Season Part 2,0.699005
Pandora Hearts,0.52724
Pandora Hearts,0.52724
Overlord,0.509686
Overlord,0.509686
Princess Connect! Re:Dive Season 2,0.46707


In [40]:
# now let us apply some more complicated approach, that is utilizing title - latent factor matrix
# to achieve the matrix, we will use truncated SVD method

from sklearn.decomposition import TruncatedSVD

n_topics = 200
lsi = TruncatedSVD(n_components=n_topics, random_state=0)

reduced_term_matrix = lsi.fit_transform(title_features.fillna(0))
sig = np.diag(lsi.singular_values_)

print(reduced_term_matrix.shape)
print(sig.shape)
print(lsi.components_.shape)



(8786, 200)
(200, 200)
(200, 732)


In [41]:
# now that we have title - latent factor matrix, we can create a cosine similarity matrix out of it
latent_mat = pd.DataFrame(reduced_term_matrix, index = titles['title_romaji'])
latent_sim = pd.DataFrame(cosine_similarity(latent_mat),
                         columns = title_features.index, index = title_features.index)

In [42]:
# to check sanity, filter out less popular titles
latent_sim_popular = latent_sim[popular_title]

In [48]:
# Let's check the performance 

latent_sim_popular.loc['Berserk'].drop_duplicates().sort_values(ascending = False)[1:11].to_frame('Similarity')
# For the title 'Berserk', the system pushed out similar titles as the first system

Unnamed: 0_level_0,Similarity
title_id,Unnamed: 1_level_1
Kenpuu Denki Berserk,0.709864
Berserk: Ougon Jidaihen III - Kourin,0.627843
Hagane no Renkinjutsushi: FULLMETAL ALCHEMIST,0.575783
Berserk: Ougon Jidaihen II - Doldrey Kouryaku,0.571022
Ousama Ranking,0.55769
Vinland Saga,0.555819
Vinland Saga,0.550611
Berserk: Ougon Jidaihen I - Haou no Tamago,0.54915
Shingeki no Kyojin,0.531167
DEVILMAN crybaby,0.530484


In [44]:
latent_sim_popular.loc['JoJo no Kimyou na Bouken: Steel Ball Run'].drop_duplicates().sort_values(ascending = False)[1:21].to_frame()

Unnamed: 0_level_0,JoJo no Kimyou na Bouken: Steel Ball Run
title_id,Unnamed: 1_level_1
JoJo no Kimyou na Bouken,0.613237
JoJo no Kimyou na Bouken: Stardust Crusaders,0.607264
JoJo no Kimyou na Bouken: Ougon no Kaze,0.604684
JoJo no Kimyou na Bouken: Stardust Crusaders,0.573343
JoJo no Kimyou na Bouken: Ougon no Kaze,0.564239
JoJo no Kimyou na Bouken: Stardust Crusaders - Egypt-hen,0.550815
Baccano!,0.548983
JoJo no Kimyou na Bouken: JoJolion,0.546865
JoJo no Kimyou na Bouken: Diamond wa Kudakenai,0.544341
Bungou Stray Dogs,0.532176


In [45]:
latent_sim_popular.loc['Boku no Hero Academia'].iloc[0].sort_values(ascending = False)[1:21].to_frame()

Unnamed: 0_level_0,Boku no Hero Academia
title_id,Unnamed: 1_level_1
Boku no Hero Academia,1.0
Boku no Hero Academia 2,0.823521
Boku no Hero Academia 3,0.764677
Boku no Hero Academia: Sukue! Kyuujo Kunren!,0.681276
Boku no Hero Academia 4,0.668483
Boku no Hero Academia 5,0.635776
Haikyuu!! 2nd Season,0.603719
Yuu☆Yuu☆Hakusho,0.603688
Yuu☆Yuu☆Hakusho,0.603688
Boku no Hero Academia: Ikinokore! Kesshi no Survival Kunren,0.593661


In [46]:
latent_sim_popular.loc['Re:Zero kara Hajimeru Isekai Seikatsu'].iloc[0].drop_duplicates().sort_values(ascending = False)[1:21].to_frame()

Unnamed: 0_level_0,Re:Zero kara Hajimeru Isekai Seikatsu
title_id,Unnamed: 1_level_1
Re:Zero kara Hajimeru Isekai Seikatsu,0.753232
Re:Zero kara Hajimeru Isekai Seikatsu 2nd Season,0.748201
Re:Zero kara Hajimeru Isekai Seikatsu 2nd Season Part 2,0.715848
Pandora Hearts,0.544374
Overlord,0.544139
Genjitsu Shugi Yuusha no Oukoku Saikenki Part 2,0.499795
Overlord II,0.488819
Dog Days,0.481825
Princess Connect! Re:Dive Season 2,0.480134
Shokei Shoujo no Virgin Road,0.478898
