# How to tweak the baseline content-based filtering algorithm with the ContentBasedFiltering class

In [1]:
# Make the directory that holds the cbfilter module importable.
# NOTE(review): '/home/.Import' is an absolute, machine-specific path — adjust it
# to wherever the cbfilter module lives before running this notebook elsewhere.
import sys
sys.path.append('/home/.Import')

# Load the ContentBasedFiltering class from the cbfilter module.
from cbfilter import ContentBasedFiltering

In [2]:
import pandas as pd
import numpy as np

This time, we will use only the tags data as features for the system.

In [3]:
# Read the tag assignments and the cleaned title list from the assets folder.
tags = pd.read_csv('assets/tags_200p.csv')
titles = pd.read_csv('assets/titles_200p_cleaned.csv')

# Preview the first two tag rows.
tags.head(n=2)

Unnamed: 0,tag_id,tag_name,tag_category,tag_rank,title_id,title_english,title_romaji
0,85,Tragedy,Theme-Drama,96,30002,Berserk,Berserk
1,50,Seinen,Demographic,95,30002,Berserk,Berserk


To use the ContentBasedFiltering class, you should first create a title-feature dataframe.

A title-feature dataframe is a Pandas DataFrame object that has title_ids as its index and their features as its columns.

In [4]:
# Build the title-feature dataframe from the tags data:
#   1. one-hot encode the tag name of every tag row,
#   2. label each row with the title_id it belongs to,
#   3. add up the rows of each title so there is one row per title_id.
tags_dummy = (
    pd.get_dummies(tags['tag_name'])
    .set_index(tags['title_id'])
    .groupby(level=0)
    .sum()
)

# Keep only the titles that are present in the cleaned titles table.
title_feature = tags_dummy.loc[tags_dummy.index.isin(titles['title_id'])]
title_feature.head()

Unnamed: 0_level_0,4-koma,Achromatic,Achronological Order,Acting,Adoption,Advertisement,Afterlife,Age Gap,Age Regression,Agender,...,Witch,Work,Wrestling,Writing,Wuxia,Yakuza,Yandere,Youkai,Yuri,Zombie
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Now that the title-feature dataframe is ready, instantiate the ContentBasedFiltering class.
# The object is created with no arguments here; the title-feature dataframe is passed to it in a later cell.

cbf = ContentBasedFiltering()

Now we will build a similarity matrix inside the ContentBasedFiltering object from the title-feature dataframe.

There are three ways to calculate the similarity:
- cosine similarity
- euclidean distance
- manhattan distance

This time we will go with cosine similarity.

In [19]:
# Use the create_sim_mat method to build the similarity matrix.
# The string below is a reference note describing the method; as a bare string
# expression it has no effect when the cell runs.
"""
Create a similarity matrix with a title-feature dataframe using chosen method.
a title-feature dataframe should be formatted as follows:
- title_ids in the indexes
- features(e.g., genres, synopsis, etc.) in the columns

*parameters
- df(Pandas DataFrame object): title-feature dataframe
- method(String): ['cosine_similarity', ' manhattan_distances', 'euclidean_distances']

*attributes
- self.sim_mat(Pandas DataFrame object): similarity matrix created from title-feature dataframe
"""

# NOTE(review): the leading space in ' manhattan_distances' in the note above looks
# like a typo — confirm the accepted method names against the cbfilter module.
# Build the matrix from the tag-based title-feature dataframe with cosine similarity.
cbf.create_sim_mat(title_feature, method = 'cosine_similarity')

In [20]:
# Check the created similarity matrix: show its first rows, then print its shape.
# In the recorded output the matrix is square (8679 x 8679), indexed by title_id on both axes.
display(cbf.sim_mat.head())
print(cbf.sim_mat.shape)

title_id,1,5,6,7,15,16,17,18,19,20,...,148312,149291,149332,149544,149939,150109,150170,150319,150836,151025
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.665912,0.405096,0.196116,0.118262,0.118262,0.0,0.138675,0.393863,0.269953,...,0.236525,0.196116,0.098058,0.080064,0.209657,0.0,0.160128,0.0,0.124035,0.069338
5,0.665912,1.0,0.375735,0.16169,0.073127,0.146254,0.0,0.171499,0.398527,0.166924,...,0.073127,0.242536,0.121268,0.099015,0.129641,0.0,0.19803,0.0,0.230089,0.085749
6,0.405096,0.375735,1.0,0.086066,0.3114,0.1557,0.129099,0.182574,0.377124,0.23694,...,0.1557,0.258199,0.129099,0.0,0.276026,0.0,0.105409,0.0,0.08165,0.091287
7,0.196116,0.16169,0.086066,1.0,0.0,0.201008,0.0,0.117851,0.243432,0.0,...,0.201008,0.333333,0.0,0.136083,0.089087,0.136083,0.272166,0.0,0.210819,0.235702
15,0.118262,0.073127,0.3114,0.0,1.0,0.272727,0.452267,0.213201,0.110096,0.4842,...,0.090909,0.150756,0.150756,0.123091,0.241747,0.0,0.123091,0.0,0.095346,0.1066


(8679, 8679)


To manually check the sanity of the system's performance, let's take some sample titles and inspect the similar titles the system returns for them.

In [21]:
# Reference note for check_sanity; as a bare string expression it has no effect
# when the cell runs.
"""
Check the sanity of the system with the chosen title_id.
The system will push similar titles to given title_id

*parameters
- title_id(Integer): title_id of a title
- max_num(Integer): number of titles the system will push
- in_romaji(Boolean): if True, the result will be presented with title_romajis instead of title_ids
- only_popular(Boolean): if True, the system will push titles whose popularity exceeds 10,000

*return
- sim_rank(Pandas DataFrame object): a list of similar titles to given title_id sorted by similarity
"""

# Request 10 similar titles for title_id 30002 ('Berserk' in the recorded output),
# labelled by romaji title and restricted to popular titles.
# NOTE(review): with in_romaji = True the same label can appear more than once
# (e.g. 'Vinland Saga' in the recorded output) — presumably distinct title_ids that
# share a romaji title; verify against the titles data.
cbf.check_sanity(title_id = 30002, max_num = 10, in_romaji = True, only_popular = True)

Unnamed: 0_level_0,Berserk
title_id,Unnamed: 1_level_1
Kenpuu Denki Berserk,0.62137
Berserk: Ougon Jidaihen III - Kourin,0.534297
Berserk: Ougon Jidaihen I - Haou no Tamago,0.509372
Vinland Saga,0.501958
Berserk: Ougon Jidaihen II - Doldrey Kouryaku,0.501557
Vinland Saga,0.497096
Shingeki no Kyojin,0.495682
Hagane no Renkinjutsushi: FULLMETAL ALCHEMIST,0.495682
Ginga Eiyuu Densetsu,0.490304
Eden: It's an Endless World!,0.480043


In [22]:
# Repeat the sanity check for a second title: title_id 30001 ('MONSTER' in the recorded output).
cbf.check_sanity(title_id = 30001, max_num = 10, in_romaji = True, only_popular = True)

Unnamed: 0_level_0,MONSTER
title_id,Unnamed: 1_level_1
MONSTER,0.819892
20th Century Boys,0.513265
Bungou Stray Dogs,0.505181
Tokyo Ghoul:re,0.502079
Babylon,0.489979
Innocent,0.481125
Vanitas no Carte,0.468191
Billy Bat,0.456435
Lupin III: Cagliostro no Shiro,0.451848
Odd Taxi,0.447214
