In [159]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
df = pd.read_csv('asian_dramas.csv')

In [26]:
del df['Unnamed: 0']
df['title'] = df['title'].str.strip()
df['title'] = df['title'].str.lower()

In [27]:
description = df['description'].to_list()
def clean_description(text):
    text = re.sub(r'\n+', ' ', text)  # Replace multiple newlines with a space
    text = re.sub(r'\(Source:.*?\)', '', text)  # Remove "(Source: ...)"
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces  
    text = text.split("~~")[0].strip()
    text = text.replace('Edit Translation', ' ') 
    text = text.replace('\"', '')
    return text  

In [None]:
cleaned = [clean_description(i) for i in description]
cleaned_series = pd.Series(cleaned)
df['description'] = cleaned_series
df['description'] = df['description'].replace(' ', 'no description')

In [178]:
descriptions = df['description']

In [42]:
title_reversed = pd.Series(df['title'].index, index= df['title'])

In [46]:
title_reversed['oasis']

10177

#### RECOMMENDATION BY DESCRIPTION

In [179]:
desc_tv = TfidfVectorizer()
transformed_desc = desc_tv.fit_transform(descriptions) 
desc_sim = linear_kernel(transformed_desc, transformed_desc)

In [180]:
def get_recommendation_bydesc(title, desc_sim = desc_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(desc_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:10]
    rec_indices = [x[0] for x in top_n ]
    sim = [x[1] for x in top_n]
    rec = pd.Series(list(df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity']
    return data

    

In [103]:
get_recommendation_bydesc('the glory')

Unnamed: 0,titles,country,year,similarity
0,the glory,Korean Drama,2022,1.0
1,aim high,Korean Drama,2017,0.258481
2,jungle fish 2,Korean Drama,2010,0.25309
3,kodomo keishi,Japanese Drama,2013,0.249275
4,ugly cake,Korean Drama,2012,0.234017
5,biscuit teacher and star candy,Korean Drama,2005,0.224728
6,himitsu no kankei - sensei wa doukyonin,Japanese Drama,2011,0.21719
7,boys over flowers,Korean Drama,2009,0.212546
8,with you,Chinese Drama,2016,0.210158
9,pyramid game,Korean Drama,2024,0.209708


#### RECOMMENDATION BY TAGS

In [None]:
tags = df['tags']
tags = tags.fillna('no tag')
tags = tags.str.strip()
tags = tags.str.lower()
tags = tags.str.replace(' ', '')



tags
notag                                                                                                                                     923
miniseries                                                                                                                                298
shortlengthseries,webseries                                                                                                               142
shortlengthseries,filmedvertically,webseries                                                                                              110
shortlengthseries,miniseries,webseries                                                                                                     68
                                                                                                                                         ... 
adaptedfromamanga,restaurantsetting,waitressfemalelead,cheffemalelead,miniseries                                                            1
b

In [99]:
tag_cv = CountVectorizer()
tag_transformed = tag_cv.fit_transform(tags)
tag_sim = cosine_similarity(tag_transformed, tag_transformed)

In [169]:
def get_recommendation_bytags(title, tag_sim = tag_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(tag_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:20]
    rec_indices = [x[0] for x in top_n ]
    sim = [round(((x[1])*100), 2) for x in top_n]
    rec = pd.Series(list(df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity']
    return data


In [171]:
get_recommendation_bytags('hospital playlist')

Unnamed: 0,titles,country,year,similarity
0,hospital playlist,Korean Drama,2020,100.0
1,love is back,Chinese Drama,2014,54.77
2,soulmate,Korean Drama,2006,51.64
3,the gentlemen of wolgyesu tailor shop,Korean Drama,2016,51.64
4,flower ever after,Korean Drama,2018,47.81
5,men with sword,Chinese Drama,2016,47.67
6,a little mood for love,Chinese Drama,2021,44.72
7,along with me,Chinese Drama,2023,44.72
8,why get married,Chinese Drama,2016,42.43
9,if time flows back,Chinese Drama,2020,42.43


##### RECOMMENDATION BY GENRE

In [125]:
genre = df['genres']
genre = genre.fillna('no genre')
genre = genre.str.replace(' ', '')
genre = genre.str.lower()
genre = genre.str.strip()

In [128]:
genre_cv = CountVectorizer()
gen_transformed = genre_cv.fit_transform(genre)
gen_sim = cosine_similarity(gen_transformed, gen_transformed)

In [165]:
def get_recommendation_bygenre(title, gen_sim = gen_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(gen_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:20]
    rec_indices = [x[0] for x in top_n ]
    sim = [round(((x[1])*100), 2) for x in top_n]
    rec = pd.Series(list(df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity(%)']
    return data


In [166]:
get_recommendation_bygenre('extracurricular')

Unnamed: 0,titles,country,year,similarity(%)
0,extracurricular,Korean Drama,2020,100.0
1,life,Japanese Drama,2007,86.6
2,yume wo ataeru,Japanese Drama,2015,86.6
3,hokuto,Japanese Drama,2017,86.6
4,kakegurui,Japanese Drama,2018,86.6
5,kakegurui season 2,Japanese Drama,2019,86.6
6,yuri dano kan dano,Japanese Drama,2019,86.6
7,hope or dope,Korean Drama,2022,86.6
8,95,Japanese Drama,2024,86.6
9,hidamari no basho 〜 hatsukoi 〜,Japanese Drama,2010,75.0


#### RECOMMENDATION BY CAST

In [133]:
cast = df['cast']
cast = cast.fillna('no cast')
cast = cast.str.replace(' ', '')
cast = cast.str.lower()
cast = cast.str.strip()

In [134]:
cast_cv = CountVectorizer()
cast_transformed = cast_cv.fit_transform(cast)
cast_sim = cosine_similarity(cast_transformed, cast_transformed)

In [167]:
def get_recommendation_bycast(title, cast_sim = cast_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(cast_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:20]
    rec_indices = [x[0] for x in top_n ]
    sim = [round(((x[1])*100), 2) for x in top_n]
    rec = pd.Series(list(df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity']
    return data

In [168]:
get_recommendation_bycast('extracurricular')

Unnamed: 0,titles,country,year,similarity
0,extracurricular,Korean Drama,2020,100.0
1,surf 101: intro to romance,Korean Drama,2018,20.41
2,want more 19,Korean Drama,2018,18.26
3,numbers,Korean Drama,2023,18.26
4,legends of love,Korean Drama,2000,16.67
5,attic cat,Korean Drama,2003,16.67
6,south of the sun,Korean Drama,2003,16.67
7,she is nineteen,Korean Drama,2004,16.67
8,han river ballad,Korean Drama,2004,16.67
9,that summer typhoon,Korean Drama,2005,16.67


In [189]:
combined_feature = tags + genre + cast

In [190]:
cf_cv = CountVectorizer()
cf_transformed = cf_cv.fit_transform(combined_feature)
cf_sim = cosine_similarity(cf_transformed, cf_transformed)

In [191]:
def get_recommendation_bycf(title, cf_sim = cf_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(cf_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:20]
    rec_indices = [x[0] for x in top_n ]
    sim = [round(((x[1])*100), 2) for x in top_n]
    rec = pd.Series(list(df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity']
    return data

In [196]:
get_recommendation_bycf('hospital playlist')

Unnamed: 0,titles,country,year,similarity
0,hospital playlist,Korean Drama,2020,100.0
1,hospital playlist season 2,Korean Drama,2021,61.11
2,revive,Chinese Drama,2016,29.46
3,pride,Japanese Drama,2004,28.58
4,men with sword,Chinese Drama,2016,28.58
5,graceful friends,Korean Drama,2020,28.58
6,racket boys,Korean Drama,2021,27.78
7,run on,Korean Drama,2020,27.04
8,the gentlemen of wolgyesu tailor shop,Korean Drama,2016,25.2
9,flower ever after,Korean Drama,2018,25.2
