In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('asian_dramas.csv')

In [3]:
del df['Unnamed: 0']
df['title'] = df['title'].str.strip()
df['title'] = df['title'].str.lower()

In [4]:
description = df['description'].to_list()
def clean_description(text):
    text = re.sub(r'\n+', ' ', text)  # Replace multiple newlines with a space
    text = re.sub(r'\(Source:.*?\)', '', text)  # Remove "(Source: ...)"
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces  
    text = text.split("~~")[0].strip()
    text = text.replace('Edit Translation', ' ') 
    text = text.replace('\"', '')
    return text  

In [5]:
cleaned = [clean_description(i) for i in description]
cleaned_series = pd.Series(cleaned)
df['description'] = cleaned_series
df['description'] = df['description'].replace(' ', 'no description')

In [6]:
descriptions = df['description']

In [None]:
app_df = pd.DataFrame(df['title']).join(pd.DataFrame(df['country'])).join(pd.DataFrame(df['year']))

In [62]:
title_reversed = pd.Series(app_df['title'].index, index= app_df['title'])

In [63]:
title_reversed['oasis']

10177

In [None]:
mat = pd.DataFrame()


#### RECOMMENDATION BY DESCRIPTION

In [9]:
desc_tv = TfidfVectorizer()
transformed_desc = desc_tv.fit_transform(descriptions) 
desc_sim = linear_kernel(transformed_desc, transformed_desc)

In [10]:
def get_recommendation_bydesc(title, desc_sim = desc_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(desc_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:10]
    rec_indices = [x[0] for x in top_n ]
    sim = [x[1] for x in top_n]
    rec = pd.Series(list(df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity']
    return data

    

In [11]:
get_recommendation_bydesc('the glory')

Unnamed: 0,titles,country,year,similarity
0,the glory,Korean Drama,2022,1.0
1,aim high,Korean Drama,2017,0.258481
2,jungle fish 2,Korean Drama,2010,0.25309
3,kodomo keishi,Japanese Drama,2013,0.249275
4,ugly cake,Korean Drama,2012,0.234017
5,biscuit teacher and star candy,Korean Drama,2005,0.224728
6,himitsu no kankei - sensei wa doukyonin,Japanese Drama,2011,0.21719
7,boys over flowers,Korean Drama,2009,0.212546
8,with you,Chinese Drama,2016,0.210158
9,pyramid game,Korean Drama,2024,0.209708


#### RECOMMENDATION BY TAGS

In [12]:
tags = df['tags']
tags = tags.fillna('no tag')
tags = tags.str.strip()
tags = tags.str.lower()
tags = tags.str.replace(' ', '')



In [13]:
tag_cv = CountVectorizer()
tag_transformed = tag_cv.fit_transform(tags)
tag_sim = cosine_similarity(tag_transformed, tag_transformed)

In [14]:
def get_recommendation_bytags(title, tag_sim = tag_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(tag_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:20]
    rec_indices = [x[0] for x in top_n ]
    sim = [round(((x[1])*100), 2) for x in top_n]
    rec = pd.Series(list(df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity']
    return data


In [15]:
get_recommendation_bytags('hospital playlist')

Unnamed: 0,titles,country,year,similarity
0,hospital playlist,Korean Drama,2020,100.0
1,love is back,Chinese Drama,2014,54.77
2,soulmate,Korean Drama,2006,51.64
3,the gentlemen of wolgyesu tailor shop,Korean Drama,2016,51.64
4,flower ever after,Korean Drama,2018,47.81
5,men with sword,Chinese Drama,2016,47.67
6,a little mood for love,Chinese Drama,2021,44.72
7,along with me,Chinese Drama,2023,44.72
8,why get married,Chinese Drama,2016,42.43
9,if time flows back,Chinese Drama,2020,42.43


##### RECOMMENDATION BY GENRE

In [16]:
genre = df['genres']
genre = genre.fillna('no genre')
genre = genre.str.replace(' ', '')
genre = genre.str.lower()
genre = genre.str.strip()

In [17]:
genre_cv = CountVectorizer()
gen_transformed = genre_cv.fit_transform(genre)
gen_sim = cosine_similarity(gen_transformed, gen_transformed)

In [18]:
def get_recommendation_bygenre(title, gen_sim = gen_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(gen_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:20]
    rec_indices = [x[0] for x in top_n ]
    sim = [round(((x[1])*100), 2) for x in top_n]
    rec = pd.Series(list(df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity(%)']
    return data


In [None]:
get_recommendation_bygenre('extracurricular')

#### RECOMMENDATION BY CAST

In [20]:
cast = df['cast']
cast = cast.fillna('no cast')
cast = cast.str.replace(' ', '')
cast = cast.str.lower()
cast = cast.str.strip()

In [21]:
cast_cv = CountVectorizer()
cast_transformed = cast_cv.fit_transform(cast)
cast_sim = cosine_similarity(cast_transformed, cast_transformed)

In [22]:
def get_recommendation_bycast(title, cast_sim = cast_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(cast_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:20]
    rec_indices = [x[0] for x in top_n ]
    sim = [round(((x[1])*100), 2) for x in top_n]
    rec = pd.Series(list(df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity']
    return data

In [None]:
get_recommendation_bycast('extracurricular')

In [24]:
combined_feature = tags + genre + cast

In [25]:
cf_cv = CountVectorizer()
cf_transformed = cf_cv.fit_transform(combined_feature)
cf_sim = cosine_similarity(cf_transformed, cf_transformed)

In [53]:
def get_recommendation_bycf(title, cf_sim = cf_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(cf_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:20]
    rec_indices = [x[0] for x in top_n ]
    sim = [round(((x[1])*100), 2) for x in top_n]
    rec = pd.Series(list(df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity']
    return data.to_html()

In [54]:
get_recommendation_bycf('hospital playlist')

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>titles</th>\n      <th>country</th>\n      <th>year</th>\n      <th>similarity</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>hospital playlist</td>\n      <td>Korean Drama</td>\n      <td>2020</td>\n      <td>100.00</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>hospital playlist season 2</td>\n      <td>Korean Drama</td>\n      <td>2021</td>\n      <td>61.11</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>revive</td>\n      <td>Chinese Drama</td>\n      <td>2016</td>\n      <td>29.46</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>pride</td>\n      <td>Japanese Drama</td>\n      <td>2004</td>\n      <td>28.58</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>men with sword</td>\n      <td>Chinese Drama</td>\n      <td>2016</td>\n      <td>28.58</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>graceful friends

In [69]:
app_df['country'] = app_df['country'].str.replace('Drama', '').str.strip().str.lower()

In [56]:
def get_recommendation_bycf(title, cf_sim = cf_sim):
    idx = title_reversed[title]
    rec_sim = list(enumerate(cf_sim[idx]))
    rec_sort = sorted(rec_sim, key= lambda x: x[1], reverse=True)
    top_n = rec_sort[0:20]
    rec_indices = [x[0] for x in top_n ]
    sim = [round(((x[1])*100), 2) for x in top_n]
    rec = pd.Series(list(app_df['title'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(app_df['country'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(app_df['year'].iloc[rec_indices])).to_frame()
    sim_df = pd.Series(sim).to_frame()
    data = pd.concat([rec, country, year, sim_df], axis= 1)
    data.columns = ['titles', 'country' , 'year' , 'similarity']
    return data

In [61]:
get_recommendation_bycf('hellbound')

Unnamed: 0,titles,country,year,similarity
0,hellbound,Korean Drama,2021,100.0
1,blind spot,Chinese Drama,2014,35.53
2,sweet home,Korean Drama,2020,27.04
3,mysterious summer,Chinese Drama,2014,25.2
4,strange school tales,Korean Drama,2020,25.0
5,higurashi no naku koro ni kai,Japanese Drama,2016,24.34
6,higurashi no naku koro ni,Japanese Drama,2016,23.57
7,demon out of chang an,Chinese Drama,2016,23.57
8,kamen rider amazons,Japanese Drama,2016,22.87
9,alive,Korean Drama,2019,22.87


In [None]:
import joblib
joblib.dump(cf_sim, open('cf_sim.pkl', 'wb'), protocol= 3, compress= 7)


In [70]:
joblib.dump(app_df, open('app_df.pkl', 'wb'))

In [None]:
pkl = joblib.load('app_df.pkl')
skskk = joblib.load('cf_sim.pkl')

In [73]:
app_df[app_df['title'] == 'first love']

Unnamed: 0,title,country,year
398,first love,japanese,2002
561,first love,korean,2003
2700,first love,chinese,2011
4851,first love,chinese,2016
9167,first love,chinese,2022
9737,first love,chinese,2022
11979,first love,korean,2024


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.12262787, ..., 0.06681531, 0.        ,
        0.        ],
       [0.        , 0.12262787, 1.        , ..., 0.11470787, 0.        ,
        0.        ],
       ...,
       [0.        , 0.06681531, 0.11470787, ..., 1.        , 0.125     ,
        0.09449112],
       [0.        , 0.        , 0.        , ..., 0.125     , 1.        ,
        0.18898224],
       [0.        , 0.        , 0.        , ..., 0.09449112, 0.18898224,
        1.        ]])