In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('asianmovies.csv')

In [3]:
df.tail()

Unnamed: 0.1,Unnamed: 0,titles,links,user_rating,country,year,screenwriter,director,description,tags,cast,genres,user_count,content_rating
4965,4965,The Furthest End Awaits,https://mydramalist.com/8185-the-furthest-end-...,7.5,Japanese Movie,2015,,Drama,Misaki Yoshida runs a roasting coffee shop in ...,,"Sasaki Nozomi,Nagasaku Hiromi,Sakurada Hiyori,...",Drama,38,G
4966,4966,Contact Point,https://mydramalist.com/753259-contact-point,7.4,Korean Movie,2014,,"Web Movie,Short Film",Sun Woo receives a call from his ex-girlfriend...,"Web Movie,Short Film","Son Suk Ku,Choi Hee Seo","Romance,Drama",12,NR
4967,4967,Isao Takahata and His Tale of the Princess Kaguya,https://mydramalist.com/728987-isao-takahata-a...,7.5,Japanese Movie,2015,,,For his first film in fourteen years animation...,,"Miyazaki Hayao,Asakura Aki,Chii Takeo,Miyamoto...",Documentary,15,NR
4968,4968,Beautiful Legacy,https://mydramalist.com/7864-beautiful-legacy,7.5,Korean Movie,2011,,,"Father is dead!\nSoo Jeong, Min Seong, and Kye...",,"Kim Min Soo,Jeon Moo Song,Lim Hyun Sung,Dong H...","Romance,Drama,Melodrama",8,NR
4969,4969,Orpheus' Lyre,https://mydramalist.com/6958-orpheus-lyre,7.5,Japanese Movie,2013,,"Attempted Suicide,Adapted From A Novel",Yoko lost her young daughter Kanako through a...,"Attempted Suicide,Adapted From A Novel","Fukuda Mayuko,Inagaki Goro,Hirosue Ryoko,Enami...",Drama,12,NR


In [4]:
#df.query('year == 2023 and country == "Japanese Movie "')

In [5]:
del df['Unnamed: 0']
df['titles'] = df['titles'].str.strip()
df['titles'] = df['titles'].str.lower()

In [6]:
description = df['description'].to_list()

In [7]:
def clean_description(text):
    text = re.sub(r'\n+', ' ', text)  # Replace multiple newlines with a space
    text = re.sub(r'\(Source:.*?\)', '', text)  # Remove "(Source: ...)"
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces  
    text = text.split("~~")[0].strip()
    text = text.replace('Edit Translation', '') 
    text = text.replace('\"', '')
    return text  

In [8]:
cleaned = [clean_description(i) for i in description]
cleaned_series = pd.Series(cleaned)
df['description'] = cleaned_series

In [9]:
df[df['director'] == df['tags']]

Unnamed: 0,titles,links,user_rating,country,year,screenwriter,director,description,tags,cast,genres,user_count,content_rating
51,legend of bruce lee,https://mydramalist.com/73047-legend-of-bruce-lee,8.9,Hong Kong Movie,1976,,"Inspired By A True Story,Kung Fu","Bruce Li stars in this Bruce Lee biopic, very ...","Inspired By A True Story,Kung Fu","Bruce Li,Ko Hsiao Pao,Lung Fei,Wei Ping Ao,Sha...","Action,Drama,Martial Arts",3,NR
54,pink dream,https://mydramalist.com/752271-pink-dream,8.4,Chinese Movie,1932,,"Writer Male Lead,Silent Film,Black And White,I...",Young novelist who is supported by a loving an...,"Writer Male Lead,Silent Film,Black And White,I...","Tam Ying,Zheng Jun Li","Historical,Romance,Drama",3,NR
57,mishima: a life in four chapters,https://mydramalist.com/771323-mishima-a-life-...,8.1,Japanese Movie,1985,,"Nationalism,Co-produced,Art House Film,Patriot...",A fictional account of the life of Japanese au...,"Nationalism,Co-produced,Art House Film,Patriot...","Ogata Ken,Hirata Mitsuru,Nagashima Toshiyuki,M...","Historical,Drama,Political",10,R
89,the adventures of milo and otis,https://mydramalist.com/20650-the-adventures-o...,8.0,Japanese Movie,1986,,"Cat,Dog,Animal,Friendship",The adventures of a young cat and a dog as the...,"Cat,Dog,Animal,Friendship","Tanikawa Shuntaro,Sakaguchi Takeharu","Adventure,Drama,Family",24,NR
94,violent panic: the big crash,https://mydramalist.com/708745-violent-panic-t...,8.0,Japanese Movie,1976,,"Robber Male Lead,Robbery,Heist,Gang","Takashi, a bank robber, dreams of his final he...","Robber Male Lead,Robbery,Heist,Gang","Watase Tsunehiko,Sugimoto Miki,Watanabe Yayoi,...","Action,Crime",3,NR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4947,fighting,https://mydramalist.com/9636-fighting,7.5,Chinese Movie,2014,,"Athlete Male Lead,Suspense",The film centres around a gifted young athlete...,"Athlete Male Lead,Suspense","Peter Ho,Lu Yi,Wu Ma,Zhang Xiao Jun,Wei Yi,Qi Ke","Action,Thriller,Fantasy",15,NR
4950,cats and dogs,https://mydramalist.com/14040-cats-and-dogs,7.5,Korean Movie,2013,,"Dog,Cat,Animal","A dog-like guy keeping a cat, a cat-like girl ...","Dog,Cat,Animal","Son Min Ji,Shin Myung Geun,Baek Jae Ho","Comedy,Romance,Drama,Melodrama",3,NR
4953,my 1876,https://mydramalist.com/59007-my-1876,7.5,Chinese Movie,2013,,Short Film,,Short Film,"Feng Shao Feng,Gao Yuan Yuan","Romance,Drama",3,NR
4966,contact point,https://mydramalist.com/753259-contact-point,7.4,Korean Movie,2014,,"Web Movie,Short Film",Sun Woo receives a call from his ex-girlfriend...,"Web Movie,Short Film","Son Suk Ku,Choi Hee Seo","Romance,Drama",12,NR


In [None]:
df.query('director == genres')

In [11]:
reversed = pd.Series(df.index, index= df['titles'])

#### MOVIE RECOMMENDATION BY TAGS

In [12]:
df['tags'].fillna(df['genres'])

0       Animal,Dog,Based On A True Story,Professor Mal...
1       Village Setting,Samurai Male Lead,Male Centere...
2       Samurai Male Lead,Bushido,Japanese New Wave,Hi...
3       Terminal Illness,Bureaucracy,Social Commentary...
4       Father-Son Relationship,Mental Illness,Power S...
                              ...                        
4965                                                Drama
4966                                 Web Movie,Short Film
4967                                          Documentary
4968                              Romance,Drama,Melodrama
4969               Attempted Suicide,Adapted From A Novel
Name: tags, Length: 4970, dtype: object

In [13]:
tags = df['tags']

In [14]:
tags_na_removed = df['tags'].dropna(ignore_index=True)

In [15]:
tags = tags.fillna('No tags')

In [16]:
tags = tags.str.replace(' ', '')
tags = tags.str.lower()

In [17]:
cv = CountVectorizer()
transformed_tags = cv.fit_transform(tags)
sim = cosine_similarity(transformed_tags, transformed_tags)

In [18]:
transformed_tags.shape

(4970, 2784)

In [19]:
def get_recommendation_bytag(title, sim = sim):
    idx = reversed[title]
    cos_score = list(enumerate(sim[idx]))
    cos_score = sorted(cos_score, key=lambda x:x[1], reverse=True)
    first_10 = cos_score[0:20]
    rec_indices = [x[0] for x in first_10]
    rec_sim = [x[1] for x in first_10]
    soft = pd.Series(list(df['titles'].iloc[rec_indices])).to_frame()
    hot = pd.Series(rec_sim).to_frame()
    dff = pd.concat((soft, hot), axis= 1)
    dff.columns=['titles', 'similarity']
    return dff

In [20]:
get_recommendation_bytag('lighting up the stars')

Unnamed: 0,titles,similarity
0,lighting up the stars,1.0
1,papa,0.596285
2,salut d'amour,0.596285
3,dad for rent,0.596285
4,moratorium tamako,0.547723
5,step,0.527046
6,an autumn afternoon,0.516398
7,sori: voice from the heart,0.516398
8,oi handsome!!,0.516398
9,a day with my son,0.516398


In [21]:
get_recommendation_bytag('lighting up the stars')

Unnamed: 0,titles,similarity
0,lighting up the stars,1.0
1,papa,0.596285
2,salut d'amour,0.596285
3,dad for rent,0.596285
4,moratorium tamako,0.547723
5,step,0.527046
6,an autumn afternoon,0.516398
7,sori: voice from the heart,0.516398
8,oi handsome!!,0.516398
9,a day with my son,0.516398


#### RECOMMENDATION USING MOVIE DESCRIPTION

In [22]:
tv = CountVectorizer(stop_words= 'english')
transformed = tv.fit_transform(cleaned_series)
cos = cosine_similarity(transformed, transformed)

In [23]:
tv = TfidfVectorizer(stop_words='english')
transformed = tv.fit_transform(cleaned_series)
cos = linear_kernel(transformed, transformed)

In [24]:
CountVectorizer()

In [25]:
transformed.shape

(4970, 21713)

In [26]:
reversed[reversed== 2709]

titles
monster    2709
dtype: int64

In [27]:
#idx = reversed[title]
cos_score = list(enumerate(cos[2709]))
cos_score = sorted(cos_score, key=lambda x:x[1], reverse=True)
first_10 = cos_score[0:10]
rec_indices = [x[0] for x in first_10]
rec_sim = [x[1] for x in first_10]
soft = pd.Series(list(df['titles'].iloc[rec_indices])).to_frame()
hot = pd.Series(rec_sim).to_frame()
dff = pd.concat((soft, hot), axis= 1)
dff.columns=['titles', 'similarity']
dff

Unnamed: 0,titles,similarity
0,monster,1.0
1,silenced,0.226271
2,shichinin no tomurai,0.170784
3,lost love,0.152134
4,school excursion,0.143379
5,sansho the bailiff,0.140129
6,at home,0.134022
7,tokyo family,0.129332
8,kuchisake onna,0.117565
9,birthday card,0.116847


In [28]:
def get_recommendation(title, cos =  cos):
    idx = reversed[title]
    cos_score = list(enumerate(cos[idx]))
    cos_score = sorted(cos_score, key=lambda x:x[1], reverse=True)
    first_10 = cos_score[0:20]
    rec_indices = [x[0] for x in first_10]
    rec_sim = [x[1] for x in first_10]
    soft = pd.Series(list(df['titles'].iloc[rec_indices])).to_frame()
    hot = pd.Series(rec_sim).to_frame()
    dff = pd.concat((soft, hot), axis= 1)
    dff.columns=['titles', 'similarity']
    return dff

In [29]:
get_recommendation('confidential assignment')

Unnamed: 0,titles,similarity
0,confidential assignment,1.0
1,joint security area,0.307776
2,confidential assignment 2: international,0.272549
3,steel rain,0.258217
4,6/45,0.254622
5,the net,0.239689
6,children gone to poland,0.233145
7,poongsan,0.224492
8,secret reunion,0.222408
9,beautiful days,0.220903


In [30]:
get_recommendation_bytag('confidential assignment')

Unnamed: 0,titles,similarity
0,confidential assignment,1.0
1,confidential assignment 2: international,0.6
2,secretly greatly,0.4
3,carter,0.381385
4,operation chromite,0.3
5,the prison,0.3
6,"the gangster, the cop and the devil",0.3
7,the accidental detective 2: in action,0.3
8,dragon inn part 1: the city of sadness,0.3
9,masquerade night,0.3


In [31]:
get_recommendation('exhuma')

Unnamed: 0,titles,similarity
0,exhuma,1.0
1,the concubine,0.180253
2,rules of dating,0.173399
3,a frozen flower,0.144851
4,hwayi: a monster boy,0.143215
5,citizen of a kind,0.139662
6,dancing queen,0.139147
7,man wanted,0.13389
8,perfect number,0.128801
9,along with the gods 2: the last 49 days,0.127768


In [32]:
get_recommendation_bytag('exhuma')

Unnamed: 0,titles,similarity
0,exhuma,1.0
1,dr. cheon and lost talisman,0.381385
2,karaoke crazies,0.316228
3,qin zei you dao,0.316228
4,address unknown,0.316228
5,the secret of the black dahlia,0.316228
6,jenius: countdown in the riddle,0.316228
7,monster emperor: extra story,0.316228
8,the cursed: dead man's prey,0.316228
9,while you were sleeping,0.316228


In [33]:
get_recommendation_bycast('exhuma')

NameError: name 'get_recommendation_bycast' is not defined

In [None]:
get_recommendation_bygenre('exhuma')

Unnamed: 0,titles,similarity
0,whispering corridors 2: memento mori,1.0
1,whispering corridors,1.0
2,the wailing,1.0
3,house of the disappeared,1.0
4,the mimic,1.0
5,hide and never seek,1.0
6,gonjiam: haunted asylum,1.0
7,the closet,1.0
8,svaha: the sixth finger,1.0
9,deja vu,1.0


##### MOVIE RECOMMENDATION BY CAST

In [None]:
cast = df['cast']

In [None]:
cast = cast.fillna('nocast')

In [None]:
cast = cast.str.strip()
cast = cast.str.replace(' ', '')
cast = cast.str.lower()

In [None]:
cs = CountVectorizer()
cast_transformed  = cs.fit_transform(cast)
cast_sim = cosine_similarity(cast_transformed, cast_transformed)

In [None]:
def get_recommendation_bycast(title, cast_sim = cast_sim):
    idx = reversed[title]
    cos_score = list(enumerate(cast_sim[idx]))
    cos_score = sorted(cos_score, key=lambda x:x[1], reverse=True)
    first_10 = cos_score[0:20]
    rec_indices = [x[0] for x in first_10]
    rec_sim = [x[1] for x in first_10]
    soft = pd.Series(list(df['titles'].iloc[rec_indices])).to_frame()
    hot = pd.Series(rec_sim).to_frame()
    dff = pd.concat((soft, hot), axis= 1)
    dff.columns=['titles', 'similarity']
    return dff

In [None]:
genre =  df['genres']

In [None]:
genre = genre.fillna('no genre')
genre = genre.str.strip()
genre = genre.str.replace(' ', '')
genre = genre.str.lower()

In [None]:
gn = TfidfVectorizer()
genre_transformed = gn.fit_transform(genre)
genre_sim = linear_kernel(genre_transformed, genre_transformed)

In [None]:
def get_recommendation_bygenre(title, genre_sim = genre_sim):
    idx = reversed[title]
    cos_score = list(enumerate(genre_sim[idx]))
    cos_score = sorted(cos_score, key=lambda x:x[1], reverse=True)
    first_10 = cos_score[0:20]
    rec_indices = [x[0] for x in first_10]
    rec_sim = [x[1] for x in first_10]
    soft = pd.Series(list(df['titles'].iloc[rec_indices])).to_frame()
    hot = pd.Series(rec_sim).to_frame()
    dff = pd.concat((soft, hot), axis= 1)
    dff.columns=['titles', 'similarity']
    return dff

In [None]:
pd.DataFrame(cos, index=df["titles"], columns=df["titles"])

In [None]:
combined = (genre + ','  + tags)

In [None]:
comb = CountVectorizer()
combined_transformed = comb.fit_transform(combined)
comb_sim = cosine_similarity(combined_transformed, combined_transformed)

In [None]:
def get_recommendation_bycombination(title, comb_sim = comb_sim):
    idx = reversed[title]
    cos_score = list(enumerate(comb_sim[idx]))
    cos_score = sorted(cos_score, key=lambda x:x[1], reverse=True)
    first_10 = cos_score[0:20]
    rec_indices = [x[0] for x in first_10]
    rec_sim = [x[1] for x in first_10]
    rec = pd.Series(list(df['titles'].iloc[rec_indices])).to_frame()
    year = pd.Series(list(df['year'].iloc[rec_indices])).to_frame()
    country = pd.Series(list(df['country'].iloc[rec_indices])).to_frame()
    sim = pd.Series(rec_sim).to_frame()
    dff = pd.concat((rec, year, country, sim), axis= 1)
    dff.columns=['titles', 'year', 'country','similarity']
    return dff

In [None]:
get_recommendation_bycombination('exhuma')

Unnamed: 0,titles,year,country,similarity
0,exhuma,2024,Korean Movie,1.0
1,the cursed: dead man's prey,2021,Korean Movie,0.358569
2,satoru dayo,2015,Japanese Movie,0.310087
3,the wailing,2016,Korean Movie,0.3
4,the mimic,2017,Korean Movie,0.3
5,the haunting 3,2023,Chinese Movie,0.298807
6,possessed,2009,Korean Movie,0.29277
7,cinderella,2006,Korean Movie,0.29277
8,spiral,1998,Japanese Movie,0.288675
9,oppressive love,2016,Chinese Movie,0.288675


In [None]:
get_recommendation_bycombination('')

Unnamed: 0,titles,year,country,similarity
0,rurouni kenshin,2012,Japanese Movie,1.0
1,rurouni kenshin: the legend ends,2014,Japanese Movie,0.733333
2,rurouni kenshin: kyoto inferno,2014,Japanese Movie,0.733333
3,rurouni kenshin: the final,2021,Japanese Movie,0.6
4,"have sword, will travel",1969,Hong Kong Movie,0.57735
5,azumi,2003,Japanese Movie,0.544949
6,rurouni kenshin: the beginning,2021,Japanese Movie,0.533333
7,the blood brothers,1973,Hong Kong Movie,0.527046
8,azumi 2: death or love,2005,Japanese Movie,0.527046
9,11 samurai,1967,Japanese Movie,0.516398
