In [284]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.metrics.pairwise import cosine_similarity

In [403]:
# Load the movie dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('asianmovies.csv')

In [None]:
# Peek at the last rows of the loaded frame.
df.tail()

In [476]:
# Filter to 2023 Japanese movies.
# NOTE(review): the literal "Japanese Movie " ends with a space; presumably the raw
# `country` values carry that trailing space. Confirm before normalising the column.
df.query('year == 2023 and country == "Japanese Movie "')

Unnamed: 0,titles,links,user_rating,country,year,screenwriter,director,description,tags,cast,genres,user_count,content_rating
2709,monster,https://mydramalist.com/743747-kaibutsu,9.0,Japanese Movie,2023,"Thriller,Drama,Family",Koreeda Hirokazu,A suburban town with a large lake. A single mo...,"School Bullying,Coming Of Age,Mother-Son Relat...","Ando Sakura,Nagayama Eita,Kurokawa Souya,Hiira...","Thriller,Drama,Family",6862,G
2711,godzilla minus one,https://mydramalist.com/755475-godzilla-minus-one,8.5,Japanese Movie,2023,"Action,Tokusatsu,Horror,Sci-Fi",Yamazaki Takashi,Post-war Japan is at its lowest point when a n...,"Pilot Male Lead,Post World War 2,Kamikaze,Kaij...","Kamiki Ryunosuke,Hamabe Minami,Yamada Yuki,Aok...","Action,Tokusatsu,Horror,Sci-Fi",2542,G
2713,as long as we both shall live,https://mydramalist.com/726653-my-happy-marriage,8.5,Japanese Movie,2023,"Historical,Romance,Drama,Fantasy",Kanno Tomoe,"Saimori Miyo, the daughter of a loveless marri...","Arranged Marriage,Supernatural Power,Supernatu...","Imada Mio,Meguro Ren,Watanabe Keisuke,Onishi R...","Historical,Romance,Drama,Fantasy",11137,G
2716,our secret diary,https://mydramalist.com/748899-our-secret-diary,8.4,Japanese Movie,2023,"Romance,Youth",Takemura Kentaro,"Nozomi, a second-year high schooler, finds a l...","School Setting,Popular Male Lead,High School,M...","Takahashi Fumiya,Sakurada Hiyori,Kayashima Miz...","Romance,Youth",9142,G
2718,perfect days,https://mydramalist.com/752603-perfect-days,8.3,Japanese Movie,2023,"Cleaner Male Lead,Loner Male Lead,Co-produced,...","Life,Drama",Hirayama works as a toilet cleaner in Tokyo. H...,"Cleaner Male Lead,Loner Male Lead,Co-produced,...","Yakusho Koji,Emoto Tokio,Aoi Yamada,Aso Yumi,I...","Life,Drama",1151,NR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2942,kyrie,https://mydramalist.com/751205-kyrie-no-uta,7.5,Japanese Movie,2023,"Music,Youth,Drama",Iwai Shunji,"In the Great East Japan Earthquake, a young gi...","Musician Female Lead,Adapted From A Novel","Aina The End,Matsumura Hokuto,Kuroki Haru,Hiro...","Music,Youth,Drama",71,G
2945,one last bloom,https://mydramalist.com/740065-haru-ni-chiru,7.5,Japanese Movie,2023,"Boxer Male Lead,Athlete Male Lead,Boxing,Adapt...","Drama,Sports",Hirooka Jinichi is a former boxer. He lost a b...,"Boxer Male Lead,Athlete Male Lead,Boxing,Adapt...","Sato Koichi,Yokohama Ryusei,Yamaguchi Tomoko,H...","Drama,Sports",26,G
2947,ichikei's crow: the movie,https://mydramalist.com/732779-ichikei-s-crow-...,7.5,Japanese Movie,2023,Hamada Hideya,Tanaka Ryo,Two years after Iruma Michio left first crimin...,"Criminal Justice System,Courtroom Setting,Judg...","Takenouchi Yutaka,Kuroki Haru,Mukai Osamu,Sait...","Mystery,Comedy,Law,Drama",43,G
2950,jigen daisuke,https://mydramalist.com/758539-jigen-daisuke,7.5,Japanese Movie,2023,"Action,Adventure,Mystery,Drama",Hashimoto Hajime,"Feeling unhappy with his gun, Jigen is looking...","Web Movie,Adapted From A Manga,Investigation","Tamayama Tetsuji,Maki Kotoka,Maki Yoko,Nagase ...","Action,Adventure,Mystery,Drama",95,15+


In [405]:
# Drop the leftover CSV index column. errors='ignore' keeps this cell re-runnable:
# a bare `del df[...]` raises KeyError the second time the cell is executed.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
# Trim and lowercase the titles so later lookups by title use one canonical form.
df['titles'] = df['titles'].str.strip().str.lower()

In [406]:
# Raw description strings as a plain Python list, in row order.
description = df['description'].tolist()

In [407]:
def clean_description(text):
    """Normalise one raw movie description string.

    Steps, in order:
      1. collapse every run of whitespace (including newlines) to one space;
      2. remove "(Source: ...)" credit blocks;
      3. keep only the text before the first "~~" marker;
      4. remove the literal 'Edit Translation' and all double quotes;
      5. collapse whitespace again and trim, because steps 2-4 can leave
         double or trailing spaces behind.

    Parameters
    ----------
    text : str or missing value
        Raw description. Non-string input (e.g. NaN from pandas, None) is
        treated as "no description".

    Returns
    -------
    str
        The cleaned description; '' for non-string input.
    """
    # Missing descriptions arrive as float NaN / None; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    # Collapse whitespace first so the "(Source: ...)" pattern can match across line breaks.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\(Source:.*?\)', '', text)  # Remove "(Source: ...)"
    text = text.split("~~")[0]  # Drop everything after the first "~~"
    text = text.replace('Edit Translation', '')
    text = text.replace('\"', '')
    # Final tidy-up: the removals above may have left gaps or trailing blanks.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Run every raw description through clean_description, then write the cleaned
# text back onto the frame (the Series is kept for the vectorisers further down).
cleaned = list(map(clean_description, description))
cleaned_series = pd.Series(cleaned)
df['description'] = cleaned_series

In [None]:
# List rows where `director` holds the same value as `tags`.
# NOTE(review): presumably a check for records whose columns were shifted at scrape time; confirm.
df[df['director'] == df['tags']]

In [None]:
# List rows where `director` holds the same value as `genres`.
# NOTE(review): presumably another shifted-column check; confirm.
df.query('director == genres')

#### MOVIE RECOMMENDATION BY TAGS

In [None]:
# Preview of `tags` with missing values taken from `genres`.
# The result is only displayed; it is not assigned back, so df['tags'] is unchanged.
df['tags'].fillna(df['genres'])

In [439]:
# Start from the raw tags column.
tags = df['tags']

In [428]:
# Tags with missing rows dropped and the index renumbered.
# NOTE(review): not referenced by any other cell visible here; possibly leftover.
tags_na_removed = df['tags'].dropna(ignore_index=True)

In [442]:
# Give movies without tags a placeholder so the vectoriser receives a string for every row.
# NOTE(review): the next cell strips spaces and lowercases, turning this into the token
# 'notags'. All untagged movies then share that token and look identical to each other.
# Confirm this is intended.
tags = tags.fillna('No tags')

In [443]:
# Glue multi-word tags into single words and lowercase them.
# NOTE(review): this presumably makes each comma-separated tag one vectoriser token;
# hyphenated tags may still be split by the default token pattern. Confirm.
tags = tags.str.replace(' ', '').str.lower()

In [510]:
# Bag-of-tags count matrix (one row per movie) and its pairwise cosine similarity.
cv = CountVectorizer()
transformed_tags = cv.fit_transform(tags)
# With a single argument the matrix is compared against itself.
sim = cosine_similarity(transformed_tags)

In [509]:
# (number of movies, size of the tag vocabulary)
transformed_tags.shape

(4970, 2784)

In [515]:
def get_recommendation_bytag(title, sim = sim, top_n = 20):
    """Return the movies whose tag vectors are most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``df['titles']``; must match exactly.
    sim : array-like, shape (n_movies, n_movies)
        Pairwise similarity matrix whose row order follows ``df``.
        Defaults to the module-level ``sim`` as it exists when this cell runs.
    top_n : int, default 20
        Number of rows to return (previously a hard-coded slice of 20).

    Returns
    -------
    pandas.DataFrame
        Columns ``titles`` and ``similarity``, sorted by descending similarity;
        ties keep dataset order. The queried movie is not filtered out.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``df['titles']``.
    """
    # Resolve the title to a row position directly from df. This avoids depending on
    # the global `reversed` lookup (which shadows the builtin and is created in a later
    # cell) and always yields a position that is valid for `sim` and `.iloc`.
    positions = np.flatnonzero((df['titles'] == title).to_numpy())
    if positions.size == 0:
        raise KeyError(title)
    # Several movies can share a title; a multi-row lookup used to break the sort,
    # so the first match is used.
    idx = positions[0]
    scores = np.asarray(sim[idx])
    # Stable argsort on negated scores reproduces sorted(..., reverse=True) ordering.
    order = np.argsort(-scores, kind='stable')[:top_n]
    return pd.DataFrame({
        'titles': df['titles'].iloc[order].to_list(),
        'similarity': scores[order],
    })

In [516]:
# Tag-based recommendations for a sample title.
get_recommendation_bytag('lighting up the stars')

Unnamed: 0,titles,similarity
0,lighting up the stars,1.0
1,papa,0.596285
2,salut d'amour,0.596285
3,dad for rent,0.596285
4,moratorium tamako,0.547723
5,step,0.527046
6,an autumn afternoon,0.516398
7,sori: voice from the heart,0.516398
8,oi handsome!!,0.516398
9,a day with my son,0.516398


In [507]:
# Same call, captured at an earlier execution count (In [507]).
# NOTE(review): the scores below differ from the run above, so `sim` was presumably
# built differently at that time. Re-run or remove this stale output.
get_recommendation_bytag('lighting up the stars')

Unnamed: 0,titles,similarity
0,lighting up the stars,1.0
1,papa,0.454605
2,dad for rent,0.451531
3,moratorium tamako,0.409701
4,oi handsome!!,0.408133
5,a day with my son,0.395068
6,sori: voice from the heart,0.387922
7,salut d'amour,0.369134
8,the shadowless tower,0.365321
9,gunkan shonen,0.356489


#### RECOMMENDATION USING MOVIE DESCRIPTION

In [492]:
# Word-count model of the cleaned descriptions with cosine similarity.
# NOTE(review): `tv`, `transformed` and `cos` are all reassigned by the TF-IDF cell
# that follows, so on a top-to-bottom run this cell's results are overwritten.
tv = CountVectorizer(stop_words='english')
transformed = tv.fit_transform(cleaned_series)
cos = cosine_similarity(transformed)

In [494]:
# TF-IDF model of the cleaned descriptions (English stop words removed).
tv = TfidfVectorizer(stop_words='english')
transformed = tv.fit_transform(cleaned_series)
# Dot product of every row with every other row; one argument means "against itself".
cos = linear_kernel(transformed)

In [None]:
# NOTE(review): stray cell. It constructs a CountVectorizer that is never assigned; candidate for removal.
CountVectorizer()

In [294]:
# (number of movies, size of the description vocabulary)
transformed.shape

(4970, 21713)

In [295]:
# Title -> row lookup used by the get_recommendation* functions.
# NOTE(review): the name shadows Python's builtin `reversed`. The values are df index
# labels, which the functions use as row positions; that only holds while df keeps
# its default 0..n-1 index. Titles that occur more than once map to several rows.
reversed = pd.Series(df.index, index= df['titles'])

In [477]:
# Reverse check: which title maps to row 2709?
reversed[reversed== 2709]

titles
monster    2709
dtype: int64

In [496]:
# Inline prototype of the description-based recommender for row 2709
# (the lookup above maps 'monster' to 2709): rank all movies by similarity
# to that row and show the ten best.
#idx = reversed[title]
cos_score = sorted(enumerate(cos[2709]), key=lambda pair: pair[1], reverse=True)
first_10 = cos_score[:10]
rec_indices = [position for position, _ in first_10]
rec_sim = [score for _, score in first_10]
soft = pd.Series(df['titles'].iloc[rec_indices].tolist()).to_frame()
hot = pd.Series(rec_sim).to_frame()
dff = pd.concat((soft, hot), axis=1)
dff.columns = ['titles', 'similarity']
dff

Unnamed: 0,titles,similarity
0,monster,1.0
1,silenced,0.226271
2,shichinin no tomurai,0.170784
3,lost love,0.152134
4,school excursion,0.143379
5,sansho the bailiff,0.140129
6,at home,0.134022
7,tokyo family,0.129332
8,kuchisake onna,0.117565
9,birthday card,0.116847


In [545]:
def get_recommendation(title, cos =  cos, top_n = 20):
    """Return the movies whose descriptions are most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``df['titles']``; must match exactly.
    cos : array-like, shape (n_movies, n_movies)
        Pairwise description-similarity matrix whose row order follows ``df``.
        Defaults to the module-level ``cos`` as it exists when this cell runs.
    top_n : int, default 20
        Number of rows to return (previously a hard-coded slice of 20).

    Returns
    -------
    pandas.DataFrame
        Columns ``titles`` and ``similarity``, sorted by descending similarity;
        ties keep dataset order. The queried movie is not filtered out.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``df['titles']``.
    """
    # Resolve the title to a row position directly from df rather than through the
    # global `reversed` lookup (a name that shadows the builtin), so the position is
    # always valid for `cos` and `.iloc`.
    positions = np.flatnonzero((df['titles'] == title).to_numpy())
    if positions.size == 0:
        raise KeyError(title)
    # Several movies can share a title; a multi-row lookup used to break the sort,
    # so the first match is used.
    idx = positions[0]
    scores = np.asarray(cos[idx])
    # Stable argsort on negated scores reproduces sorted(..., reverse=True) ordering.
    order = np.argsort(-scores, kind='stable')[:top_n]
    return pd.DataFrame({
        'titles': df['titles'].iloc[order].to_list(),
        'similarity': scores[order],
    })

In [548]:
# Description-based recommendations for a sample title.
get_recommendation('confidential assignment')

Unnamed: 0,titles,similarity
0,confidential assignment,1.0
1,joint security area,0.307776
2,confidential assignment 2: international,0.272549
3,steel rain,0.258217
4,6/45,0.254622
5,the net,0.239689
6,children gone to poland,0.233145
7,poongsan,0.224492
8,secret reunion,0.222408
9,beautiful days,0.220903


In [513]:
# The same title through the tag-based recommender, for comparison.
get_recommendation_bytag('confidential assignment')

Unnamed: 0,titles,similarity
0,confidential assignment,1.0
1,confidential assignment 2: international,0.6
2,secretly greatly,0.4
3,carter,0.381385
4,operation chromite,0.3
5,the prison,0.3
6,"the gangster, the cop and the devil",0.3
7,the accidental detective 2: in action,0.3
8,dragon inn part 1: the city of sadness,0.3
9,masquerade night,0.3


In [566]:
# Description-based recommendations for 'exhuma'.
get_recommendation('exhuma')

Unnamed: 0,titles,similarity
0,exhuma,1.0
1,the concubine,0.180253
2,rules of dating,0.173399
3,a frozen flower,0.144851
4,hwayi: a monster boy,0.143215
5,citizen of a kind,0.139662
6,dancing queen,0.139147
7,man wanted,0.13389
8,perfect number,0.128801
9,along with the gods 2: the last 49 days,0.127768


In [565]:
# Tag-based recommendations for 'exhuma'.
get_recommendation_bytag('exhuma')

Unnamed: 0,titles,similarity
0,exhuma,1.0
1,dr. cheon and lost talisman,0.381385
2,karaoke crazies,0.316228
3,qin zei you dao,0.316228
4,address unknown,0.316228
5,the secret of the black dahlia,0.316228
6,jenius: countdown in the riddle,0.316228
7,monster emperor: extra story,0.316228
8,the cursed: dead man's prey,0.316228
9,while you were sleeping,0.316228


In [564]:
# Cast-based recommendations for 'exhuma'.
# NOTE(review): get_recommendation_bycast is defined further down in the notebook, so
# this cell raises NameError on a fresh top-to-bottom run. Move it below the definition.
get_recommendation_bycast('exhuma')

Unnamed: 0,titles,similarity
0,exhuma,1.0
1,swimming bird,0.235702
2,tazza 2: the hidden card,0.182574
3,the quiet family,0.166667
4,shiri,0.166667
5,happy end,0.166667
6,our twisted hero,0.166667
7,a taxi driver,0.166667
8,1987: when the day comes,0.166667
9,canola,0.166667


In [563]:
# Genre-based recommendations for 'exhuma'.
# NOTE(review): get_recommendation_bygenre is defined further down in the notebook, so
# this cell raises NameError on a fresh top-to-bottom run. Move it below the definition.
get_recommendation_bygenre('exhuma')

Unnamed: 0,titles,similarity
0,whispering corridors 2: memento mori,1.0
1,whispering corridors,1.0
2,the wailing,1.0
3,house of the disappeared,1.0
4,the mimic,1.0
5,hide and never seek,1.0
6,gonjiam: haunted asylum,1.0
7,the closet,1.0
8,svaha: the sixth finger,1.0
9,deja vu,1.0


#### MOVIE RECOMMENDATION BY CAST

In [517]:
# Start from the raw cast column.
cast = df['cast']

In [519]:
# Give movies without cast data a placeholder so the vectoriser receives a string for every row.
# NOTE(review): every such movie then shares the token 'nocast' and will look
# identical to the others by cast. Confirm this is intended.
cast = cast.fillna('nocast')

In [523]:
# Trim, glue each multi-word name into a single word, and lowercase.
cast = cast.str.strip().str.replace(' ', '').str.lower()

In [526]:
# Bag-of-cast count matrix (one row per movie) and its pairwise cosine similarity.
cs = CountVectorizer()
cast_transformed = cs.fit_transform(cast)
# With a single argument the matrix is compared against itself.
cast_sim = cosine_similarity(cast_transformed)

In [537]:
def get_recommendation_bycast(title, cast_sim = cast_sim, top_n = 20):
    """Return the movies whose cast vectors are most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``df['titles']``; must match exactly.
    cast_sim : array-like, shape (n_movies, n_movies)
        Pairwise cast-similarity matrix whose row order follows ``df``.
        Defaults to the module-level ``cast_sim`` as it exists when this cell runs.
    top_n : int, default 20
        Number of rows to return (previously a hard-coded slice of 20).

    Returns
    -------
    pandas.DataFrame
        Columns ``titles`` and ``similarity``, sorted by descending similarity;
        ties keep dataset order. The queried movie is not filtered out.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``df['titles']``.
    """
    # Resolve the title to a row position directly from df rather than through the
    # global `reversed` lookup (a name that shadows the builtin), so the position is
    # always valid for `cast_sim` and `.iloc`.
    positions = np.flatnonzero((df['titles'] == title).to_numpy())
    if positions.size == 0:
        raise KeyError(title)
    # Several movies can share a title; a multi-row lookup used to break the sort,
    # so the first match is used.
    idx = positions[0]
    scores = np.asarray(cast_sim[idx])
    # Stable argsort on negated scores reproduces sorted(..., reverse=True) ordering.
    order = np.argsort(-scores, kind='stable')[:top_n]
    return pd.DataFrame({
        'titles': df['titles'].iloc[order].to_list(),
        'similarity': scores[order],
    })

In [552]:
# Start from the raw genres column.
genre =  df['genres']

In [553]:
# Fill missing genres with a placeholder, then trim, remove inner spaces and lowercase.
# The placeholder therefore ends up as the single word 'nogenre'.
genre = (
    genre
    .fillna('no genre')
    .str.strip()
    .str.replace(' ', '')
    .str.lower()
)

In [559]:
# TF-IDF matrix of the genre strings (one row per movie) and its pairwise dot products.
gn = TfidfVectorizer()
genre_transformed = gn.fit_transform(genre)
# With a single argument the matrix is compared against itself.
genre_sim = linear_kernel(genre_transformed)

In [560]:
def get_recommendation_bygenre(title, genre_sim = genre_sim, top_n = 20):
    """Return the movies whose genre vectors are most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``df['titles']``; must match exactly.
    genre_sim : array-like, shape (n_movies, n_movies)
        Pairwise genre-similarity matrix whose row order follows ``df``.
        Defaults to the module-level ``genre_sim`` as it exists when this cell runs.
    top_n : int, default 20
        Number of rows to return (previously a hard-coded slice of 20).

    Returns
    -------
    pandas.DataFrame
        Columns ``titles`` and ``similarity``, sorted by descending similarity;
        ties keep dataset order. The queried movie is not filtered out, and with
        many exact genre matches it is not necessarily the first row.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``df['titles']``.
    """
    # Resolve the title to a row position directly from df rather than through the
    # global `reversed` lookup (a name that shadows the builtin), so the position is
    # always valid for `genre_sim` and `.iloc`.
    positions = np.flatnonzero((df['titles'] == title).to_numpy())
    if positions.size == 0:
        raise KeyError(title)
    # Several movies can share a title; a multi-row lookup used to break the sort,
    # so the first match is used.
    idx = positions[0]
    scores = np.asarray(genre_sim[idx])
    # Stable argsort on negated scores reproduces sorted(..., reverse=True) ordering.
    order = np.argsort(-scores, kind='stable')[:top_n]
    return pd.DataFrame({
        'titles': df['titles'].iloc[order].to_list(),
        'similarity': scores[order],
    })

In [104]:
# Label the description-similarity matrix with titles on both axes for inspection.
# NOTE(review): the stored output carries execution count In [104] and shows capitalised
# titles, so it presumably predates the title lower-casing. Re-run or drop it.
pd.DataFrame(cos, index=df["titles"], columns=df["titles"])

titles,Hachiko Monogatari,Seven Samurai,Harakiri,Ikiru,Ran,Yojimbo,Rashomon,Tokyo Story,High and Low,The Last Emperor,...,Carrying Spring,Goodbye Debussy,12th Assistant Deacon,Phone Call to the Bar 2,MAD SAD BAD,The Furthest End Awaits,Contact Point,Isao Takahata and His Tale of the Princess Kaguya,Beautiful Legacy,Orpheus' Lyre
titles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Hachiko Monogatari,1.000000,0.045805,0.017396,0.046219,0.067311,0.049357,0.063321,0.040410,0.046976,0.065561,...,0.043918,0.024671,0.029749,0.024575,0.036342,0.027137,0.048913,0.037213,0.030843,0.030221
Seven Samurai,0.045805,1.000000,0.138600,0.048601,0.073598,0.126092,0.172681,0.055109,0.057117,0.066964,...,0.045504,0.040589,0.039290,0.030824,0.048732,0.017961,0.040397,0.052027,0.028294,0.030696
Harakiri,0.017396,0.138600,1.000000,0.025758,0.121041,0.084405,0.111894,0.024200,0.038766,0.010359,...,0.011707,0.023871,0.010240,0.020137,0.025015,0.012734,0.024501,0.011122,0.006333,0.053304
Ikiru,0.046219,0.048601,0.025758,1.000000,0.103619,0.070524,0.069839,0.080100,0.062083,0.047353,...,0.055290,0.030251,0.018447,0.033627,0.059330,0.025637,0.082942,0.039096,0.118343,0.047000
Ran,0.067311,0.073598,0.121041,0.103619,1.000000,0.135995,0.109307,0.088009,0.109200,0.092432,...,0.100245,0.046813,0.043963,0.050368,0.048950,0.023800,0.074867,0.073603,0.089930,0.043425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Furthest End Awaits,0.027137,0.017961,0.012734,0.025637,0.023800,0.025593,0.029961,0.064684,0.015026,0.013153,...,0.038191,0.049328,0.016372,0.023984,0.044155,1.000000,0.042546,0.018996,0.036760,0.060140
Contact Point,0.048913,0.040397,0.024501,0.082942,0.074867,0.051393,0.046685,0.083352,0.029385,0.053638,...,0.043611,0.031471,0.023003,0.037404,0.035113,0.042546,1.000000,0.045415,0.042912,0.077162
Isao Takahata and His Tale of the Princess Kaguya,0.037213,0.052027,0.011122,0.039096,0.073603,0.055287,0.063333,0.041400,0.042655,0.058414,...,0.057240,0.027175,0.030702,0.033131,0.031143,0.018996,0.045415,1.000000,0.035827,0.026510
Beautiful Legacy,0.030843,0.028294,0.006333,0.118343,0.089930,0.056234,0.044698,0.034237,0.032226,0.051030,...,0.106955,0.029757,0.020410,0.024449,0.113270,0.036760,0.042912,0.035827,1.000000,0.047170
