<a href="https://colab.research.google.com/github/codeBySejal/ml-colab-notebooks/blob/main/Kdrama_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Download dataset via Kaggle CLI
!kaggle datasets download ahbab911/top-250-korean-dramas-kdrama-dataset

Dataset URL: https://www.kaggle.com/datasets/ahbab911/top-250-korean-dramas-kdrama-dataset
License(s): CC0-1.0
Downloading top-250-korean-dramas-kdrama-dataset.zip to /content
  0% 0.00/89.9k [00:00<?, ?B/s]
100% 89.9k/89.9k [00:00<00:00, 64.8MB/s]


In [51]:
import pandas as pd
kdrama_df = pd.read_csv('/content/top-250-korean-dramas-kdrama-dataset.zip')
print(kdrama_df.head())

                  Name                   Aired Date  Year of release  \
0       Move to Heaven                 May 14, 2021             2021   
1       Flower of Evil  Jul 29, 2020 - Sep 23, 2020             2020   
2    Hospital Playlist  Mar 12, 2020 - May 28, 2020             2020   
3  Hospital Playlist 2  Jun 17, 2021 - Sep 16, 2021             2021   
4            My Mister  Mar 21, 2018 - May 17, 2018             2018   

  Original Network              Aired On  Number of Episodes       Duration  \
0          Netflix                Friday                  10        52 min.   
1              tvN   Wednesday, Thursday                  16  1 hr. 10 min.   
2   Netflix,  tvN               Thursday                  12  1 hr. 30 min.   
3   Netflix,  tvN               Thursday                  12  1 hr. 40 min.   
4              tvN   Wednesday, Thursday                  16  1 hr. 17 min.   

                          Content Rating  Rating  \
0  18+ Restricted (violence & profanity)

In [53]:
#features
print(kdrama_df.columns)

Index(['Name', 'Aired Date', 'Year of release', 'Original Network', 'Aired On',
       'Number of Episodes', 'Duration', 'Content Rating', 'Rating',
       'Synopsis', 'Genre', 'Tags', 'Director', 'Screenwriter', 'Cast',
       'Production companies', 'Rank'],
      dtype='object')


In [54]:
kdrama_df.loc[0]

Unnamed: 0,0
Name,Move to Heaven
Aired Date,"May 14, 2021"
Year of release,2021
Original Network,Netflix
Aired On,Friday
Number of Episodes,10
Duration,52 min.
Content Rating,18+ Restricted (violence & profanity)
Rating,9.2
Synopsis,Geu Roo is a young autistic man. He works for ...


In [55]:
kdrama_df.loc[0]['Synopsis']

"Geu Roo is a young autistic man. He works for his father’s business “Move To Heaven.” Their job is to arrange items left by deceased people. One day, Geu Roo's own father dies. Geu Roo is left alone, but his uncle Sang Koo suddenly appears in front of him. Sang Koo is a cold man. He was a martial artist who fought in underground matches. He went to prison because of what happened at his fight. Sang Koo now becomes Geu Roo’s guardian. They run “Move To Heaven” together."

In [56]:
kdrama_df.loc[0]['Tags']

'Autism, Uncle-Nephew Relationship, Death, Savant Syndrome, Mourning, Tearjerker, Father-Son Relationship, Life Lesson, Ex-convict, Cleaning And Organizing'

In [75]:
kdrama_df['embedding_text'] = kdrama_df['Genre'] + ' ' + kdrama_df['Cast'] + ' ' + kdrama_df['Tags']

In [77]:
kdrama_df['embedding_text']

Unnamed: 0,embedding_text
0,"Life, Drama, Family Lee Je Hoon, Tang Jun S..."
1,"Thriller, Romance, Crime, Melodrama Lee Jo..."
2,"Friendship, Romance, Life, Medical Jo Jung..."
3,"Friendship, Romance, Life, Medical Jo Jung..."
4,"Psychological, Life, Drama, Family Lee Sun..."
...,...
245,"Historical, Romance, Medical, Fantasy Kim Nam ..."
246,"Historical, Romance, Melodrama, Political Park..."
247,"Law, Romance, Life, Drama Lee Sung Min, Kim Je..."
248,"Horror, Comedy, Romance, Fantasy Lee Seung Gi,..."


In [78]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

kdrama_corpus = []
for index, text in enumerate(kdrama_df['embedding_text']):
  text = re.sub('[^a-zA-Z]', ' ', text)
  lowercase_text = text.lower()
  lowercase_text = lowercase_text.split()
  refined_text = [word for word in lowercase_text if word not in set(stopwords.words('english'))]
  lemmatized_text = [lemmatizer.lemmatize(word) for word in refined_text]
  kdrama_corpus.append(' '.join(lemmatized_text))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [79]:
kdrama_corpus[0]

'life drama family lee je hoon tang jun sang hong seung hee jung suk yong jung young joo lee moon shik autism uncle nephew relationship death savant syndrome mourning tearjerker father son relationship life lesson ex convict cleaning organizing'

In [80]:
from tensorflow.keras.preprocessing.text import one_hot

vocab_size = 10000
one_hot_repr = [one_hot(sent, vocab_size) for sent in kdrama_corpus]
one_hot_repr[0]

[2515,
 709,
 8342,
 4661,
 6953,
 6919,
 1257,
 1276,
 7552,
 8686,
 6385,
 6347,
 7040,
 1002,
 6962,
 7040,
 8896,
 2590,
 4661,
 1060,
 6348,
 9479,
 6461,
 2581,
 3516,
 3159,
 2624,
 2322,
 2318,
 7515,
 52,
 8176,
 3516,
 2515,
 6787,
 5449,
 7075,
 3718,
 5553]

In [81]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = max(len(seq) for seq in kdrama_corpus)
padded_docs = pad_sequences(one_hot_repr, maxlen = max_length, padding = 'post')
padded_docs[0]

array([2515,  709, 8342, 4661, 6953, 6919, 1257, 1276, 7552, 8686, 6385,
       6347, 7040, 1002, 6962, 7040, 8896, 2590, 4661, 1060, 6348, 9479,
       6461, 2581, 3516, 3159, 2624, 2322, 2318, 7515,   52, 8176, 3516,
       2515, 6787, 5449, 7075, 3718, 5553,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [82]:
max_length

935

In [83]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

embedding_dimension = 10
model = Sequential()
model.add(Embedding(vocab_size, embedding_dimension, input_length = max_length))
model.compile('adam', 'mse')
model.summary()



In [84]:
embeddings = model.predict(padded_docs)
embeddings[0]



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


array([[ 0.03170756, -0.00305394,  0.0017276 , ..., -0.0147028 ,
        -0.04808709, -0.04092104],
       [-0.01012785,  0.02093926,  0.03652493, ..., -0.02720459,
         0.0087013 , -0.01570817],
       [-0.04840249,  0.04500215,  0.01749127, ..., -0.0350848 ,
        -0.01338469, -0.04396091],
       ...,
       [-0.04322326, -0.02333865,  0.03717582, ...,  0.02122513,
        -0.04080587, -0.02918975],
       [-0.04322326, -0.02333865,  0.03717582, ...,  0.02122513,
        -0.04080587, -0.02918975],
       [-0.04322326, -0.02333865,  0.03717582, ...,  0.02122513,
        -0.04080587, -0.02918975]], dtype=float32)

In [85]:
kdrama_df['Name'].tolist()

['Move to Heaven',
 'Flower of Evil',
 'Hospital Playlist',
 'Hospital Playlist 2',
 'My Mister',
 'Reply 1988',
 'Weak Hero Class 1',
 'Prison Playbook',
 'Alchemy of Souls',
 'Extraordinary Attorney Woo',
 'Mr. Queen',
 'Mother',
 "It's Okay to Not Be Okay",
 'Crash Landing on You',
 'Vincenzo',
 'Navillera',
 'Signal',
 'Mr. Sunshine',
 'Happiness',
 'Kingdom: Season 2',
 'SKY Castle',
 'Tomorrow',
 'Healer',
 'Stranger',
 'Twenty-Five Twenty-One',
 'The Red Sleeve',
 'Goblin',
 'The Uncanny Counter',
 'Mouse',
 'Kingdom',
 'Weightlifting Fairy Kim Bok Joo',
 'D.P.',
 'The Devil Judge',
 'The Penthouse',
 'Youth of May',
 'Taxi Driver',
 'Life on Mars',
 'Beyond Evil',
 'Racket Boys',
 'Hometown Cha-Cha-Cha',
 'Six Flying Dragons',
 'Our Beloved Summer',
 'The Guest',
 'Dear My Friends',
 'While You Were Sleeping',
 'The Penthouse 2',
 'Chicago Typewriter',
 '18 Again',
 'Arthdal Chronicles Part 2',
 'Arthdal Chronicles Part 3',
 'Through the Darkness',
 'Dr. Romantic 2',
 'Defendan

In [89]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def recommend(kdrama_title, top_n = 5):
  kdrama_row = kdrama_df[kdrama_df['Name'].str.lower() == kdrama_title.lower()]
  if kdrama_row.empty:
    return "Kdrama not found"
  kdrama_index = kdrama_row.index[0]
  kdrama_embedding = np.mean(embeddings[kdrama_index], axis=0).reshape(1, -1)
  all_embeddings = np.mean(embeddings, axis=1)
  similarities = cosine_similarity(kdrama_embedding, all_embeddings).flatten()
  recommended_indices = similarities.argsort()[-top_n-1:-1][::-1]
  recommended_dramas = kdrama_df.iloc[recommended_indices]['Name'].tolist()
  return recommended_dramas

recommended_dramas = recommend('True Beauty', top_n = 5)
print(recommended_dramas)

["What's Wrong with Secretary Kim", 'When the Camellia Blooms', 'Awaken', 'Secret', 'Nine: Nine Times Time Travel']
