# Import Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [2]:
from google.colab import files
files.upload()  # Upload kaggle.json

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"muhammadfathifarhat","key":"7db5203fc40607ba675837f473b53609"}'}

In [3]:
import os
import zipfile

# Buat direktori .kaggle dan pindahkan credential
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Preparation Data 1

In [4]:
!kaggle datasets download -d narayan63/netflix-popular-movies-dataset

Dataset URL: https://www.kaggle.com/datasets/narayan63/netflix-popular-movies-dataset
License(s): CC0-1.0
Downloading netflix-popular-movies-dataset.zip to /content
  0% 0.00/1.17M [00:00<?, ?B/s]
100% 1.17M/1.17M [00:00<00:00, 566MB/s]


In [5]:
# Ekstrak data
!unzip netflix-popular-movies-dataset.zip

Archive:  netflix-popular-movies-dataset.zip
  inflating: n_movies.csv            


In [6]:
df = pd.read_csv('n_movies.csv')
df

Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"['Emily Deschanel, ', 'Sam Jaeger, ', 'Gerardo...",9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"['Zach Aguilar, ', 'Kenichiro Ohashi, ', 'Emi ...",15413
...,...,...,...,...,...,...,...,...,...
9952,The Imperfects,(2022– ),TV-MA,45 min,"Action, Adventure, Drama",6.3,After an experimental gene therapy turns them ...,"['Morgan Taylor Campbell, ', 'Italia Ricci, ',...",3130
9953,The Walking Dead,(2010–2022),TV-MA,44 min,"Drama, Horror, Thriller",8.1,Sheriff Deputy Rick Grimes wakes up from a com...,"['Andrew Lincoln, ', 'Norman Reedus, ', 'Melis...",970067
9954,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199898
9955,Supernatural,(2005–2020),TV-14,44 min,"Drama, Fantasy, Horror",8.4,Two brothers follow their father's footsteps a...,"['Jared Padalecki, ', 'Jensen Ackles, ', 'Jim ...",439601


# Preprocessing Data 1

Meninjau data yang digunakan

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        9957 non-null   object 
 1   year         9430 non-null   object 
 2   certificate  6504 non-null   object 
 3   duration     7921 non-null   object 
 4   genre        9884 non-null   object 
 5   rating       8784 non-null   float64
 6   description  9957 non-null   object 
 7   stars        9957 non-null   object 
 8   votes        8784 non-null   object 
dtypes: float64(1), object(8)
memory usage: 700.2+ KB


Hanya kolom "rating" yang terhitung sebagai kolom numerik. Tipe data akan diperbaiki nanti.

In [8]:
df.shape

(9957, 9)

Manage null and duplicates data

In [9]:
df.isnull().sum()

Unnamed: 0,0
title,0
year,527
certificate,3453
duration,2036
genre,73
rating,1173
description,0
stars,0
votes,1173


karena kolom year, certificate, dan duration tidak relevan dalam pembuatan sistem rekomendasi dan memiliki null values yg tinggi, maka akan didrop kolom tersebut alih-alih drop null values.

In [10]:
# drop columns year, certificate, and duration

df.drop(columns=['year', 'certificate', 'duration'], inplace=True)
df.isnull().sum()

Unnamed: 0,0
title,0
genre,73
rating,1173
description,0
stars,0
votes,1173


In [11]:
# drop null values
df.dropna(inplace=True)

mencari tahu jumlah duplicate data

In [12]:
df.duplicated().sum()

np.int64(0)

Change Data Type

In [13]:
# mengubah tipe data pada kolom 'votes' menjadi int
df['votes'] = df['votes'].astype(str).str.replace(',', '', regex=False).astype(int)

Ekstraksi fitur dengan TF-IDF (Term Frequency-Inverse Document Frequency)

In [14]:
# fillna NaN
df = df.reset_index(drop=True)
df['description'] = df['description'].fillna('')

# Menghapus semua stop word dalam bahasa inggris seperti 'a', 'the', dll.
tfidf = TfidfVectorizer(stop_words='english')

# membuat matriks TF-IDF dengan fit transform data
tfidf_matrix = tfidf.fit_transform(df['description'])

tfidf_matrix.shape

(8772, 21010)

Parsing kolom `stars` yang berupa string list dikonversi menggunakan `literal_eval`

In [15]:
from ast import literal_eval

# Ubah string yang terlihat seperti list menjadi list yang terbaca pyhton
df['stars'] = df['stars'].apply(literal_eval)

# Modelling: Content-Based Recommender

Pada tahap ini, dilakukan modelling content-based filtering

## Plot Description-Based Recommender

Pada filtering ini, akan dibuat recommender yang mengambil kesimpulan berdasarkan kesamaan plot suatu film. Deskripsi plot diambil dari fitur 'description' dari dataset.

Pada tahap ini, kita akan membuat sistem rekomendasi dengan cara menghitung similarity menggunakan cosine similarity.

In [16]:
# hitung matriks cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# membuat reverse map dari index dan judul film
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    title = title.lower()
    matched_indices = indices[indices.index.str.lower() == title]
    if matched_indices.empty:
      return f"Judul '{title}' tidak ditemukan."

    idx = matched_indices.values[0]  # atau .item()

    # mengambil similarity scores dari seluruh film
    sim_scores = list(enumerate(cosine_sim[idx]))

    # menyortir film berdasarkan similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # mengambil skor dari top 10 most similar movies
    sim_scores = sim_scores[1:11]

    # mengambil index film
    movie_indices = [i[0] for i in sim_scores]

    # mengembalikan list top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [17]:
print(cosine_sim.shape)  # Harus (N, N) dengan N = jumlah baris df
print(df.shape)          # Harus sama dengan N

(8772, 8772)
(8772, 6)


## Cast and Genres Based Recommender

Pada tahap ini kita akan membuat sistem rekomendasi film berdasarkan kesamaan cast dan genre.

In [18]:
# memperbaiki entri pada kolom 'stars' agar mudah dibaca
def clean_stars(entry):
    # Jika isinya list, gabungkan, buang spasi dan koma berlebih
    if isinstance(entry, list):
        return ', '.join([s.strip().rstrip(',') for s in entry])
    return entry  # biarkan kalau bukan list

# Terapkan ke dataframe
df['stars'] = df['stars'].apply(clean_stars)
df.head()

Unnamed: 0,title,genre,rating,description,stars,votes
0,Cobra Kai,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"Ralph Macchio, William Zabka, Courtney Henggel...",177031
1,The Crown,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"Claire Foy, Olivia Colman, Imelda Staunton, Ma...",199885
2,Better Call Saul,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"Bob Odenkirk, Rhea Seehorn, Jonathan Banks, Pa...",501384
3,Devil in Ohio,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"Emily Deschanel, Sam Jaeger, Gerardo Celasco, ...",9773
4,Cyberpunk: Edgerunners,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"Zach Aguilar, Kenichiro Ohashi, Emi Lo, Aoi Yûki",15413


In [50]:
def cast_genre_recommender(df):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    df = df.copy()

    # Asumsikan kolom 'stars' sudah berupa string bersih hasil preprocessing
    df['stars'] = df['stars'].fillna('').astype(str)
    df['genre'] = df['genre'].fillna('').astype(str)

    # Gabungkan fitur stars dan genre menjadi satu kolom 'soup'
    df['soup'] = df['stars'] + ' ' + df['genre']

    # Vektorisasi dan hitung kemiripan cosine
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(df['soup'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    # Buat index judul
    df = df.reset_index(drop=True)
    indices = pd.Series(df.index, index=df['title'].str.lower()).drop_duplicates()

    # Fungsi untuk rekomendasi
    def get_recommendations(title, top_n=10):
        title = title.lower()
        if title not in indices:
            return f"Judul '{title}' tidak ditemukan."

        idx = indices[title]
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]

        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
        movie_indices = [i[0] for i in sim_scores]
        return df.loc[movie_indices, ['title', 'genre', 'stars']].reset_index(drop=True)

    return get_recommendations

contoh list rekomendasi top 10

In [51]:
recommender = cast_genre_recommender(df)
recommender("The Crown")

Unnamed: 0,title,genre,stars
0,The Crown,"Biography, Drama, History","Claire Foy, Olivia Colman, Imelda Staunton, Ma..."
1,Victoria & Abdul,"Biography, Drama, History","Stephen Frears, |, Stars:, Judi Dench, Ali Faz..."
2,The Most Hated Woman in America,"Biography, Drama, History","Tommy O'Haver, |, Stars:, Melissa Leo, Brandon..."
3,Answer for Heaven,Drama,
4,Black Heart,Drama,
5,Broadchurch,"Crime, Drama, Mystery","David Tennant, Olivia Colman, Jodie Whittaker,..."
6,Medici,"Biography, Drama, History","Daniel Sharman, Alessandra Mastronardi, Synnov..."
7,Versailles,"Biography, Drama, History","George Blagden, Alexander Vlahos, Tygh Runyan,..."
8,Borgia,"Biography, Drama, History","Mark Ryder, Isolda Dychauk, Diarmuid Noyes, Jo..."
9,Flowers,"Comedy, Drama","Sophia Di Martino, Olivia Colman, Julian Barra..."


# Modelling 2: Hybrid Recommender

Pada tahap ini, dilakukan modelling dengan hybrid filtering, caranya dengan menggabungkan content-based filtering dengan bobot TF-IDF.

In [53]:
def hybrid_recommender(df, weight_cast_genre=0.5, weight_plot=0.5):
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    df = df.copy()

    # Pastikan kolom string dan tidak null
    df['stars'] = df['stars'].fillna('').astype(str)
    df['genre'] = df['genre'].fillna('').astype(str)
    df['description'] = df['description'].fillna('').astype(str)

    # Gabungkan fitur metadata
    df['soup'] = df['stars'] + ' ' + df['genre']

    # Vectorizer untuk soup dan description
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(df['soup'])

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['description'])

    # Hitung cosine similarity
    cosine_sim_soup = cosine_similarity(count_matrix, count_matrix)
    cosine_sim_plot = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Gabungkan kedua similarity
    cosine_sim_hybrid = (weight_cast_genre * cosine_sim_soup) + (weight_plot * cosine_sim_plot)

    # Mapping judul ke index
    df = df.reset_index(drop=True)
    indices = pd.Series(df.index, index=df['title'].str.lower()).drop_duplicates()

    # Fungsi rekomendasi
    def get_hybrid_recommendations(title, top_n=10):
        title = title.lower()
        if title not in indices:
            return f"Judul '{title}' tidak ditemukan."

        idx = indices[title]
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]

        sim_scores = list(enumerate(cosine_sim_hybrid[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
        movie_indices = [i[0] for i in sim_scores]
        return df.loc[movie_indices, ['title', 'genre', 'stars']].reset_index(drop=True)

    return get_hybrid_recommendations

contoh list rekomendasi film top 10

In [54]:
get_hybrid = hybrid_recommender(df)
get_hybrid("The Crown", top_n=10)

Unnamed: 0,title,genre,stars
0,The Crown,"Biography, Drama, History","Claire Foy, Olivia Colman, Imelda Staunton, Ma..."
1,Victoria & Abdul,"Biography, Drama, History","Stephen Frears, |, Stars:, Judi Dench, Ali Faz..."
2,Medici,"Biography, Drama, History","Daniel Sharman, Alessandra Mastronardi, Synnov..."
3,Versailles,"Biography, Drama, History","George Blagden, Alexander Vlahos, Tygh Runyan,..."
4,The Most Hated Woman in America,"Biography, Drama, History","Tommy O'Haver, |, Stars:, Melissa Leo, Brandon..."
5,Answer for Heaven,Drama,
6,Black Heart,Drama,
7,Munich: The Edge of War,"Biography, Drama, History","Christian Schwochow, |, Stars:, George MacKay,..."
8,Stavisky,"Biography, Crime, Drama","Alain Resnais, |, Stars:, Jean-Paul Belmondo, ..."
9,The Tudors,"Drama, History, Romance","Jonathan Rhys Meyers, Henry Cavill, Anthony Br..."


# Preparation & Preprocessing Data 2

Setalah dilakukan modelling, selanjutnya dilakukan evaluasi model dengan cara mengekstrak dataset baru berisi rating user. Dataset diperoleh melalui Kaggle. Dataset baru akan disesuaikan dengan dataset lama agar dapat dimerge dan dianalisis

In [55]:
!kaggle datasets download -d rishitjavia/netflix-movie-rating-dataset

Dataset URL: https://www.kaggle.com/datasets/rishitjavia/netflix-movie-rating-dataset
License(s): CC0-1.0
Downloading netflix-movie-rating-dataset.zip to /content
  0% 0.00/74.7M [00:00<?, ?B/s]
100% 74.7M/74.7M [00:00<00:00, 1.05GB/s]


In [56]:
# ekstrak data
!unzip netflix-movie-rating-dataset.zip

Archive:  netflix-movie-rating-dataset.zip
  inflating: Netflix_Dataset_Movie.csv  
  inflating: Netflix_Dataset_Rating.csv  


Telah diekstrak 2 dataset dari Kaggle, yang selanjutnya akan dimerge untuk memudahkan evaluasi.

In [57]:
# identify df and merge right.

movie_df = pd.read_csv('Netflix_Dataset_Movie.csv')
user_df = pd.read_csv('Netflix_Dataset_Rating.csv')

merged_df = pd.merge(movie_df, user_df, on='Movie_ID', how='right')
merged_df

Unnamed: 0,Movie_ID,Year,Name,User_ID,Rating
0,3,1997,Character,712664,5
1,3,1997,Character,1331154,4
2,3,1997,Character,2632461,3
3,3,1997,Character,44937,5
4,3,1997,Character,656399,4
...,...,...,...,...,...
17337453,4496,1993,Farewell My Concubine,520675,3
17337454,4496,1993,Farewell My Concubine,1055714,5
17337455,4496,1993,Farewell My Concubine,2643029,4
17337456,4496,1993,Farewell My Concubine,1559566,3


mengubah nama kolom pada dataframe baru merge_df agar dapat dimerge dengan dataframe lama df.

In [58]:
# rename column and put in to new df
df2 = merged_df[['Name', 'User_ID', 'Rating']].rename(columns={'Name': 'title', 'Rating': 'rating_user'})
df2.head()

Unnamed: 0,title,User_ID,rating_user
0,Character,712664,5
1,Character,1331154,4
2,Character,2632461,3
3,Character,44937,5
4,Character,656399,4


tampak bahwa rating_user memiliki skala 1-5. Kita ubah formatnya menjadi 1-10 agar dapat sesuai dengan format rating pada df.

In [59]:
df2['rating_user'] = df2['rating_user'] * 2

membuat dataframe baru yang menggabungkan dataframe awal df dengan dataset baru df2

In [60]:
# Ambil hanya kolom 'title' dari df (metadata film)
df_metadata = df[['title']].drop_duplicates()

# Merge hanya film yang ada di df_metadata (df)
merged_df_final = pd.merge(df2, df_metadata, on='title', how='inner')

merged_df_final.head()

Unnamed: 0,title,User_ID,rating_user
0,The Killing,712664,10
1,The Killing,1990901,10
2,The Killing,306466,8
3,The Killing,477388,6
4,The Killing,765331,10


In [61]:
# count unique tittle in merged_df_final
print(merged_df_final['title'].nunique())

64


terdapat 64 film dari data final

# Evaluation

Pada tahap evaluasi, dilakukan evaluasi terhadap kinerja kedua model yang dibuat dan akan dikomparasikan kinerjanya satu sama lain.

## Content-Based Recommender

Pada tahap ini dilakukan evaluasi terhadap kinerja model content-based recommender (menggunakan 1 film yang disukai user sebagai seed untuk rekomendasi). Teknik evaluasi yang digunakan yaitu dengan average precision dan average recall.

In [69]:
def evaluate_content_based_recommender(merged_df_final, get_recommendations_fn, top_n=10, n_users=50):
    users = merged_df_final['User_ID'].unique()[:n_users]
    precision_list, recall_list = [], []

    for user in users:
        user_data = merged_df_final[merged_df_final['User_ID'] == user]
        liked_movies = user_data[user_data['rating_user'] >= 8]['title'].str.lower().tolist()

        if not liked_movies:
            continue

        seed_title = liked_movies[0]  # gunakan satu film sebagai seed
        try:
            recs = get_recommendations_fn(seed_title)
            if isinstance(recs, str):
                continue
            recommended = set(recs['title'].str.lower().tolist()[:top_n])
        except:
            continue

        relevant = set(liked_movies)
        true_positive = recommended & relevant

        precision = len(true_positive) / len(recommended) if recommended else 0
        recall = len(true_positive) / len(relevant) if relevant else 0

        precision_list.append(precision)
        recall_list.append(recall)

    avg_precision = sum(precision_list) / len(precision_list) if precision_list else 0
    avg_recall = sum(recall_list) / len(recall_list) if recall_list else 0

    print(f"Content-Based Evaluation")
    print(f"Average Precision@{top_n}: {avg_precision:.4f}")
    print(f"Average Recall@{top_n}: {avg_recall:.4f}")

In [70]:
evaluate_content_based_recommender(merged_df_final, get_recommendations, top_n=10)

Content-Based Evaluation
Average Precision@10: 0.0000
Average Recall@10: 0.0000


## Hybrid recommender

Pada tahap ini akan dilakukan evaluasi terhadap model Hybrid recommender. Teknik evaluasi yang digunakan yaitu dengan average precision dan average recall.

In [66]:
def evaluate_hybrid_recommender(merged_df_final, get_hybrid_recommendations_fn, top_n=10, n_users=50):
    users = merged_df_final['User_ID'].unique()[:n_users]
    precision_list, recall_list = [], []

    for user in users:
        user_data = merged_df_final[merged_df_final['User_ID'] == user]
        liked_movies = user_data[user_data['rating_user'] >= 8]['title'].str.lower().tolist()

        if not liked_movies:
            continue

        recommended = set()
        for title in liked_movies:
            try:
                recs = get_hybrid_recommendations_fn(title)
                if isinstance(recs, str):
                    continue
                recs = recs['title'].str.lower().tolist()
                recommended.update(recs[:top_n])
            except:
                continue

        relevant = set(liked_movies)
        true_positive = relevant & recommended

        precision = len(true_positive) / len(recommended) if recommended else 0
        recall = len(true_positive) / len(relevant) if relevant else 0

        precision_list.append(precision)
        recall_list.append(recall)

    avg_precision = sum(precision_list) / len(precision_list) if precision_list else 0
    avg_recall = sum(recall_list) / len(recall_list) if recall_list else 0

    print(f"Average Precision@{top_n}: {avg_precision:.4f}")
    print(f"Average Recall@{top_n}: {avg_recall:.4f}")

In [68]:
evaluate_hybrid_recommender(merged_df_final=merged_df_final, get_hybrid_recommendations_fn=get_hybrid_recommendations, top_n=10, n_users=100)

Average Precision@10: 0.0071
Average Recall@10: 0.0533


Di atas adalah hasil evaluasi model yang dilakukan. Model memiliki skor evaluasi yang rendah, dainggap penyebabnya karena data yang digunakan sedikit  (64 data film final) sehingga akurasinya kurang.