# **Import Library**

Kita akan menggunakan library pandas untuk akses dan mengatur data serta sklearn untuk menggunakan metode cosine similiarity

In [31]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

# **Load Dataset**

Dataset yang akan digunakan adalah dataset dari Movielens serta rating untuk film tersebut. Kita akan menggunakan 20000 data saja agar mempermudah komputasi

In [8]:
from google.colab import drive
drive.mount('/content/drive')

movies_path = '/content/drive/MyDrive/movie.csv'
rating_path = '/content/drive/MyDrive/rating.csv'
movies = pd.read_csv(movies_path).head(20000)
rating = pd.read_csv(rating_path).head(20000)

print("Dataset Movies: ")
print(movies.head())
print("\nDataset Rating: ")
print(rating.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset Movies: 
   movieId  \
0        1   
1        2   
2        3   
3        4   
4        5   

                                title  \
0                    Toy Story (1995)   
1                      Jumanji (1995)   
2             Grumpier Old Men (1995)   
3            Waiting to Exhale (1995)   
4  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Dataset Rating: 
   userId  \
0       1   
1       1   
2       1   
3       1   
4       1   

   movieId  \
0        2   
1       29   
2       32   
3       47   
4       50   

   rating  \
0     3.5   
1     3.5   
2     3

# **Data Preparation**

Dikarenakan kita menggunakan collaborative filtering dan memiliki 2 dataset yang berhubungan. Kita perlu untuk menggabungkan kedua dataset tersebut dengan patokan userId, juga akan melakukan data preprocessing seperti mengatasi missing values dan mengubah data menjadi pivot table agar data lebih terorganisir

In [10]:
# Gabungkan kedua dataset
data = pd.merge(rating, movies, on='movieId')

# Membuat Pivot table
data_pivot = data.pivot_table(index='userId', columns='title', values='rating')

# Mengatasi missing values
data_pivot.fillna(0, inplace=True)

print("Dataset setelah melakukan merge : ")
print(data.head())
print("\nDataset setelah melakukan pivot : ")
print(data_pivot.head())

Dataset setelah melakukan merge : 
   userId  \
0       1   
1       1   
2       1   
3       1   
4       1   

   movieId  \
0        2   
1       29   
2       32   
3       47   
4       50   

   rating  \
0     3.5   
1     3.5   
2     3.5   
3     3.5   
4     3.5   

             timestamp  \
0  2005-04-02 23:53:47   
1  2005-04-02 23:31:16   
2  2005-04-02 23:33:39   
3  2005-04-02 23:32:07   
4  2005-04-02 23:29:40   

                                               title  \
0                                     Jumanji (1995)   
1  City of Lost Children, The (Cité des enfants p...   
2          Twelve Monkeys (a.k.a. 12 Monkeys) (1995)   
3                        Seven (a.k.a. Se7en) (1995)   
4                         Usual Suspects, The (1995)   

                                   genres  
0              Adventure|Children|Fantasy  
1  Adventure|Drama|Fantasy|Mystery|Sci-Fi  
2                 Mystery|Sci-Fi|Thriller  
3                        Mystery|Thriller  
4       

# **Mencari Kesamaan Antar User**

Kali ini kita akan menggunakan metode cosine similarity untuk kemiripan rating film antar para user

In [50]:
data_similarity = cosine_similarity(data_pivot.T)

# Membuat dataframe setelah melakukan cosine similarity
data_similarity_df = pd.DataFrame(data_similarity, index=data_pivot.columns, columns=data_pivot.columns)

print(data_similarity_df.head())

title                           'burbs, The (1989)  \
title                                                
'burbs, The (1989)                             1.0   
'night Mother (1986)                           0.0   
(500) Days of Summer (2009)                    0.0   
*batteries not included (1987)                 0.0   
...And Justice for All (1979)                  1.0   

title                           'night Mother (1986)  \
title                                                  
'burbs, The (1989)                               0.0   
'night Mother (1986)                             1.0   
(500) Days of Summer (2009)                      0.0   
*batteries not included (1987)                   0.0   
...And Justice for All (1979)                    0.0   

title                           (500) Days of Summer (2009)  \
title                                                         
'burbs, The (1989)                                      0.0   
'night Mother (1986)                   

# **Mencoba Sistem Rekomendasi**

Mencoba tes fungsi rekomendasi dan evaluasi menggunakan MSE

In [56]:
# Fungsi untuk memberikan rekomendasi
def recommend_movies(user_id, data_pivot, data_similarity_df, num_recommendations=5):
    # Mendapatkan film yang sudah ditonton oleh pengguna
    user_ratings = data_pivot.loc[user_id]
    watched_movies = user_ratings[user_ratings > 0].index.tolist()

    # Membuat dictionary untuk menyimpan skor rekomendasi
    recommendations = {}

    # Menghitung skor rekomendasi untuk setiap film yang belum ditonton
    for movie in data_pivot.columns:
        if movie not in watched_movies:
            sim_scores = data_similarity_df[movie]
            score = sum(sim_scores[movie_watched] * user_ratings[movie_watched] for movie_watched in watched_movies)
            recommendations[movie] = score

    # Mengurutkan rekomendasi berdasarkan skor
    recommended_movies = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)[:num_recommendations]

    return [movie[0] for movie in recommended_movies]

# Fungsi untuk evaluasi
def evaluate_model(data_pivot, data_similarity_df):
    mse_list = []

    for user_id in data_pivot.index:
        user_ratings = data_pivot.loc[user_id]
        watched_movies = user_ratings[user_ratings > 0].index.tolist()

        for movie in watched_movies:
            # Menghitung estimasi rating
            sim_scores = data_similarity_df[movie]
            estimated_rating = sum(sim_scores[movie_watched] * user_ratings[movie_watched] for movie_watched in watched_movies) / sum(sim_scores[movie_watched] for movie_watched in watched_movies)
            actual_rating = user_ratings[movie]

            # Menghitung MSE
            mse_list.append((actual_rating, estimated_rating))  # Simpan tuple (actual_rating, estimated_rating)

    # Menghitung MSE
    actual_ratings, estimated_ratings = zip(*mse_list)  # Pisahkan actual dan estimated
    return mean_squared_error(actual_ratings, estimated_ratings)

# Contoh penggunaan
user_id = 2  # Ganti dengan userId yang diinginkan
recommended_movies = recommend_movies(user_id, data_pivot, data_similarity_df)
print(f"Rekomendasi film untuk pengguna {user_id}: {recommended_movies}")

# Evaluasi model
mse = evaluate_model(data_pivot, data_similarity_df)
print(f"MSE dari model: {mse}")


Rekomendasi film untuk pengguna 2: ['Aliens (1986)', 'Total Recall (1990)', 'Terminator, The (1984)', 'Fifth Element, The (1997)', 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)']
MSE dari model: 0.8796689607364603
