# 1. Recall the merged data (Combine1 + Title)

## 1-1) Recall: final_ratings.csv

In [None]:
from google.colab import drive

# Attach Google Drive at the Colab mount point used by the file paths below.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
# Read the merged ratings table (one row per user/movie rating) from Drive.
RATINGS_CSV_PATH = "/content/drive/MyDrive/Matrix/final_ratings.csv"
df_ratings = pd.read_csv(RATINGS_CSV_PATH)

# Preview the first rows (last expression of the cell, so it is displayed).
df_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Date,Year,Title
0,1488844,1,3,2005-09-06,2003.0,Dinosaur Planet
1,822109,1,5,2005-05-13,2003.0,Dinosaur Planet
2,885013,1,4,2005-10-19,2003.0,Dinosaur Planet
3,30878,1,4,2005-12-26,2003.0,Dinosaur Planet
4,823519,1,3,2004-05-03,2003.0,Dinosaur Planet


## 1-2) Convert UserID and MovieID to categorical indices

In [None]:
# Distinct raw identifiers, in order of first appearance in the ratings table.
user_ids = df_ratings["UserID"].unique()
movie_ids = df_ratings["MovieID"].unique()

# Lookup tables: raw identifier -> contiguous index 0..n-1
# (position of the identifier in user_ids / movie_ids).
user_id_map = dict(zip(user_ids, range(len(user_ids))))
movie_id_map = dict(zip(movie_ids, range(len(movie_ids))))

# Overwrite the ID columns with their contiguous indices so they can be used
# directly as row numbers into the factor matrices.
df_ratings["UserID"] = df_ratings["UserID"].map(user_id_map)
df_ratings["MovieID"] = df_ratings["MovieID"].map(movie_id_map)

# Show the re-encoded table
print(df_ratings.head())

   UserID  MovieID  Rating        Date    Year            Title
0       0        0       3  2005-09-06  2003.0  Dinosaur Planet
1       1        0       5  2005-05-13  2003.0  Dinosaur Planet
2       2        0       4  2005-10-19  2003.0  Dinosaur Planet
3       3        0       4  2005-12-26  2003.0  Dinosaur Planet
4       4        0       3  2004-05-03  2003.0  Dinosaur Planet


# 2. Building the Basic Structure for Matrix Factorization
Since we need to learn the user matrix (U) and movie matrix (V), we initialize them randomly.

In [None]:
# Toy example: small random factor matrices, just to illustrate the shapes of U and V.
# numpy is imported here because this is the first cell that uses it; without the
# import the cell raises NameError on a fresh kernel (Restart & Run All).
import numpy as np

num_users = 3
num_movies = 4
latent_dim = 3

# One row per user / movie, one column per latent factor; entries drawn from N(0, 0.1^2).
U = np.random.normal(0, 0.1, (num_users, latent_dim))
V = np.random.normal(0, 0.1, (num_movies, latent_dim))

print("User Matrix (U):")
print(U)
print("\nMovie Matrix (V):")
print(V)


User Matrix (U):
[[-0.07931646 -0.01192192 -0.26367775]
 [-0.03790479  0.02391378  0.11841321]
 [-0.02545465  0.23164051  0.01029526]]

Movie Matrix (V):
[[ 0.22862196  0.00973138 -0.03312007]
 [ 0.2597886  -0.01966067 -0.08827588]
 [ 0.06636473 -0.0896412  -0.0649935 ]
 [ 0.03830622  0.22999564  0.00287945]]


In [None]:
import numpy as np

# Problem dimensions: one factor row per distinct user / movie found in the ratings.
latent_dim = 50  # Number of latent factors
num_users, num_movies = len(user_ids), len(movie_ids)

# Random initialisation from N(0, 0.1^2). U is drawn before V.
U = np.random.normal(loc=0, scale=0.1, size=(num_users, latent_dim))  # user factor matrix
V = np.random.normal(loc=0, scale=0.1, size=(num_movies, latent_dim))  # movie factor matrix

Interpretation of Latent Factors in Matrix Factorization
I have a question regarding how to interpret latent factors in the user matrix (U) when performing matrix factorization.

For example, considering User 0's vector in the User Matrix (U):
[-0.11354342, -0.09060723, 0.08238567]

The first latent factor (-0.113) seems to indicate that User 0 dislikes this factor.

However, the third latent factor (0.082) is positive, but I am unsure whether it means that User 0 likes it or if it is too small to be meaningful.

My Questions:
Does a positive latent factor value always indicate that the user prefers that factor?
Is there a standard threshold or rule to determine whether a user likes or dislikes a specific latent factor?
When analyzing latent factors, should we interpret them independently, or should we always compare them with movie vectors (V) before making conclusions?

# 3. SGLD (Stochastic Gradient Langevin Dynamics)
SGLD adds stochastic Gaussian noise to each SGD (Stochastic Gradient Descent) update. The goal of injecting this noise is to train the model with Differential Privacy.

In [None]:
import numpy as np

def sgld_update(U, V, df_ratings, learning_rate=0.01, noise_scale=0.1, batch_size=500, epoch=1):
    """Run one shuffled pass of noisy, clipped per-rating gradient updates.

    For every rating row, the prediction is the dot product of the user's row
    of ``U`` and the movie's row of ``V``. Gaussian noise scaled by
    ``noise_scale`` is added to the squared-error gradient, the result is
    clipped element-wise to [-0.1, 0.1], and both rows are moved against it.

    Args:
        U: 2-D float array of user factors, indexed by the ``UserID`` column.
        V: 2-D float array of movie factors, indexed by the ``MovieID`` column.
        df_ratings: DataFrame with ``UserID``, ``MovieID`` (integer-valued row
            indices into ``U`` / ``V``) and ``Rating`` columns. Not modified.
        learning_rate: Base step size before decay.
        noise_scale: Standard deviation of the Gaussian noise added to each
            gradient component.
        batch_size: Number of shuffled rows walked per outer-loop chunk.
            Updates are still applied one rating at a time.
        epoch: Epoch number used for the decay ``learning_rate / (1 + 0.01 * epoch)``.

    Returns:
        ``(U, V)`` - the same array objects that were passed in, updated in place.
    """
    # Learning-rate decay: the step shrinks as the epoch number grows.
    adjusted_lr = learning_rate / (1 + 0.01 * epoch)

    # Visit the ratings in random order.
    shuffled = df_ratings.sample(frac=1)

    # Extract the three columns once as typed arrays. Iterating with
    # DataFrame.iterrows() is very slow, and for an all-numeric frame it upcasts
    # each row to float, which makes U[user_idx] raise IndexError.
    user_indices = shuffled["UserID"].to_numpy(dtype=np.int64)
    movie_indices = shuffled["MovieID"].to_numpy(dtype=np.int64)
    ratings = shuffled["Rating"].to_numpy(dtype=np.float64)

    for start in range(0, len(ratings), batch_size):
        stop = start + batch_size
        chunk = zip(user_indices[start:stop], movie_indices[start:stop], ratings[start:stop])

        for user_idx, movie_idx, rating in chunk:  # Calculate error and prediction
            # Compute predicted rating
            pred = np.dot(U[user_idx], V[movie_idx])
            error = rating - pred

            # Noisy gradients. Both are computed from the pre-update rows.
            # NOTE(review): the noise is added *before* clipping, so the Gaussian
            # noise itself is truncated to [-0.1, 0.1] - confirm this is intended.
            grad_U = -error * V[movie_idx] + noise_scale * np.random.normal(size=U[user_idx].shape)
            grad_V = -error * U[user_idx] + noise_scale * np.random.normal(size=V[movie_idx].shape)

            # Gradient clipping (prevents excessively large updates)
            grad_U = np.clip(grad_U, -0.1, 0.1)
            grad_V = np.clip(grad_V, -0.1, 0.1)

            # Apply the update in place
            U[user_idx] -= adjusted_lr * grad_U
            V[movie_idx] -= adjusted_lr * grad_V

    return U, V


**Learning Rate Decay**: Gradually decreases the learning rate as the epoch progresses for stable training.

**Mini-batch Training**: Processes 500 data points at a time for efficient and faster training.

**Prediction Calculation**: Computes predicted ratings using the dot product of user vectors (U) and movie vectors (V).

**Noise Addition (SGLD Implementation)**:	Adds Gaussian noise to ensure Differential Privacy.

**Gradient Clipping**: Prevents excessively large updates to maintain stable learning.

**SGLD Update Application**:	Updates the user matrix (U) and movie matrix (V) based on the learning rate.

# 4. Training Loop (Differential Privacy)

In [None]:
# Training parameters
num_epochs = 10
learning_rate = 0.01
noise_scale = 0.1  # Adjust this value for stronger DP

# Training loop
for epoch in range(num_epochs):
    # Pass the 1-based epoch number so sgld_update's learning-rate decay
    # (learning_rate / (1 + 0.01 * epoch)) actually progresses; without it every
    # epoch runs with the default epoch=1 and the rate never decreases.
    U, V = sgld_update(U, V, df_ratings, learning_rate, noise_scale, epoch=epoch + 1)
    print(f"Epoch {epoch+1}/{num_epochs} completed.")

print("SGLD-based Matrix Factorization Training Completed!")

Epoch 1/10 completed.
Epoch 2/10 completed.
Epoch 3/10 completed.


# 5. Recommendation for a given user

In [None]:
def recommend_movies(user_id, U, V, df_ratings, top_n=10):
    """Recommend the top N not-yet-rated movies for a given user.

    Args:
        user_id: Raw user identifier; translated to a row of ``U`` through the
            module-level ``user_id_map``.
        U: 2-D array of user factors.
        V: 2-D array of movie factors; row ``i`` belongs to movie index ``i``.
        df_ratings: Ratings table whose ``UserID`` / ``MovieID`` columns hold
            the index-encoded identifiers, plus a ``Title`` column.
        top_n: Maximum number of movies to return; values <= 0 give an empty result.

    Returns:
        DataFrame with columns ``MovieID`` (as stored in ``df_ratings``) and
        ``Title``, one row per recommended movie, ordered from highest to
        lowest predicted rating.

    Raises:
        KeyError: If ``user_id`` is not in ``user_id_map``.
    """
    user_idx = user_id_map[user_id]  # raw UserID -> row index of U
    predicted_ratings = np.dot(U[user_idx], V.T)  # predicted rating for every movie
    num_movies = V.shape[0]

    # Movies this user has already rated (index-encoded).
    watched_movies = df_ratings.loc[df_ratings["UserID"] == user_idx, "MovieID"].unique()
    watched_movies = watched_movies.astype(np.int64)
    watched_movies = watched_movies[(watched_movies >= 0) & (watched_movies < num_movies)]

    # Boolean mask instead of a per-movie "not in ndarray" scan (which is quadratic).
    is_unwatched = np.ones(num_movies, dtype=bool)
    is_unwatched[watched_movies] = False
    unwatched_movies = np.flatnonzero(is_unwatched)

    # argsort yields positions *within* the unwatched subset; map them back to
    # real movie indices before using them. Slicing after the reversal also makes
    # top_n=0 return nothing (a trailing [-0:] slice would return everything).
    top_n = max(int(top_n), 0)
    best_positions = np.argsort(predicted_ratings[unwatched_movies])[::-1][:top_n]
    recommended_movie_indices = unwatched_movies[best_positions]

    # df_ratings["MovieID"] is index-encoded, so look titles up by index directly.
    titles = (
        df_ratings.loc[df_ratings["MovieID"].isin(recommended_movie_indices), ["MovieID", "Title"]]
        .drop_duplicates(subset="MovieID")
        .set_index("MovieID")
    )

    # Keep the ranking order; skip indices that have no row (hence no title) in df_ratings.
    has_title = np.isin(recommended_movie_indices, titles.index.to_numpy())
    recommendations = titles.loc[recommended_movie_indices[has_title]].reset_index()

    return recommendations

# Recommend movies for user 785314
TARGET_USER_ID = 785314
recommended_movies = recommend_movies(TARGET_USER_ID, U, V, df_ratings)
print("Top recommended movies:")
print(recommended_movies)
