# 1. Load Sparse_matrix


In [None]:
from google.colab import drive
import os

# Make Google Drive available inside the Colab runtime.
drive.mount('/content/drive')

# Folder on Drive that holds the dataset files; list it as a quick sanity check.
dataset_path = "/content/drive/MyDrive/Netflix_Dataset/"
dataset_files = os.listdir(dataset_path)
print("📂 Files in dataset directory:", dataset_files)


Mounted at /content/drive
📂 Files in dataset directory: ['README', 'combined_data_3.txt', 'combined_data_4.txt', 'probe.txt', 'qualifying.txt', 'combined_data_1.txt', 'movie_titles.csv', 'combined_data_2.txt', 'combined_data_1_fixed.csv', 'movies_data_fixed.csv', 'sparse_matrix.npz']


In [None]:
from scipy.sparse import load_npz

# Read the pre-built sparse rating matrix back from Google Drive.
sparse_matrix_path = "/content/drive/MyDrive/Netflix_Dataset/sparse_matrix.npz"
rating_matrix_sparse = load_npz(sparse_matrix_path)

print("Sparse matrix successfully loaded!")
# The shape is reported as (num_users, num_movies).
matrix_shape = rating_matrix_sparse.shape
print(f"Matrix shape: {matrix_shape}")

Sparse matrix successfully loaded!
Matrix shape: (470758, 4499)


# 2. Initialize User Matrix (U) & Movie Matrix (V)

✔️ U = User Preferences Matrix (Users × Latent Features)

✔️ V = Movie Features Matrix (Movies × Latent Features)

During training, U and V are optimized to capture hidden patterns in the user-movie interactions.

Since there is no prior information at the beginning, we initialize them with random values.

In [None]:
import numpy as np

# Size of the latent feature space shared by users and movies (tunable).
latent_dim = 50

# Rows of the rating matrix are users, columns are movies.
num_users, num_movies = rating_matrix_sparse.shape

# Start both factor matrices from small Gaussian noise (mean 0, std 0.1),
# one row of `latent_dim` features per user / per movie.
U = np.random.normal(loc=0, scale=0.1, size=(num_users, latent_dim))
V = np.random.normal(loc=0, scale=0.1, size=(num_movies, latent_dim))

print(f"✅ Initialized User-Movie Matrices: U={U.shape}, V={V.shape}")

✅ Initialized User-Movie Matrices: U=(470758, 50), V=(4499, 50)


# 3. SGLD Optimization Function (Using Vectorized Operations)
A straightforward SGD (Stochastic Gradient Descent) implementation updates one rating at a time in a Python loop, which makes it slow. To improve efficiency, this function uses vectorized NumPy operations so that a whole mini-batch of ratings is processed in each step.

In [None]:
def sgld_update(U, V, rating_matrix_sparse, learning_rate=0.01, noise_scale=0.1, batch_size=5000, epoch=1):
    """
    Run one pass of noisy, clipped mini-batch gradient updates over all ratings.

    Every stored non-zero entry of ``rating_matrix_sparse`` is visited exactly
    once, in random order, in mini-batches of ``batch_size``. For each batch the
    squared-error gradient with respect to the involved rows of ``U`` and ``V``
    is computed, Gaussian noise is added, the result is clipped element-wise to
    [-0.1, 0.1] and subtracted (scaled by the decayed learning rate).

    Args:
        U: 2-D float array of user factors, indexed by the matrix's row index.
            Modified in place.
        V: 2-D float array of movie factors, indexed by the matrix's column
            index. Modified in place.
        rating_matrix_sparse: SciPy sparse matrix of ratings (users x movies).
            Entries equal to zero, stored or not, are treated as unobserved.
        learning_rate: Base step size; the step actually used is
            ``learning_rate / (1 + 0.01 * epoch)``.
        noise_scale: Multiplier applied to standard-normal noise that is added
            to every gradient entry before clipping.
        batch_size: Number of ratings per mini-batch; must be at least 1.
        epoch: Epoch index used only for the learning-rate decay.

    Returns:
        The tuple ``(U, V)``; these are the same array objects that were
        passed in.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    adjusted_lr = learning_rate / (1 + 0.01 * epoch)  # Learning rate decay

    # Take rows, columns and values from a single COO view and filter them with
    # one mask, so the three arrays are guaranteed to stay aligned even if the
    # matrix stores explicit zeros (nonzero() drops those, .data does not).
    coo = rating_matrix_sparse.tocoo()
    observed = coo.data != 0
    user_indices = coo.row[observed]
    movie_indices = coo.col[observed]
    ratings = coo.data[observed]
    num_samples = ratings.shape[0]

    # Visit the ratings in a fresh random order on every call.
    shuffled_indices = np.random.permutation(num_samples)

    for start in range(0, num_samples, batch_size):
        batch_indices = shuffled_indices[start : start + batch_size]

        user_batch = user_indices[batch_indices]
        movie_batch = movie_indices[batch_indices]
        rating_batch = ratings[batch_indices]

        # Fancy indexing copies, so these snapshots are unaffected by the
        # in-place updates below and both gradients use pre-update factors.
        user_factors = U[user_batch]
        movie_factors = V[movie_batch]

        # Predicted rating = dot product of the user and movie factor rows.
        pred_batch = np.sum(user_factors * movie_factors, axis=1)
        error_batch = rating_batch - pred_batch

        # NOTE(review): the noise is injected into the gradient, so it is
        # clipped and scaled by the learning rate together with it; confirm
        # this matches the intended SGLD formulation.
        grad_U = (-error_batch[:, np.newaxis] * movie_factors) + noise_scale * np.random.normal(size=user_factors.shape)
        grad_V = (-error_batch[:, np.newaxis] * user_factors) + noise_scale * np.random.normal(size=movie_factors.shape)

        # Element-wise gradient clipping for stable training.
        grad_U = np.clip(grad_U, -0.1, 0.1)
        grad_V = np.clip(grad_V, -0.1, 0.1)

        # A user or movie can occur several times in one batch. `X[idx] -= g`
        # is buffered and would keep only one update per repeated index;
        # ufunc.at applies every contribution.
        np.subtract.at(U, user_batch, adjusted_lr * grad_U)
        np.subtract.at(V, movie_batch, adjusted_lr * grad_V)

    return U, V


The function also works directly on the sparse rating matrix, so the full dense user × movie matrix is never built; this reduces memory usage and makes training on large datasets more efficient.
Mini-batch training is used, so many samples are processed together in each vectorized step, which improves training stability and speed.
Gradient clipping prevents excessively large updates, keeping training stable throughout the learning process.

In [None]:
import time

# Training hyperparameters.
num_epochs = 10
learning_rate = 0.01
noise_scale = 0.1  # Scale of the Gaussian noise added to every gradient entry
batch_size = 5000  # Number of ratings handled per vectorized mini-batch

for epoch in range(num_epochs):
    start_time = time.time()

    # The zero-based epoch index drives the learning-rate decay in sgld_update.
    U, V = sgld_update(
        U,
        V,
        rating_matrix_sparse,
        learning_rate=learning_rate,
        noise_scale=noise_scale,
        batch_size=batch_size,
        epoch=epoch,
    )

    elapsed_time = time.time() - start_time
    print(f"✅ Epoch {epoch+1}/{num_epochs} completed in {elapsed_time:.2f} seconds.")

print("🎉 Optimized SGLD-based Matrix Factorization Training Completed!")


✅ Epoch 1/10 completed in 119.05 seconds.
✅ Epoch 2/10 completed in 119.97 seconds.
✅ Epoch 3/10 completed in 121.34 seconds.
✅ Epoch 4/10 completed in 121.20 seconds.
✅ Epoch 5/10 completed in 121.29 seconds.
✅ Epoch 6/10 completed in 119.46 seconds.
✅ Epoch 7/10 completed in 118.92 seconds.
✅ Epoch 8/10 completed in 119.35 seconds.
✅ Epoch 9/10 completed in 118.71 seconds.
✅ Epoch 10/10 completed in 118.53 seconds.
🎉 Optimized SGLD-based Matrix Factorization Training Completed!


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def evaluate_model(U, V, rating_matrix_sparse, chunk_size=500_000):
    """
    Compute, print and return RMSE and MAE of the factorization on a rating matrix.

    Predictions are the dot products ``U[user] . V[movie]`` for every non-zero
    entry of ``rating_matrix_sparse``. The metrics describe the fit on whatever
    matrix is passed in; evaluating on the matrix used for training yields the
    training error, not a held-out estimate.

    Args:
        U: 2-D array of user factors, indexed by the matrix's row index.
        V: 2-D array of movie factors, indexed by the matrix's column index.
        rating_matrix_sparse: SciPy sparse matrix of ratings (users x movies).
            Entries equal to zero, stored or not, are treated as unobserved.
        chunk_size: Number of ratings scored at a time. Bounds the size of the
            temporary (chunk x latent_dim) arrays instead of materializing one
            row of factors per rating for the whole dataset at once.

    Returns:
        The tuple ``(rmse, mae)`` as Python floats.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 or the matrix has no
            non-zero ratings.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Rows, columns and values come from one COO view filtered by one mask, so
    # they stay aligned even if the matrix stores explicit zeros.
    coo = rating_matrix_sparse.tocoo()
    observed = coo.data != 0
    user_indices = coo.row[observed]
    movie_indices = coo.col[observed]
    true_ratings = coo.data[observed]

    num_ratings = true_ratings.shape[0]
    if num_ratings == 0:
        raise ValueError("rating_matrix_sparse contains no non-zero ratings to evaluate")

    # Accumulate the error sums chunk by chunk to keep peak memory bounded.
    squared_error_sum = 0.0
    absolute_error_sum = 0.0
    for start in range(0, num_ratings, chunk_size):
        stop = start + chunk_size
        # Row-wise dot product without building the element-wise product array.
        predicted = np.einsum(
            "ij,ij->i",
            U[user_indices[start:stop]],
            V[movie_indices[start:stop]],
        )
        residual = true_ratings[start:stop] - predicted
        squared_error_sum += float(np.dot(residual, residual))
        absolute_error_sum += float(np.abs(residual).sum())

    rmse = float(np.sqrt(squared_error_sum / num_ratings))
    mae = absolute_error_sum / num_ratings

    print(f"📊 RMSE: {rmse:.4f}")
    print(f"📊 MAE: {mae:.4f}")

    return rmse, mae

# Report RMSE / MAE of the trained factors on the rating matrix.
evaluate_model(U=U, V=V, rating_matrix_sparse=rating_matrix_sparse)


📊 RMSE: 0.9671
📊 MAE: 0.7289


In [None]:
# Persist the learned factor matrices to Google Drive as .npy files.
for target_path, factor_matrix in (
    ("/content/drive/MyDrive/Netflix_Dataset/U_sgld.npy", U),
    ("/content/drive/MyDrive/Netflix_Dataset/V_sgld.npy", V),
):
    np.save(target_path, factor_matrix)

print("✅ Trained matrices (U, V) saved successfully!")


✅ Trained matrices (U, V) saved successfully!
