<a href="https://colab.research.google.com/github/femketenharkel/Predicting_Ratings/blob/main/models/SVD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Singular Value Decomposition

In [1]:
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.metrics import mean_squared_error

In [2]:
drive.mount('/content/drive')
df = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_final_2.csv")

Mounted at /content/drive


In [None]:
df.shape

(400083, 71)

In [None]:
df.head()

Unnamed: 0,UserID,MovieID,Rating,Age,Year,Month,Day,Hour,Release_year,Time_release_to_rating,...,Favourite_Fantasy,Favourite_Film-Noir,Favourite_Horror,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western
0,5616,3590,3,4,2000,5,24,2,1974,26,...,False,False,False,False,False,False,False,False,False,True
1,4060,21,4,2,2000,8,5,15,1995,5,...,False,False,False,False,False,False,False,False,False,True
2,1125,3273,2,1,2000,11,22,16,2000,0,...,False,True,False,False,False,False,False,False,False,False
3,3410,585,4,3,2000,8,27,22,1995,5,...,False,True,False,False,False,False,False,False,False,False
4,3675,1374,4,3,2000,8,15,18,1982,18,...,False,True,False,False,False,False,False,False,False,False


In [None]:
# Define unique users and unique movies (items)
n_users = df['UserID'].nunique()
n_items = df['MovieID'].nunique()

In [None]:
# Create a user-item matrix
user_item_matrix = df.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0).to_numpy()

In [None]:
# Initialize parameters
n_factors = 50
learning_rate = 0.01
regularization = 0.1
n_epochs = 20

# Initialize user and item matrices
U = np.random.normal(scale=1./n_factors, size=(n_users, n_factors))
V = np.random.normal(scale=1./n_factors, size=(n_items, n_factors))

# Indicator matrix for existing ratings
indicator = (user_item_matrix > 0).astype(int)

In [None]:
# Training with SGD
for epoch in range(n_epochs):
    for i in range(n_users):
        for j in range(n_items):
            if indicator[i, j] > 0:
                # Compute the prediction error
                prediction = np.dot(U[i, :], V[j, :])
                error = user_item_matrix[i, j] - prediction

                # Update user and item latent factors
                U[i, :] += learning_rate * (error * V[j, :] - regularization * U[i, :])
                V[j, :] += learning_rate * (error * U[i, :] - regularization * V[j, :])

    # Compute the total loss (optional)
    total_loss = np.sum((indicator * (user_item_matrix - np.dot(U, V.T)))**2) + regularization * (np.sum(U**2) + np.sum(V**2))
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {total_loss}')

# Predict the missing ratings
predicted_ratings = np.dot(U, V.T)

# Convert the predicted ratings to a DataFrame
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=df['MovieID'].unique(), index=df['UserID'].unique())


Epoch 1/20, Loss: 2131609.2773460713
Epoch 2/20, Loss: 496186.8147500284
Epoch 3/20, Loss: 386153.6296976194
Epoch 4/20, Loss: 360646.85109382705
Epoch 5/20, Loss: 350113.93376799417
Epoch 6/20, Loss: 344567.82855884376
Epoch 7/20, Loss: 340996.27020005183
Epoch 8/20, Loss: 338200.7091848762
Epoch 9/20, Loss: 335606.9941673594
Epoch 10/20, Loss: 332922.7038099146
Epoch 11/20, Loss: 330038.9616740854
Epoch 12/20, Loss: 326991.23593600426
Epoch 13/20, Loss: 323901.12984825065
Epoch 14/20, Loss: 320896.5082330492
Epoch 15/20, Loss: 318055.0497960004
Epoch 16/20, Loss: 315396.027140114
Epoch 17/20, Loss: 312901.6452050377
Epoch 18/20, Loss: 310540.4998801535
Epoch 19/20, Loss: 308281.65751766774
Epoch 20/20, Loss: 306100.214101143


In [None]:
# Example: Recommend 10 movies for user with UserID 1
user_id = 824
user_ratings = predicted_ratings_df.loc[user_id].sort_values(ascending=False)
print(user_ratings.head(10))

2397    5.072514
1000    4.976486
242     4.973757
517     4.965591
1234    4.896933
3182    4.877692
1523    4.875323
1483    4.875169
1050    4.846691
2645    4.826983
Name: 824, dtype: float64


In [None]:
# Function to calculate RMSE
def calculate_rmse(actual, predicted, indicator):
    # Only consider the actual ratings (where indicator is 1)
    error = indicator * (actual - predicted)
    mse = np.sum(error**2) / np.sum(indicator)
    rmse = np.sqrt(mse)
    return rmse

# Actual ratings matrix (user-item matrix)
actual_ratings = user_item_matrix

# Predicted ratings matrix
predicted_ratings = np.dot(U, V.T)

# Indicator matrix for existing ratings
indicator = (actual_ratings > 0).astype(int)

# Calculate RMSE
rmse = calculate_rmse(actual_ratings, predicted_ratings, indicator)
print(f'RMSE: {rmse}')

RMSE: 0.8699543750393


# Work in progress: tuned svd

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

# Load the dataset
# Assuming df is your dataframe with columns: UserID, MovieID, Rating
n_users = df['UserID'].nunique()
n_items = df['MovieID'].nunique()

# Create a user-item matrix
user_item_matrix = df.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0).to_numpy()

# Split the data into training and test sets
train_indices, test_indices = train_test_split(np.arange(user_item_matrix.shape[0]), test_size=0.2, random_state=42)
train_data = user_item_matrix[train_indices]
test_data = user_item_matrix[test_indices]

# Hyperparameters to optimize
n_factors_list = [10, 20, 50]
learning_rate_list = [0.001, 0.01, 0.1]
regularization_list = [0.01, 0.1, 1]

# K-Fold Cross Validation
outer_kf = KFold(n_splits=5, shuffle=True, random_state=42)
inner_kf = KFold(n_splits=3, shuffle=True, random_state=42)

best_params = None
best_rmse = float('inf')

for train_index, val_index in outer_kf.split(train_data):
    outer_train_data = train_data[train_index]
    outer_val_data = train_data[val_index]

    for n_factors in n_factors_list:
        for learning_rate in learning_rate_list:
            for regularization in regularization_list:
                inner_rmse = []

                for inner_train_index, inner_val_index in inner_kf.split(outer_train_data):
                    inner_train_data = outer_train_data[inner_train_index]
                    inner_val_data = outer_train_data[inner_val_index]

                    # Initialize user and item matrices
                    U = np.random.normal(scale=1./n_factors, size=(inner_train_data.shape[0], n_factors))
                    V = np.random.normal(scale=1./n_factors, size=(inner_train_data.shape[1], n_factors))

                    # Indicator matrix for existing ratings
                    indicator_train = (inner_train_data > 0).astype(int)
                    indicator_val = (inner_val_data > 0).astype(int)

                    # Training with SGD
                    for epoch in range(20):
                        for i in range(inner_train_data.shape[0]):
                            for j in range(inner_train_data.shape[1]):
                                if indicator_train[i, j] > 0:
                                    # Compute the prediction error
                                    prediction = np.dot(U[i, :], V[j, :])
                                    error = inner_train_data[i, j] - prediction

                                    # Update user and item latent factors
                                    U[i, :] += learning_rate * (error * V[j, :] - regularization * U[i, :])
                                    V[j, :] += learning_rate * (error * U[i, :] - regularization * V[j, :])

                    # Predict the validation ratings
                    predicted_val_ratings = np.dot(U, V.T)

                    # Calculate RMSE for validation set
                    val_rmse = np.sqrt(mean_squared_error(inner_val_data[indicator_val > 0], predicted_val_ratings[indicator_val > 0]))
                    inner_rmse.append(val_rmse)

                avg_inner_rmse = np.mean(inner_rmse)

                if avg_inner_rmse < best_rmse:
                    best_rmse = avg_inner_rmse
                    best_params = (n_factors, learning_rate, regularization)

# Train the final model with the best hyperparameters on the entire training set
n_factors, learning_rate, regularization = best_params

U_final = np.random.normal(scale=1./n_factors, size=(train_data.shape[0], n_factors))
V_final = np.random.normal(scale=1./n_factors, size=(train_data.shape[1], n_factors))

indicator_train_final = (train_data > 0).astype(int)

for epoch in range(20):
    for i in range(train_data.shape[0]):
        for j in range(train_data.shape[1]):
            if indicator_train_final[i, j] > 0:
                prediction = np.dot(U_final[i, :], V_final[j, :])
                error = train_data[i, j] - prediction

                U_final[i, :] += learning_rate * (error * V_final[j, :] - regularization * U_final[i, :])
                V_final[j, :] += learning_rate * (error * U_final[i, :] - regularization * V_final[j, :])

# Predict the test ratings
predicted_test_ratings = np.dot(U_final, V_final.T)

# Calculate RMSE for test set
indicator_test = (test_data > 0).astype(int)
test_rmse = np.sqrt(mean_squared_error(test_data[indicator_test > 0], predicted_test_ratings[indicator_test > 0]))

print(f'Best Hyperparameters: n_factors={n_factors}, learning_rate={learning_rate}, regularization={regularization}')
print(f'Test RMSE: {test_rmse}')

IndexError: boolean index did not match indexed array along dimension 0; dimension is 2576 but corresponding boolean dimension is 1289