<a href="https://colab.research.google.com/github/gmrwh92/Recommender-System/blob/main/Matrix_Factorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
## importing libraries
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, metrics, preprocessing
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import torch.optim as optim

In [20]:
## load dataset
data = pd.read_csv('/content/drive/MyDrive/ml-latest-small/ratings.csv')
df= data[['userId','movieId','rating']]
num_users = data.userId.nunique()
num_items = data.movieId.nunique()
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [21]:
##check the number of users and items in movielens dataset
num_users, num_items

(610, 9724)

In [22]:
## encode data into numerical values
le = LabelEncoder()
df['user_ids'] = le.fit_transform(df['userId'])
df['item_ids'] = le.fit_transform(df['movieId'])

In [23]:
## split data into train and test set (20% for test and 80% training)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [24]:
## define Matrix Factorization model using pytorch
class MF(nn.Module):
    def __init__(self, n_users, n_items, n_factors):
        super(MF, self).__init__()
        self.user_embeddings = nn.Embedding(n_users, n_factors)
        self.item_embeddings = nn.Embedding(n_items, n_factors)
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)


    def forward(self, user_ids, item_ids):
        user_embeds = self.user_embeddings(user_ids)
        item_embeds = self.item_embeddings(item_ids)
        user_bias = self.user_bias(user_ids).squeeze()
        item_bias = self.item_bias(item_ids).squeeze()
        dot = torch.sum(user_embeds * item_embeds, dim=1)
        preds = dot + user_bias + item_bias
        return preds

In [25]:
## dataset into torch sensor
class RatingDataset(Dataset):
    def __init__(self, user_ids, item_ids, ratings):
        self.user_ids = torch.tensor(user_ids, dtype=torch.long)
        self.item_ids = torch.tensor(item_ids, dtype=torch.long)
        self.ratings = torch.tensor(ratings, dtype=torch.float)

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.item_ids[idx], self.ratings[idx]

In [26]:
train_data = RatingDataset(user_ids = train_df.user_ids.tolist(), item_ids = train_df.item_ids.tolist(), ratings = train_df.rating.tolist())
test_data = RatingDataset(user_ids = test_df.user_ids.tolist(), item_ids = test_df.item_ids.tolist(), ratings = test_df.rating.tolist())
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
test_loader = DataLoader(test_data, batch_size=128)

In [27]:
## training model
def train_model(model, train_loader, lr, num_epochs):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    for epoch in range(num_epochs):
        total_loss = 0
        for user_ids, item_ids, ratings in train_loader:
            optimizer.zero_grad()
            preds = model(user_ids, item_ids)
            loss = loss_fn(preds, ratings)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print('Epoch {}/{}: Loss={:.4f}'.format(epoch+1, num_epochs, total_loss))

In [28]:
## test model
def test_model(model, test_loader):
    model.eval()
    y_preds = []
    y_true = []
    y_pred = []
    with torch.no_grad():
        for user_ids, item_ids, ratings in test_loader:
            preds = model(user_ids, item_ids)
            y_true.extend(list(ratings.numpy()))
            y_pred.extend(list(preds.numpy()))
            y_preds.append(y_pred)

    rmse = mean_squared_error(y_true, y_pred, squared=False)
    return rmse, y_preds

In [29]:
n_users = num_users
n_items = num_items
n_factors = 20

In [30]:
## train model with 10 factors
model = MF(n_users, n_items, n_factors)
train_model(model, train_loader, lr=0.01, num_epochs=20)


Epoch 1/20: Loss=10749.6567
Epoch 2/20: Loss=2236.4451
Epoch 3/20: Loss=977.3054
Epoch 4/20: Loss=612.3986
Epoch 5/20: Loss=465.4890
Epoch 6/20: Loss=393.3503
Epoch 7/20: Loss=358.3395
Epoch 8/20: Loss=339.5329
Epoch 9/20: Loss=328.9770
Epoch 10/20: Loss=315.6478
Epoch 11/20: Loss=293.6611
Epoch 12/20: Loss=272.1361
Epoch 13/20: Loss=254.8414
Epoch 14/20: Loss=241.0347
Epoch 15/20: Loss=231.2821
Epoch 16/20: Loss=223.2801
Epoch 17/20: Loss=215.0276
Epoch 18/20: Loss=206.3474
Epoch 19/20: Loss=202.1018
Epoch 20/20: Loss=195.9075


In [31]:
## test models and get values for validation
rmse, y_preds = test_model(model, test_loader)

In [32]:
## Root Mean Square Error (RMSE) is the standard deviation of the residuals, tells us the difference between predicted values and original ratings.
## Smaller values indicate that model predicted well.
rmse

1.1893973

In [33]:
## display original ratings
test_df['rating']

67037    4.5
42175    3.0
93850    3.0
6187     4.0
12229    4.0
        ... 
57416    2.0
67290    3.5
33423    4.5
98552    3.0
87803    3.0
Name: rating, Length: 20168, dtype: float64

In [34]:
## display predicted ratings
yp = pd.DataFrame(y_preds)
yp.iloc[0]

0        2.331933
1        3.275849
2        3.408984
3        4.415833
4        2.757390
           ...   
20163    5.219423
20164    4.592649
20165    3.930120
20166    3.737211
20167    3.096143
Name: 0, Length: 20168, dtype: float32