<a href="https://colab.research.google.com/github/gmrwh92/Recommender-System/blob/main/Autoencoder_with_L1_Penalty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
## importing libraries
import torch
import numpy as np
from torch import nn, div, square, norm
from torch.nn import functional as F
from torchdata import datapipes as dp
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import LabelEncoder
import torch.optim as optim
from torch.autograd import Function

In [35]:
## load dataset
data = pd.read_csv('/content/drive/MyDrive/ml-latest-small/ratings.csv')
df= data[['userId','movieId','rating']]
num_users = data.userId.nunique()
num_items = data.movieId.nunique()
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [36]:
## encode data into numerical values
le = LabelEncoder()
df['user_ids'] = le.fit_transform(df['userId'])
df['item_ids'] = le.fit_transform(df['movieId'])

In [37]:
## Autorec model take user-rating matrix as input, so we need to make user-rating matrix.
class MovieLensDataset(Dataset):
    def __init__(self, df, train_size=0.8, train=False):
        self.df = df
        self.train = train
        self.train_size = train_size
        self.test_size = 1 - self.train_size

        self.num_items = len(self.df['item_ids'].unique())
        self.num_users = len(self.df['user_ids'].unique())

        self.train_df, self.test_df = train_test_split(self.df, test_size=self.test_size, train_size=self.train_size, stratify = self.df['user_ids'].values, random_state=42)

        if self.train == True:
            self.df = self.train_df
        else:
            self.df = self.test_df

        self.users = torch.tensor(self.df['user_ids'].values)
        self.items = torch.tensor(self.df['item_ids'].values)
        self.ratings = torch.tensor(self.df['rating'].values)

        self.inter_mat = self.make_inter_mat()

    def make_inter_mat(self):
        inter_mat = np.zeros((self.num_items, self.num_users))
        for user, item, rating in zip(self.users, self.items, self.ratings):
            inter_mat[item][user] = rating

        return inter_mat

    def __len__(self):
        return len(self.inter_mat)

    def __getitem__(self, index):
        inter_mat = torch.tensor(self.inter_mat[index]).float()

        return inter_mat

In [38]:
class L1Penalty(torch.autograd.Function):

    def forward(ctx, input, l1weight = 0.0001):
        ctx.save_for_backward(input)
        ctx.l1weight = l1weight
        return input


    def backward(ctx, grad_output):
        input, = ctx.saved_variables
        grad_input = input.clone().sign().mul(ctx.l1weight)
        grad_input+=grad_output
        return grad_input

In [39]:
## AutoRec model
class AutoRec(nn.Module):
    def __init__(self, num_hidden, num_users, dropout=0.01):
        super(AutoRec, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(num_users, num_hidden),
            nn.Sigmoid(),
            nn.Linear(num_hidden,num_hidden//2),

            )
        self.decoder = nn.Sequential(
            nn.Linear(num_hidden//2, num_hidden),
            nn.Sigmoid(),
            nn.Linear(num_hidden,num_users),

        )
        self.penalty = L1Penalty()


    def forward(self, mat):
        mat = self.encoder(mat)
        hidden = L1Penalty.apply(mat)
        pred = self.decoder(hidden)

        return pred

In [40]:
## set up parameters
batch_size = 256
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_hidden = 100

In [41]:
## dataset into torch sensor
train_dataset = MovieLensDataset(df=df, train=True)
test_dataset = MovieLensDataset(df=df, train=False)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [42]:
##Train model
def train_model(model, train_iter, lr, num_epochs):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    for epoch in range(num_epochs):
        total_loss = 0

        for inter_mat in train_iter:
          inter_mat = inter_mat.to(device)
          preds = model(inter_mat)

          loss = loss_fn(preds, inter_mat)
          loss.backward()
          optimizer.step()
          total_loss += loss.item()
        print('Epoch {}/{}: Loss={:.4f}'.format(epoch+1, num_epochs, total_loss))

In [43]:
##Test model
def test_model(model, test_loader):
  model.eval()
  y_true = []
  y_pred =[]
  y_preds = []
  with torch.no_grad():
    for inter_mat in test_loader:
      inter_mat = inter_mat.to(device)
      preds = model(inter_mat)
      y_true.extend(list(inter_mat.numpy()))
      y_pred.extend(list(preds.numpy()))
      y_preds.append(y_pred)

    rmse = mean_squared_error(y_true, y_pred, squared=False)
    return rmse, y_pred

In [44]:
model = AutoRec(num_hidden, num_users).to(device)

In [45]:
train_model(model, train_dataloader, lr=0.01, num_epochs=100)

  input, = ctx.saved_variables


Epoch 1/100: Loss=69.3143
Epoch 2/100: Loss=15.6424
Epoch 3/100: Loss=17.2311
Epoch 4/100: Loss=12.2606
Epoch 5/100: Loss=10.8858
Epoch 6/100: Loss=20.4746
Epoch 7/100: Loss=19.2832
Epoch 8/100: Loss=12.9769
Epoch 9/100: Loss=19.5754
Epoch 10/100: Loss=26.6326
Epoch 11/100: Loss=22.6493
Epoch 12/100: Loss=19.5672
Epoch 13/100: Loss=25.3766
Epoch 14/100: Loss=31.2845
Epoch 15/100: Loss=30.1133
Epoch 16/100: Loss=27.3824
Epoch 17/100: Loss=29.4550
Epoch 18/100: Loss=34.8419
Epoch 19/100: Loss=37.6033
Epoch 20/100: Loss=36.4972
Epoch 21/100: Loss=35.5470
Epoch 22/100: Loss=37.7296
Epoch 23/100: Loss=41.7084
Epoch 24/100: Loss=44.1268
Epoch 25/100: Loss=50.7423
Epoch 26/100: Loss=45.2418
Epoch 27/100: Loss=45.5158
Epoch 28/100: Loss=45.9351
Epoch 29/100: Loss=47.1172
Epoch 30/100: Loss=48.9598
Epoch 31/100: Loss=51.1010
Epoch 32/100: Loss=53.4379
Epoch 33/100: Loss=55.4202
Epoch 34/100: Loss=56.0084
Epoch 35/100: Loss=55.3569
Epoch 36/100: Loss=54.9030
Epoch 37/100: Loss=56.0168
Epoch 38/1

In [46]:
## test models and get values for validation
rmse, y_pred = test_model(model, test_dataloader)
rmse

26.744713

In [47]:
## display original rating for first user in test dataset
test_dataset[0]

tensor([0.0000, 0.0000, 0.0000, 0.0000, 4.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 4.5000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 5.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 5.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 2.5000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.5000, 0.0000, 4.0000, 0.0000, 0.0000, 0.0000,
        2.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 3.0000, 3.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 4.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 

In [48]:
## display predicted rating for first user in test dataset
yp = pd.DataFrame(y_pred)
yp.iloc[0]

0      1.956254
1     -0.584353
2     -0.941980
3      0.526665
4      2.074118
         ...   
605    0.979342
606   -2.308545
607   -0.748301
608   -0.024823
609    2.775534
Name: 0, Length: 610, dtype: float32