<a href="https://colab.research.google.com/github/gmrwh92/Recommender-System/blob/main/AutoRec(Autoencoder).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
## importing libraries
import torch
import numpy as np
from torch import nn, div, square, norm
from torch.nn import functional as F
from torchdata import datapipes as dp
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import LabelEncoder
import torch.optim as optim


In [3]:
## load dataset
data = pd.read_csv('/content/drive/MyDrive/ml-latest-small/ratings.csv')
df= data[['userId','movieId','rating']]
num_users = data.userId.nunique()
num_items = data.movieId.nunique()
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
##check the number of users and items in movielens dataset
num_users, num_items

(610, 9724)

In [5]:
## encode data into numerical values
le = LabelEncoder()
df['user_ids'] = le.fit_transform(df['userId'])
df['item_ids'] = le.fit_transform(df['movieId'])

In [6]:
## Autorec model take user-rating matrix as input, so we need to make user-rating matrix.
class MovieLensDataset(Dataset):
    def __init__(self, df, train_size=0.8, train=False):
        self.df = df
        self.train = train
        self.train_size = train_size
        self.test_size = 1 - self.train_size

        self.num_items = len(self.df['item_ids'].unique())
        self.num_users = len(self.df['user_ids'].unique())

        self.train_df, self.test_df = train_test_split(self.df, test_size=self.test_size, train_size=self.train_size, stratify = self.df['user_ids'].values, random_state=42)

        if self.train == True:
            self.df = self.train_df
        else:
            self.df = self.test_df

        self.users = torch.tensor(self.df['user_ids'].values)
        self.items = torch.tensor(self.df['item_ids'].values)
        self.ratings = torch.tensor(self.df['rating'].values)

        self.inter_mat = self.make_inter_mat()

    def make_inter_mat(self):
        inter_mat = np.zeros((self.num_items, self.num_users))
        for user, item, rating in zip(self.users, self.items, self.ratings):
            inter_mat[item][user] = rating

        return inter_mat

    def __len__(self):
        return len(self.inter_mat)

    def __getitem__(self, index):
        inter_mat = torch.tensor(self.inter_mat[index]).float()

        return inter_mat

In [7]:
## AutoRec model
class AutoRec(nn.Module):
    def __init__(self, num_hidden, num_users, dropout=0.01):
        super(AutoRec, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(num_users, num_hidden),
            nn.Sigmoid(),
            nn.Linear(num_hidden,num_hidden//2),
            )
        self.decoder = nn.Sequential(
            nn.Linear(num_hidden//2, num_hidden),
            nn.Sigmoid(),
            nn.Linear(num_hidden,num_users),
        )


    def forward(self, mat):
        hidden = self.encoder(mat)
        pred = self.decoder(hidden)

        return pred

In [8]:
## set up parameters
batch_size = 256
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_hidden = 100

In [9]:
## dataset into torch sensor
train_dataset = MovieLensDataset(df=df, train=True)
test_dataset = MovieLensDataset(df=df, train=False)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)



In [10]:
##Train model
def train_model(model, train_iter, lr, num_epochs):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    for epoch in range(num_epochs):
        total_loss = 0

        for inter_mat in train_iter:
          inter_mat = inter_mat.to(device)
          preds = model(inter_mat)

          loss = loss_fn(preds, inter_mat)
          loss.backward()
          optimizer.step()
          total_loss += loss.item()
        print('Epoch {}/{}: Loss={:.4f}'.format(epoch+1, num_epochs, total_loss))


In [11]:
##Test model
def test_model(model, test_loader):
  model.eval()
  y_true = []
  y_pred =[]
  y_preds = []
  with torch.no_grad():
    for inter_mat in test_loader:
      inter_mat = inter_mat.to(device)
      preds = model(inter_mat)
      y_true.extend(list(inter_mat.numpy()))
      y_pred.extend(list(preds.numpy()))
      y_preds.append(y_pred)

    rmse = mean_squared_error(y_true, y_pred, squared=False)
    return rmse, y_pred

In [12]:
model = AutoRec(num_hidden, num_users).to(device)

In [13]:
train_model(model, train_dataloader, lr=0.01, num_epochs=100)

Epoch 1/100: Loss=7.3100
Epoch 2/100: Loss=7.5982
Epoch 3/100: Loss=8.7741
Epoch 4/100: Loss=10.0045
Epoch 5/100: Loss=12.0844
Epoch 6/100: Loss=13.2506
Epoch 7/100: Loss=15.5615
Epoch 8/100: Loss=17.5116
Epoch 9/100: Loss=18.2991
Epoch 10/100: Loss=21.4970
Epoch 11/100: Loss=23.1388
Epoch 12/100: Loss=23.3012
Epoch 13/100: Loss=26.2641
Epoch 14/100: Loss=29.3736
Epoch 15/100: Loss=29.3351
Epoch 16/100: Loss=29.6414
Epoch 17/100: Loss=33.0993
Epoch 18/100: Loss=36.5438
Epoch 19/100: Loss=36.6485
Epoch 20/100: Loss=35.6871
Epoch 21/100: Loss=37.7135
Epoch 22/100: Loss=42.0315
Epoch 23/100: Loss=44.8180
Epoch 24/100: Loss=44.1233
Epoch 25/100: Loss=42.6843
Epoch 26/100: Loss=44.1440
Epoch 27/100: Loss=48.4342
Epoch 28/100: Loss=52.3749
Epoch 29/100: Loss=53.1212
Epoch 30/100: Loss=51.1316
Epoch 31/100: Loss=49.9068
Epoch 32/100: Loss=51.9562
Epoch 33/100: Loss=56.5071
Epoch 34/100: Loss=60.6590
Epoch 35/100: Loss=61.8292
Epoch 36/100: Loss=59.9056
Epoch 37/100: Loss=57.6002
Epoch 38/100:

In [14]:
rmse, y_pred = test_model(model, test_dataloader)

In [15]:
rmse

1.5137869

In [16]:
## display original rating for first user in test dataset
test_dataset[0]

tensor([0.0000, 0.0000, 0.0000, 0.0000, 4.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 4.5000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 5.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 5.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 2.5000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.5000, 0.0000, 4.0000, 0.0000, 0.0000, 0.0000,
        2.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 3.0000, 3.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 4.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 

In [17]:
## display predicted rating for first user in test dataset
yp = pd.DataFrame(y_pred)
yp.iloc[0]

0     -0.804060
1     -2.301746
2     -2.328370
3      1.928782
4      2.080103
         ...   
605    2.460488
606    0.678061
607    2.579294
608   -2.366279
609    1.328778
Name: 0, Length: 610, dtype: float32