# AutoRec cs3639 Recommendation Systems course IDC

### here will be general explanations

In [153]:
import numpy as np
import pandas as pd
import sklearn
import torch
from torch import nn

In [154]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


## datasets

In this project, we will use 2 datasets:
* **movielens**, which can be downloaded using `utils.datasets_download.py` or straight from [here](http://files.grouplens.org/datasets/movielens/).
* **netflixprize**, which can be downloaded from this [semi-parsed version from kaggle](https://www.kaggle.com/netflix-inc/netflix-prize-data) or from this [raw version](https://archive.org/download/nf_prize_dataset.tar)

**NOTE**: for the notebook to run properly, you should save you dataset under `data` folder and `movielens` folder for the movielens dataset and `netflix` folder for the netflixprize dataset.
i.e `data/movielens` folder and `data/netflix` folder respectively.

In [155]:
from src.data_prep import movielens_load
train, test = movielens_load(1)
print(train.shape)
train

(80000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
79995,943,1067,2,875501756
79996,943,1074,4,888640250
79997,943,1188,3,888640250
79998,943,1228,3,888640275


In [None]:
# TODO:
# - add function that will calculate the validation loss

In [159]:
def train_epocs(train, model, epochs=10, lr=0.001, reg=0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=reg)
    for i in range(epochs):
        model.train()
        # users and items indexes start 1, therefore we use the -1
        users = torch.LongTensor(train.user_id.values-1).to(device)
        items = torch.LongTensor(train.item_id.values-1).to(device)
        ratings = torch.FloatTensor(train.rating.values).to(device) # rating is our label

        preds = model(users, items)
        loss = torch.sqrt(nn.functional.mse_loss(preds, ratings))

        # backpropagation
        optimizer.zero_grad()
        loss.backward()

        # update
        optimizer.step()

        print(f'train RMSE: {loss.item()}')


In [160]:
from src.matrixfactorization import MatrixFactorization
num_users = train.user_id.max()
num_items = train.item_id.max()
print(num_users, num_items)
model = MatrixFactorization(num_users, num_items).to(device)


943 1682


In [161]:
%%time
train_epocs(train, model, epochs=1000, lr=0.001, reg=0.001)

train RMSE: 5.380264759063721
train RMSE: 5.375567436218262
train RMSE: 5.370880126953125
train RMSE: 5.366203784942627
train RMSE: 5.361536502838135
train RMSE: 5.356879234313965
train RMSE: 5.352232456207275
train RMSE: 5.347595691680908
train RMSE: 5.342968940734863
train RMSE: 5.338353157043457
train RMSE: 5.333747386932373
train RMSE: 5.329152584075928
train RMSE: 5.324568748474121
train RMSE: 5.319995403289795
train RMSE: 5.315433025360107
train RMSE: 5.310881614685059
train RMSE: 5.30634069442749
train RMSE: 5.3018107414245605
train RMSE: 5.2972917556762695
train RMSE: 5.292783737182617
train RMSE: 5.2882866859436035
train RMSE: 5.283801078796387
train RMSE: 5.279326915740967
train RMSE: 5.2748637199401855
train RMSE: 5.270411968231201
train RMSE: 5.265970706939697
train RMSE: 5.261541366577148
train RMSE: 5.257123947143555
train RMSE: 5.252717018127441
train RMSE: 5.248322486877441
train RMSE: 5.24393892288208
train RMSE: 5.239566802978516
train RMSE: 5.235206604003906
train RM

In [10]:
# halpful links:
# https://d2l.ai/chapter_recommender-systems/autorec.html
# https://github.com/gtshs2/Autorec
# https://github.com/ImKeTT/Recommend_algorithms_Librec2Python/blob/master/AutoRec_torch/src/model.py

In [117]:
from src.data_prep import movielens_prep
train, test = movielens_prep(1)
train

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
class AutoRec(nn.Module):
    """
    AutoRec model. See explanation on :param: num_features for use as USER TO USER or ITEM TO ITEM
    """
    def __init__(self, num_features, num_hidden=500):
        """
        :param num_hidden: Size of the hidden layer
        :param num_features: If num_features == num_items that means that we are doing USER TO USER model.
                             If num_features == num_users that means that we are doing ITEM TO ITEM model.
                             The logic is the a user vector has number of items features for it.
        """
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(num_features, num_hidden),
            nn.Sigmoid()
        )

        self.decoder = nn.Sequential(
            nn.Linear(num_hidden, num_features),
            nn.Sigmoid()
        )

    def forward(self, x):

        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [121]:
num_items = train.shape[1]
model = AutoRec(num_hidden=500, num_features=num_items).to(device)

In [122]:
train

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
from torch.utils.data import DataLoader

def train_epocs(train, model, batch_size=64, epochs=10, lr=0.005, reg=0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=reg)
    dataloader = DataLoader(train.values, batch_size=batch_size)
    for epoch in range(1, epochs+1):
        for batch_id, train_batch in enumerate(dataloader):
            # model.train()
            train_batch = train_batch.float().to(device)
            preds = model(train_batch).to(device)

            loss = torch.sqrt(nn.functional.mse_loss(preds, train_batch))
            # print(f'epoch: {epoch}, batch: {batch_id}, loss: {loss.item()}')

        # backpropagation
            optimizer.zero_grad()
            loss.backward()

            # update
            optimizer.step()
        print(f'epoch: {epoch} train RMSE: {loss.item()}')

    return preds

p = train_epocs(train, model, epochs=200)

epoch: 1 train RMSE: 0.6633583903312683
epoch: 2 train RMSE: 0.6633725762367249
epoch: 3 train RMSE: 0.6633778214454651
epoch: 4 train RMSE: 0.6633684635162354
epoch: 5 train RMSE: 0.6633650064468384
epoch: 6 train RMSE: 0.6633610129356384
epoch: 7 train RMSE: 0.663355827331543
epoch: 8 train RMSE: 0.663353681564331
epoch: 9 train RMSE: 0.6633505821228027
epoch: 10 train RMSE: 0.6633493900299072
epoch: 11 train RMSE: 0.663348913192749
epoch: 12 train RMSE: 0.6633484363555908
epoch: 13 train RMSE: 0.6633481979370117
epoch: 14 train RMSE: 0.6633477807044983
epoch: 15 train RMSE: 0.663347601890564
epoch: 16 train RMSE: 0.6633474230766296
epoch: 17 train RMSE: 0.6633473038673401
epoch: 18 train RMSE: 0.6633472442626953
epoch: 19 train RMSE: 0.6633471250534058
epoch: 20 train RMSE: 0.6633470058441162
epoch: 21 train RMSE: 0.6633468866348267
epoch: 22 train RMSE: 0.6633468270301819
epoch: 23 train RMSE: 0.6633468270301819
epoch: 24 train RMSE: 0.6633469462394714
epoch: 25 train RMSE: 0.66334

In [84]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler((0, 5))
scaler.fit_transform(torch.Tensor.cpu(p).detach().numpy())

array([[4.9101810e+00, 1.9693743e+00, 3.4679255e+00, ..., 3.3794111e-01,
        3.8359052e-01, 4.7213659e-01],
       [4.7767887e+00, 3.3519408e-03, 3.1996059e-01, ..., 1.9354329e-01,
        2.1273448e-01, 2.1550149e-01],
       [4.0800991e+00, 1.0779200e-01, 1.6463345e-01, ..., 6.1387408e-01,
        7.1507138e-01, 7.4785805e-01],
       ...,
       [4.8953419e+00, 1.7699021e-01, 9.3510562e-01, ..., 1.9119012e-01,
        2.8748101e-01, 1.6761416e-01],
       [4.8995171e+00, 5.6515604e-01, 8.8921607e-02, ..., 2.5211126e-01,
        2.9596657e-01, 4.7826663e-01],
       [4.9196835e+00, 4.9169917e+00, 3.4284275e+00, ..., 2.5384778e-01,
        6.7711121e-01, 5.3850901e-01]], dtype=float32)

In [74]:
train

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
from torch.utils.data import DataLoader
for batch in DataLoader(train.values, batch_size=15):
    print(batch)
    print(batch.shape)
    break

tensor([[5., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [3., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)
torch.Size([15, 1650])
