# AutoRec cs3639 Recommendation Systems course IDC

### here will be general explanations

In [77]:
import numpy as np
import pandas as pd
import sklearn
import torch
from torch import nn

In [78]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


## datasets

In this project, we will use 2 datasets:
* **movielens**, which can be downloaded using `utils.datasets_download.py` or straight from [here](http://files.grouplens.org/datasets/movielens/).
* **netflixprize**, which can be downloaded from this [semi-parsed version from kaggle](https://www.kaggle.com/netflix-inc/netflix-prize-data) or from this [raw version](https://archive.org/download/nf_prize_dataset.tar)

**NOTE**: for the notebook to run properly, you should save you dataset under `data` folder and `movielens` folder for the movielens dataset and `netflix` folder for the netflixprize dataset.
i.e `data/movielens` folder and `data/netflix` folder respectively.

In [3]:
from src.data_prep import movielens_load
train, test = movielens_load(1)
print(train.shape)
train

(80000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
79995,943,1067,2,875501756
79996,943,1074,4,888640250
79997,943,1188,3,888640250
79998,943,1228,3,888640275


In [4]:
# TODO:
# - Add dataloder class
# - Add batch size
# - Add bias for the MatrixFactorization

In [5]:
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_items, k=15):
        super().__init__()
        self.k = k
        self.num_users = num_users
        self.num_items = num_items

        # create the latent matrixs
        self.users_emb = nn.Embedding(num_users, k)
        self.items_emb = nn.Embedding(num_items, k)

        self.batch_size = 100


    def forward(self, user, item):
        user = self.users_emb(user)
        item = self.items_emb(item)
        return (user*item).sum(axis=1)



In [6]:
# TODO:
# - add function that will calculate the validation loss

In [7]:
def train_epocs(train, model, epochs=10, lr=0.001, reg=0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=reg)
    for i in range(epochs):
        model.train()
        # users and items indexes start 1, therefore we use the -1
        users = torch.LongTensor(train.user_id.values-1).to(device)
        items = torch.LongTensor(train.item_id.values-1).to(device)
        ratings = torch.FloatTensor(train.rating.values).to(device) # rating is our label

        preds = model(users, items)
        loss = torch.sqrt(nn.functional.mse_loss(preds, ratings))

        # backpropagation
        optimizer.zero_grad()
        loss.backward()

        # update
        optimizer.step()

        print(f'train RMSE: {loss.item()}')


In [8]:
num_users = train.user_id.max()
num_items = train.item_id.max()
print(num_users, num_items)
model = MatrixFactorization(num_users, num_items).to(device)


943 1682


In [9]:
%%time
train_epocs(train, model, epochs=1000, lr=0.001, reg=0.001)

train RMSE: 5.363035202026367
train RMSE: 5.3583760261535645
train RMSE: 5.353726387023926
train RMSE: 5.349086284637451
train RMSE: 5.344456672668457
train RMSE: 5.339837551116943
train RMSE: 5.335227966308594
train RMSE: 5.330629348754883
train RMSE: 5.326040267944336
train RMSE: 5.321462631225586
train RMSE: 5.316894054412842
train RMSE: 5.312336444854736
train RMSE: 5.3077898025512695
train RMSE: 5.303253650665283
train RMSE: 5.298727989196777
train RMSE: 5.29421329498291
train RMSE: 5.28971004486084
train RMSE: 5.285216808319092
train RMSE: 5.280735492706299
train RMSE: 5.276265621185303
train RMSE: 5.271806240081787
train RMSE: 5.267358303070068
train RMSE: 5.2629218101501465
train RMSE: 5.258496284484863
train RMSE: 5.254081726074219
train RMSE: 5.249679088592529
train RMSE: 5.24528694152832
train RMSE: 5.240906715393066
train RMSE: 5.236537933349609
train RMSE: 5.232179641723633
train RMSE: 5.227833271026611
train RMSE: 5.2234978675842285
train RMSE: 5.219174385070801
train RMS

In [10]:
# halpful links:
# https://d2l.ai/chapter_recommender-systems/autorec.html
# https://github.com/gtshs2/Autorec
# https://github.com/ImKeTT/Recommend_algorithms_Librec2Python/blob/master/AutoRec_torch/src/model.py

In [11]:
from src.data_prep import movielens_prep
train, test = movielens_prep(1)
train

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
class AutoRec(nn.Module):
    """
    AutoRec model. See explanation on :param: num_features for use as USER TO USER or ITEM TO ITEM
    """
    def __init__(self, num_hidden=500, num_features):
        """
        :param num_hidden: Size of the hidden layer
        :param num_features: If num_features == num_items that means that we are doing USER TO USER model.
                             If num_features == num_users that means that we are doing ITEM TO ITEM model.
                             The logic is the a user vector has number of items features for it.
        """
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(num_features, num_hidden),
            nn.Sigmoid()
        )

        self.decoder = nn.Sequential(
            nn.Linear(num_hidden, num_features),
            nn.Sigmoid()
        )

    def forward(self, x):

        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [88]:
num_items = train.shape[1]
model = AutoRec(num_hidden=500, num_items=num_items).to(device)

In [89]:
train

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
def train_epocs(train, model, epochs=10, lr=0.001, reg=0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=reg)
    for i in range(epochs):
        model.train()

        rating = torch.FloatTensor(train.values).to(device)

        preds = model(rating)
        print(f'{preds.shape=}')
        loss = torch.sqrt(nn.functional.mse_loss(preds, rating))

        # backpropagation
        optimizer.zero_grad()
        loss.backward()

        # update
        optimizer.step()

        print(f'train RMSE: {loss.item()}')
    return preds

p = train_epocs(train, model, epochs=100)

type(x)=<class 'torch.Tensor'>
x.shape=torch.Size([943, 1650])
x=tensor([[5., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]], device='cuda:0')
preds.shape=torch.Size([943, 1650])
train RMSE: 0.8818525671958923
type(x)=<class 'torch.Tensor'>
x.shape=torch.Size([943, 1650])
x=tensor([[5., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]], device='cuda:0')
preds.shape=torch.Size([943, 1650])
train RMSE: 0.8551897406578064
type(x)=<class 'torch.Tensor'>
x.shape=torch.Size([943, 1650])
x=tensor([[5., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        .

In [83]:
pd.DataFrame(torch.Tensor.cpu(p).detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1640,1641,1642,1643,1644,1645,1646,1647,1648,1649
0,0.994848,0.392903,0.662953,0.895451,0.325029,0.145041,0.986105,0.947417,0.950001,0.662624,...,0.010853,0.011235,0.010198,0.014943,0.014040,0.012830,0.014401,0.012709,0.011365,0.016026
1,0.988557,0.003067,0.068967,0.021322,0.034220,0.101958,0.940231,0.049378,0.840171,0.371149,...,0.006684,0.010159,0.009151,0.006457,0.008276,0.010902,0.010148,0.008574,0.007880,0.009527
2,0.955694,0.023776,0.039658,0.048901,0.087155,0.077824,0.887364,0.072917,0.259490,0.084609,...,0.021863,0.018907,0.019174,0.020144,0.020436,0.023454,0.020993,0.020610,0.018124,0.023009
3,0.975177,0.028419,0.028970,0.054612,0.088096,0.056843,0.916210,0.085047,0.298206,0.046841,...,0.014441,0.015681,0.015159,0.013156,0.014604,0.017919,0.015180,0.015510,0.014135,0.017592
4,0.994654,0.622858,0.165374,0.718378,0.274245,0.058419,0.977258,0.887227,0.581146,0.094884,...,0.013419,0.013453,0.017735,0.012101,0.020025,0.017116,0.015807,0.017173,0.011341,0.013136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.991130,0.051025,0.323125,0.071026,0.153836,0.089749,0.955611,0.153340,0.911655,0.204451,...,0.013207,0.016348,0.016048,0.011817,0.014905,0.018492,0.015332,0.014069,0.017185,0.015314
939,0.995140,0.143129,0.101056,0.801486,0.182982,0.102510,0.984325,0.909528,0.805859,0.389720,...,0.005721,0.004535,0.005133,0.005847,0.003894,0.006064,0.006330,0.006631,0.005134,0.008273
940,0.994149,0.037497,0.185038,0.086007,0.100694,0.064025,0.975074,0.159960,0.828495,0.171009,...,0.007111,0.009518,0.009600,0.007096,0.009748,0.009760,0.008535,0.008507,0.009405,0.008314
941,0.994345,0.114465,0.025372,0.405925,0.097407,0.078003,0.966089,0.860835,0.535772,0.136428,...,0.010542,0.006965,0.009909,0.008443,0.006108,0.009024,0.009501,0.010251,0.009578,0.016181


In [84]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler((0, 5))
scaler.fit_transform(torch.Tensor.cpu(p).detach().numpy())

array([[4.9101810e+00, 1.9693743e+00, 3.4679255e+00, ..., 3.3794111e-01,
        3.8359052e-01, 4.7213659e-01],
       [4.7767887e+00, 3.3519408e-03, 3.1996059e-01, ..., 1.9354329e-01,
        2.1273448e-01, 2.1550149e-01],
       [4.0800991e+00, 1.0779200e-01, 1.6463345e-01, ..., 6.1387408e-01,
        7.1507138e-01, 7.4785805e-01],
       ...,
       [4.8953419e+00, 1.7699021e-01, 9.3510562e-01, ..., 1.9119012e-01,
        2.8748101e-01, 1.6761416e-01],
       [4.8995171e+00, 5.6515604e-01, 8.8921607e-02, ..., 2.5211126e-01,
        2.9596657e-01, 4.7826663e-01],
       [4.9196835e+00, 4.9169917e+00, 3.4284275e+00, ..., 2.5384778e-01,
        6.7711121e-01, 5.3850901e-01]], dtype=float32)

In [74]:
train

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
