# AutoRec cs3639 Recommendation Systems course IDC

### General explanations (TODO: to be written)

In [534]:
import numpy as np
import pandas as pd
import torch
from torch import nn

In [535]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


## datasets

In this project, we will use 2 datasets:
* **movielens**, which can be downloaded using `utils.datasets_download.py` or straight from [here](http://files.grouplens.org/datasets/movielens/).
* **netflixprize**, which can be downloaded from this [semi-parsed version from kaggle](https://www.kaggle.com/netflix-inc/netflix-prize-data) or from this [raw version](https://archive.org/download/nf_prize_dataset.tar)

**NOTE**: for the notebook to run properly, save each dataset in its own subfolder of the `data` folder:
`data/movielens` for the movielens dataset and `data/netflix` for the netflixprize dataset.

In [536]:
# NOTE(review): project-local helper; presumably builds the MovieLens
# train/test split for fold 1 -- confirm against src/data_prep.
from src.data_prep import movielens_prep
# The `train` / `test` names bound here are rebound by the following cell's
# `movielens_load(1)` call.
train, test = movielens_prep(fold=1)

In [537]:
# NOTE(review): import sits mid-notebook; consider moving it to the imports cell.
from src.data_prep import movielens_load
# Rebinds `train` / `test` (previously assigned from `movielens_prep`).
train, test = movielens_load(1)
# The captured output shows 80000 rows with the columns
# user_id, item_id, rating, timestamp.
print(train.shape)
# Bare last expression so the notebook renders the frame.
train

(80000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
79995,943,1067,2,875501756
79996,943,1074,4,888640250
79997,943,1188,3,888640250
79998,943,1228,3,888640275


In [538]:
# Smallest and largest item id in the training split.
train['item_id'].agg(['min', 'max'])

min       1
max    1682
Name: item_id, dtype: int64

In [539]:
# Smallest and largest user id in the training split.
train['user_id'].agg(['min', 'max'])

min      1
max    943
Name: user_id, dtype: int64

In [540]:
# NOTE(review): disabled experiment. As written, `enumerate(train.user_id)`
# walks every row of the column, so each user id would end up mapped to the
# position of its last row rather than to a contiguous 0..n_users-1 index.
# Enumerating the unique ids would be needed if this is revived.
# userid2idx = {old_id: i for i, old_id in enumerate(train.user_id)}
# userid2idx

In [541]:
# NOTE(review): disabled experiment that would overwrite `train.user_id`
# in place with remapped indices. `train_epocs` currently shifts the raw ids
# by -1 instead, so this cell must stay disabled unless that shift is removed.
# train['user_id'] = train.user_id.apply(lambda x: userid2idx[x])
# np.sort(np.unique(train.user_id))

In [542]:
class MatrixFactorization(nn.Module):
    """Plain matrix-factorization rating model.

    Every user and every item owns a learned ``k``-dimensional latent vector;
    the predicted rating of a (user, item) pair is the dot product of the two
    vectors.

    Args:
        num_users: number of rows in the user embedding table, i.e. valid
            user indices are ``0 .. num_users - 1``.
        num_items: number of rows in the item embedding table, i.e. valid
            item indices are ``0 .. num_items - 1``.
        k: size of the latent vectors.
    """

    def __init__(self, num_users, num_items, k=15):
        super().__init__()
        self.k = k
        self.num_users = num_users
        self.num_items = num_items

        # create the latent matrices: one k-dimensional row per user / item
        self.users_emb = nn.Embedding(num_users, k)
        self.items_emb = nn.Embedding(num_items, k)

        # NOTE(review): not read anywhere in this class; kept so code that
        # accesses `model.batch_size` keeps working.
        self.batch_size = 100

    def forward(self, user, item):
        """Predict ratings for aligned user / item index tensors.

        Args:
            user: integer tensor of 0-based user indices (any shape).
            item: integer tensor of 0-based item indices, broadcastable
                against `user`.

        Returns:
            Tensor of dot products with the same shape as the (broadcast)
            index tensors.
        """
        user_vecs = self.users_emb(user)
        item_vecs = self.items_emb(item)
        # Reduce over the latent dimension, which is always the last one.
        # A fixed `axis=1` is only right for 1-D index batches: it raises for
        # scalar indices and sums the wrong axis for 2-D index tensors.
        return (user_vecs * item_vecs).sum(dim=-1)



In [543]:
def train_epocs(train, model, epochs=10, lr=0.01, reg=0):
    """Fit `model` to the ratings in `train` with full-batch Adam steps.

    One optimizer step is taken per epoch on the whole training set, and the
    MSE training loss of that step is printed.

    Args:
        train: DataFrame-like object exposing `user_id`, `item_id` and
            `rating` columns. Ids are expected to start at 1; they are
            shifted by -1 to become 0-based embedding indices.
        model: module called as `model(users, items)` that returns one
            predicted rating per row.
        epochs: number of full-batch optimization steps.
        lr: Adam learning rate.
        reg: L2 regularization strength, passed to Adam as `weight_decay`.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=reg)

    # Put the data wherever the model lives, so the function keeps working
    # when the caller moves the model to the GPU.
    model_device = next(model.parameters()).device

    # The inputs never change between epochs, so build the tensors once.
    # users and items indexes start at 1, therefore we use the -1
    users = torch.as_tensor(train.user_id.values - 1, dtype=torch.long, device=model_device)
    items = torch.as_tensor(train.item_id.values - 1, dtype=torch.long, device=model_device)
    # rating is our label
    ratings = torch.as_tensor(train.rating.values, dtype=torch.float32, device=model_device)

    for _ in range(epochs):
        model.train()

        preds = model(users, items)
        loss = nn.functional.mse_loss(preds, ratings)

        # backpropagation
        optimizer.zero_grad()
        loss.backward()

        # update
        optimizer.step()

        print(f'train loss: {loss.item()}')


In [544]:
# `train_epocs` turns an id into an embedding row via `id - 1`, so each table
# needs `max id` rows. Counting distinct ids would undersize the user table
# (and trigger an out-of-range lookup) whenever the id range has gaps.
num_users = int(train.user_id.max())
num_items = int(train.item_id.max())
print(num_users, num_items)
model = MatrixFactorization(num_users, num_items)#.to(device)


943 1682
self.users_emb.weight.shape=torch.Size([943, 15])
self.items_emb.weight.shape=torch.Size([1682, 15])


In [545]:
# 100 full-batch epochs on the training split; the loss is printed after each.
train_epocs(train=train, model=model, epochs=100)

train loss: 28.033849716186523
train loss: 27.497081756591797
train loss: 26.9763126373291
train loss: 26.471330642700195
train loss: 25.98189926147461
train loss: 25.50773811340332
train loss: 25.04854965209961
train loss: 24.6040096282959
train loss: 24.173763275146484
train loss: 23.757436752319336
train loss: 23.354631423950195
train loss: 22.96492576599121
train loss: 22.587881088256836
train loss: 22.223058700561523
train loss: 21.869993209838867
train loss: 21.52821922302246
train loss: 21.19725227355957
train loss: 20.876611709594727
train loss: 20.56580352783203
train loss: 20.26432991027832
train loss: 19.971691131591797
train loss: 19.687374114990234
train loss: 19.41087532043457
train loss: 19.141677856445312
train loss: 18.87926483154297
train loss: 18.623117446899414
train loss: 18.37271499633789
train loss: 18.127531051635742
train loss: 17.887039184570312
train loss: 17.650712966918945
train loss: 17.41801643371582
train loss: 17.18842315673828
train loss: 16.9613990783