# AutoRec cs3639 Recommendation Systems course IDC

### here will be general explanations

In [1]:
import numpy as np
import pandas as pd
import torch
import json
from time import time
from utils.plots import plot_train_history
from utils.models_results import models_results

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


## datasets

In this project, we will use 2 datasets:
* **movielens**, which can be downloaded using `utils.datasets_download.py` or straight from [here](http://files.grouplens.org/datasets/movielens/).
* **netflixprize**, which can be downloaded from this [semi-parsed version from kaggle](https://www.kaggle.com/netflix-inc/netflix-prize-data) or from this [raw version](https://archive.org/download/nf_prize_dataset.tar)

**NOTE**: for the notebook to run properly, save each dataset in its own folder under `data`: `movielens` for the movielens dataset and `netflix` for the netflixprize dataset,
i.e., `data/movielens` and `data/netflix` respectively.

# Matrix Factorization

In [3]:
from src.mf.model import MatrixFactorization
from src.mf.training import MFTrainer

In [4]:
from src.data_prep import movielens_load
# Load the MovieLens train/test split selected by the argument `1`
# (presumably the fold / dataset-variant index; confirm in src.data_prep).
train, test = movielens_load(1)
# Quick size check of the training frame before displaying it.
print(train.shape)
train

(80000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,0,0,5,874965758
1,0,1,3,876893171
2,0,2,4,878542960
3,0,3,3,876893119
4,0,4,3,889751712
...,...,...,...,...
79995,942,1066,2,875501756
79996,942,1073,4,888640250
79997,942,1187,3,888640250
79998,942,1227,3,888640275


In [5]:
# To search for hyper-parameters, run `mf_hyper_params` with `find=True`.
# Each run saves its params and losses to the `models_params` folder.
# With `find=False` it returns a df with the models that were already run.

# Training settings read as notebook globals by `mf_hyper_params`.
epochs = 60  # forwarded to MFTrainer as `epochs`
batch_size = 64  # forwarded to MFTrainer as `batch_size`

def mf_hyper_params(find=False):
    """Grid-search Matrix Factorization hyper-parameters or load saved results.

    Args:
        find: When True, train one model per (k, lr, reg) combination and write
            each run's parameters and loss histories as JSON to
            ``models_params/MF_{k}_{lr}_{reg}``. When False, no training is done.

    Returns:
        None when ``find`` is True; otherwise the value returned by
        ``models_results('MF')``.

    Notes:
        Reads the notebook globals ``train``, ``test``, ``epochs`` and
        ``batch_size``. ``train`` must expose ``user_id`` and ``item_id``
        columns, so call this while ``train`` still refers to the frame
        returned by ``movielens_load``.
    """
    if not find:
        return models_results('MF')

    import os

    # Create the output folder up front: otherwise a missing directory only
    # surfaces after a full training run, when the results are written.
    os.makedirs('models_params', exist_ok=True)

    # Embedding table sizes do not depend on the grid point, so compute once.
    # NOTE(review): sizes come from `train` only; confirm `test` never holds a
    # larger user/item id than `train`.
    num_users = train.user_id.max() + 1
    num_items = train.item_id.max() + 1

    for k in [10, 15, 20, 50, 100]:
        for lr in [0.001, 0.003, 0.005]:
            for reg in [0.001, 0.003, 0.005]:
                print(f'STARTING TRAIN MF with k: {k}, lr: {lr}, reg: {reg}')
                start_time = time()

                # Init model and trainer
                model = MatrixFactorization(num_users, num_items, k=k)
                mf_trainer = MFTrainer(train, test, model, epochs=epochs, lr=lr, reg=reg, batch_size=batch_size)

                # Train
                mf_trainer.train_model()

                # Collect model params, trainer params and loss histories
                model_dict = {'model_params': model.params()}
                model_dict.update(mf_trainer.params())
                model_dict['train_losses'] = mf_trainer.train_losses
                model_dict['val_losses'] = mf_trainer.val_losses

                # Save the run to a JSON file named after its grid point
                file_name = f'models_params/MF_{k}_{lr}_{reg}'
                with open(file_name, 'w') as f:
                    json.dump(model_dict, f, indent=4)

                plot_train_history(mf_trainer.train_losses, mf_trainer.val_losses)
                print(f'model train time: {time() - start_time:.3f}\n')
    return


In [6]:
# Load the previously saved MF runs (no training) and list them by
# validation loss, lowest first.
mf_results = mf_hyper_params(False)
mf_results.sort_values(by='val_loss')

Unnamed: 0,model_name,train_loss,val_loss,params
9,MF_10_0.001_0.001,0.428094,0.460347,"{'model_params': {'k': 10}, 'seed': 14, 'epoch..."
18,MF_15_0.001_0.001,0.428838,0.460788,"{'model_params': {'k': 15}, 'seed': 14, 'epoch..."
27,MF_20_0.001_0.001,0.429123,0.461021,"{'model_params': {'k': 20}, 'seed': 14, 'epoch..."
12,MF_10_0.003_0.001,0.429552,0.461221,"{'model_params': {'k': 10}, 'seed': 14, 'epoch..."
30,MF_20_0.003_0.001,0.429931,0.461431,"{'model_params': {'k': 20}, 'seed': 14, 'epoch..."
36,MF_50_0.001_0.001,0.430148,0.461547,"{'model_params': {'k': 50}, 'seed': 14, 'epoch..."
0,MF_100_0.001_0.001,0.43091,0.462082,"{'model_params': {'k': 100}, 'seed': 14, 'epoc..."
15,MF_10_0.005_0.001,0.431693,0.462411,"{'model_params': {'k': 10}, 'seed': 14, 'epoch..."
33,MF_20_0.005_0.001,0.431725,0.462419,"{'model_params': {'k': 20}, 'seed': 14, 'epoch..."
6,MF_100_0.005_0.001,0.431725,0.462502,"{'model_params': {'k': 100}, 'seed': 14, 'epoc..."


# AutoRec

In [7]:
from src.data_prep import movielens_create_ratings
# NOTE: this rebinds the notebook-level `train` / `test` names that the Matrix
# Factorization section also uses. `mf_hyper_params(find=True)` reads
# `train.user_id` / `train.item_id`, so reload the data with `movielens_load`
# before re-running the MF search after this cell.
# The frame displayed below has integer column labels; presumably it is a
# user x item ratings matrix with 0 for "no rating" (confirm in src.data_prep).
train, test = movielens_create_ratings(1)
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,5,3,4,3,3,0,4,1,5,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0,0,0,0,0,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,0
939,0,0,0,2,0,0,4,5,3,0,...,0,0,0,0,0,0,0,0,0,0
940,5,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from src.autorec.model import AutoRec
from src.autorec.training import AutoRecTrainer

In [9]:
# To search for hyper-parameters, run `autorec_hyper_params` with `find=True`.
# Each run saves its params and losses to the `models_params` folder.
# With `find=False` it returns a df with the models that were already run.

# Training settings read as notebook globals by `autorec_hyper_params`.
epochs = 60  # forwarded to AutoRecTrainer as `epochs`
batch_size = 64  # forwarded to AutoRecTrainer as `batch_size`


def autorec_hyper_params(find=False):
    """Grid-search AutoRec hyper-parameters or load saved results.

    Args:
        find: When True, train one model per (num_hidden, lr, reg) combination
            and write each run's parameters and loss histories as JSON to
            ``models_params/AutoRec_{num_hidden}_{lr}_{reg}``. When False, no
            training is done.

    Returns:
        None when ``find`` is True; otherwise the value returned by
        ``models_results('AutoRec')``.

    Notes:
        Reads the notebook globals ``train``, ``test``, ``epochs`` and
        ``batch_size``. ``train`` must expose a ``shape`` attribute.
    """
    if not find:
        return models_results('AutoRec')

    import os

    # Create the output folder up front: otherwise a missing directory only
    # surfaces after a full training run, when the results are written.
    os.makedirs('models_params', exist_ok=True)

    # The input size does not depend on the grid point, so compute it once.
    # NOTE(review): `num_features` is the number of rows of `train`; confirm
    # this is the input dimension AutoRec expects.
    num_users = train.shape[0]

    for num_hidden in [100, 200, 500, 600]:
        for lr in [0.001, 0.003, 0.005]:
            for reg in [0.001, 0.003, 0.005]:
                print(f'STARTING TRAIN AutoRec with num_hidden: {num_hidden}, lr: {lr}, reg: {reg}')
                start_time = time()

                # Init model and trainer
                model = AutoRec(num_hidden=num_hidden, num_features=num_users)
                autorec_trainer = AutoRecTrainer(train, test, model, epochs=epochs, batch_size=batch_size, lr=lr, reg=reg)

                # Train
                autorec_trainer.train_model()

                # Collect model params, trainer params and loss histories
                model_dict = {'model_params': model.params()}
                model_dict.update(autorec_trainer.params())
                model_dict['train_losses'] = autorec_trainer.train_losses
                model_dict['val_losses'] = autorec_trainer.val_losses

                # Save the run to a JSON file named after its grid point
                file_name = f'models_params/AutoRec_{num_hidden}_{lr}_{reg}'
                with open(file_name, 'w') as f:
                    json.dump(model_dict, f, indent=4)

                plot_train_history(autorec_trainer.train_losses, autorec_trainer.val_losses)
                print(f'model train time: {time() - start_time:.3f}\n')
    return



In [10]:
# Load the previously saved AutoRec runs (no training) and list them by
# validation loss, lowest first.
autorec_results = autorec_hyper_params(find=False)
autorec_results.sort_values(by='val_loss')

Unnamed: 0,model_name,train_loss,val_loss,params
27,AutoRec_600_0.001_0.001,0.562352,0.771318,"{'model_params': {'num_hidden': 600}, 'seed': ..."
18,AutoRec_500_0.001_0.001,0.575954,0.77484,"{'model_params': {'num_hidden': 500}, 'seed': ..."
9,AutoRec_200_0.001_0.001,0.721868,0.822268,"{'model_params': {'num_hidden': 200}, 'seed': ..."
12,AutoRec_200_0.003_0.001,0.734199,0.834206,"{'model_params': {'num_hidden': 200}, 'seed': ..."
21,AutoRec_500_0.003_0.001,0.700022,0.834856,"{'model_params': {'num_hidden': 500}, 'seed': ..."
30,AutoRec_600_0.003_0.001,0.709172,0.8417,"{'model_params': {'num_hidden': 600}, 'seed': ..."
15,AutoRec_200_0.005_0.001,0.808713,0.871103,"{'model_params': {'num_hidden': 200}, 'seed': ..."
0,AutoRec_100_0.001_0.001,0.825419,0.872221,"{'model_params': {'num_hidden': 100}, 'seed': ..."
3,AutoRec_100_0.003_0.001,0.819192,0.872363,"{'model_params': {'num_hidden': 100}, 'seed': ..."
6,AutoRec_100_0.005_0.001,0.840435,0.886805,"{'model_params': {'num_hidden': 100}, 'seed': ..."


# AutoRecV2

In [12]:
from src.autorecv2.model import AutoRecV2
from src.autorecv2.training import AutoRecV2Trainer

In [16]:
# To search for hyper-parameters, run `autorecv2_hyper_params` with `find=True`.
# Each run saves its params and losses to the `models_params` folder.
# With `find=False` it returns a df with the models that were already run.

# Training settings read as notebook globals by `autorecv2_hyper_params`.
epochs = 60  # forwarded to AutoRecV2Trainer as `epochs`
batch_size = 64  # forwarded to AutoRecV2Trainer as `batch_size`


def autorecv2_hyper_params(find=False):
    """Grid-search AutoRecV2 hyper-parameters or load saved results.

    Args:
        find: When True, train one model per (num_hidden, lr, reg) combination
            and write each run's parameters and loss histories as JSON to
            ``models_params/AutoRecV2_{num_hidden}_{lr}_{reg}``. When False, no
            training is done.

    Returns:
        None when ``find`` is True; otherwise the value returned by
        ``models_results('AutoRecV2')``.

    Notes:
        Reads the notebook globals ``train``, ``test``, ``epochs`` and
        ``batch_size``. ``train`` must expose a ``shape`` attribute.
    """
    if not find:
        return models_results('AutoRecV2')

    import os

    # Create the output folder up front: otherwise a missing directory only
    # surfaces after a full training run, when the results are written.
    os.makedirs('models_params', exist_ok=True)

    # The input size does not depend on the grid point, so compute it once.
    # NOTE(review): `num_features` is the number of rows of `train`; confirm
    # this is the input dimension AutoRecV2 expects.
    num_users = train.shape[0]

    for num_hidden in [100, 200, 500, 600]:
        for lr in [0.001, 0.003, 0.005]:
            for reg in [0.001, 0.003, 0.005]:
                print(f'STARTING TRAIN AutoRecV2 with num_hidden: {num_hidden}, lr: {lr}, reg: {reg}')
                start_time = time()

                # Init model and trainer
                model = AutoRecV2(num_hidden=num_hidden, num_features=num_users)
                autorec_trainer = AutoRecV2Trainer(train, test, model, epochs=epochs, batch_size=batch_size, lr=lr, reg=reg)

                # Train
                autorec_trainer.train_model()

                # Collect model params, trainer params and loss histories
                model_dict = {'model_params': model.params()}
                model_dict.update(autorec_trainer.params())
                model_dict['train_losses'] = autorec_trainer.train_losses
                model_dict['val_losses'] = autorec_trainer.val_losses

                # Save the run to a JSON file named after its grid point
                file_name = f'models_params/AutoRecV2_{num_hidden}_{lr}_{reg}'
                with open(file_name, 'w') as f:
                    json.dump(model_dict, f, indent=4)

                plot_train_history(autorec_trainer.train_losses, autorec_trainer.val_losses)
                print(f'model train time: {time() - start_time:.3f}\n')
    return


In [18]:
# Load the previously saved AutoRecV2 runs (no training) and list them by
# validation loss, lowest first.
autorecv2_results = autorecv2_hyper_params(find=False)
autorecv2_results.sort_values(by='val_loss')

Unnamed: 0,model_name,train_loss,val_loss,params
0,AutoRecV2_100_0.001_0.001,0.826126,0.872967,"{'model_params': {'num_hidden': 100}, 'seed': ..."
1,AutoRecV2_100_0.001_0.003,0.944537,0.964306,"{'model_params': {'num_hidden': 100}, 'seed': ..."
