In [11]:
import copy
import itertools
import os
import random
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

def fix_random(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: Value handed to every random number generator.
    """
    # PyTorch generators.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Python's and NumPy's global generators.
    random.seed(seed)
    np.random.seed(seed)

    # Favour repeatability over speed in cuDNN (deterministic mode is slower).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed every RNG once, before any data is split or any model is built.
fix_random(42)

# Root folder for this notebook's artifacts. exist_ok=True keeps the cell safe to
# re-run and avoids the check-then-create gap of an explicit os.path.exists test.
os.makedirs('tabularML', exist_ok=True)

In [12]:
# Load the raw dataset into a DataFrame.
df = pd.read_csv('dataset.csv')

# Choose the PyTorch device: CUDA first, then MPS, falling back to the CPU.
# The conditional expression is evaluated left to right, so the MPS check only
# runs when CUDA is unavailable.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)
print('Using device:', device)

Using device: cuda


In [13]:
# Remove the 'year' and 'rating_count' columns from the working frame.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' lets this
# cell be re-run on an already-trimmed frame without raising KeyError.
df = df.drop(columns=['year', 'rating_count'], errors='ignore')

In [14]:
# Separate the regression target ('rating') from the feature columns.
Y = df['rating']
X = df.drop('rating', axis=1)

# Hold out 20% of the rows for testing, then 10% of the remainder for validation.
X_trainval, X_test, Y_trainval, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_trainval, Y_trainval, test_size=0.1, random_state=42)

# Convert every split to a NumPy array.
X_train, X_val, X_test = (part.to_numpy() for part in (X_train, X_val, X_test))
# Training and validation targets become column vectors; the test target stays 1-D.
Y_train = Y_train.to_numpy().reshape(-1, 1)
Y_val = Y_val.to_numpy().reshape(-1, 1)
Y_test = Y_test.to_numpy()

# Report the size of each split and the feature count.
n_train, n_features = X_train.shape
print(f'Number of training samples: {n_train}')
print(f'Number of validation samples: {X_val.shape[0]}')
print(f'Number of testing samples: {X_test.shape[0]}')
print(f'\nNumber of features: {n_features}')

Number of training samples: 9934
Number of validation samples: 1104
Number of testing samples: 2760

Number of features: 1148


In [21]:
# Hyperparameter grid: each list holds the candidate values for one setting.
nums_epochs = [100]
batch_sizes = [64]
patience = [10]  # NOTE(review): defined here but not part of the product below
n_d_a = [16]
n_shared = [3]
n_indipendent = [3]
n_step = [5]
gamma = [1.3]
epsilon = [1e-8]

# Materialise the Cartesian product as a list: a bare itertools.product object is a
# one-shot iterator, so a second pass over it (e.g. re-running the training cell)
# would silently yield nothing.
# Tuple order: (n_d_a, n_step, n_indipendent, n_shared, gamma, epsilon, nums_epochs, batch_sizes)
hyperparameters = list(itertools.product(n_d_a, n_step, n_indipendent, n_shared, gamma, epsilon, nums_epochs, batch_sizes))
# Derive the count from the grid itself so it cannot drift from the product above.
n_comb = len(hyperparameters)
print (f'Number of hyperparameter combinations: {n_comb}')





Number of hyperparameter combinations: 1


In [22]:
def get_model(n_d_a, n_step, n_indipendent, n_shared, gamma, epsilon, seed=42):
    """Build an untrained TabNetRegressor for one hyperparameter combination.

    Parameter meanings follow the pytorch-tabnet documentation; verify them against
    the installed version if in doubt.

    Args:
        n_d_a: Width used for both ``n_d`` (decision prediction layer) and ``n_a``
            (attention embedding); the two are deliberately tied to one value.
        n_step: Number of sequential decision steps, forwarded as ``n_steps``.
        n_indipendent: Number of independent GLU layers per step, forwarded as
            ``n_independent``.
        n_shared: Number of shared GLU layers per step.
        gamma: Coefficient for feature reuse in the attention masks.
        epsilon: Small numerical-stability constant forwarded as ``epsilon``.
        seed: Random seed handed to the model. Defaults to 42, the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        A new, unfitted ``TabNetRegressor`` optimised with ``torch.optim.Adam``.
    """
    model = TabNetRegressor(
        # One width is shared by the decision and attention representations.
        n_d=n_d_a,
        n_a=n_d_a,
        n_steps=n_step,
        gamma=gamma,
        optimizer_fn=torch.optim.Adam,
        n_independent=n_indipendent,
        n_shared=n_shared,
        epsilon=epsilon,
        seed=seed,
    )
    return model

In [23]:
# Start every sweep from an empty TensorBoard log directory.
# shutil.rmtree replaces the shell `rm -r` (portable, no subprocess), and the
# directory is now recreated in both cases instead of only when it was missing.
if os.path.exists('tabularML/training'):
    shutil.rmtree('tabularML/training')
os.makedirs('tabularML/training')

# A bare itertools.product object can be consumed only once; keeping a list means
# re-running this cell iterates the same grid again instead of looping over nothing.
hyperparameters = list(hyperparameters)

current_iter = 0

# Model selection uses the validation split; the test split is only reported.
best_val_mse = float('inf')   # validation MSE of the selected model
best_mse = float('inf')       # test MSE of the selected model
best_r2 = float('nan')        # test R2 of the selected model
best_model = None
best_n_d = None
best_n_a = None
best_n_step = None
best_n_indipendent = None
best_n_shared = None
best_gamma = None
best_epsilon = None
best_nums_epochs = None
best_batch_size = None

# NOTE: the loop variables reuse the names of the grid lists, so after the loop those
# names hold the scalars of the last combination rather than the candidate lists.
for n_d_a, n_step,n_indipendent,n_shared, gamma, epsilon,nums_epochs, batch_sizes in hyperparameters:
    current_iter += 1

    print("\nIterations {}/{} - Hyperparameters:  batch_sizes={}, nums_epochs={}, n_d={}, n_a={}, n_step={}, n_indipendent={}, n_shared={}, gamma={}, epsilon={}".format(
        current_iter, n_comb, batch_sizes, nums_epochs, n_d_a, n_d_a, n_step, n_indipendent, n_shared, gamma, epsilon ))

    model = get_model(n_d_a, n_step, n_indipendent, n_shared, gamma, epsilon)

    # One TensorBoard run directory per hyperparameter combination.
    log_name = "batch_size="+str(batch_sizes)+"n_d="+str(n_d_a)+"n_a="+str(n_d_a)+"n_step="+str(n_step)+"n_indipendent="+str(n_indipendent)+"n_shared="+str(n_shared)+"gamma="+str(gamma)+"epsilon="+str(epsilon)

    writer = SummaryWriter('tabularML/training/'+log_name)
    try:
        # Train, monitoring MSE on the validation split for early stopping.
        model.fit(
            X_train=X_train,
            y_train=Y_train,
            eval_set=[(X_val, Y_val)],
            eval_metric=['mse'],
            # patience: epochs without validation improvement before early stopping
            patience=10,
            # batch_size: samples per optimisation batch
            batch_size=batch_sizes,
            # virtual_batch_size: samples per virtual (ghost batch-norm) batch
            virtual_batch_size=128,
            # num_workers: worker processes used for data loading
            num_workers=0,
            # drop_last: keep the final, possibly smaller, batch
            drop_last=False,
            # max_epochs: upper bound on training epochs
            max_epochs=nums_epochs,
        )

        # Validation error drives model selection; test error is computed once per
        # run purely for reporting, so the test split never influences the choice.
        val_mse = mean_squared_error(Y_val, model.predict(X_val))
        preds = model.predict(X_test)
        mse = mean_squared_error(Y_test, preds)
        r2 = r2_score(Y_test, preds)

        writer.add_hparams({'n_d':n_d_a, 'n_a':n_d_a, 'n_step':n_step, 'n_indipendent':n_indipendent, 'n_shared':n_shared, 'gamma':gamma, 'epsilon':epsilon, 'batch_sizes':batch_sizes, 'nums_epochs':nums_epochs }, {'hparam/mse': mse, 'hparam/val_mse': val_mse})
        writer.flush()
    finally:
        # Release the event-file handle even if training or evaluation fails.
        writer.close()

    if val_mse < best_val_mse:
        best_val_mse = val_mse
        best_mse = mse
        best_r2 = r2
        best_n_d = n_d_a
        best_n_a = n_d_a
        best_n_step = n_step
        best_n_indipendent = n_indipendent
        best_n_shared = n_shared
        best_gamma = gamma
        best_epsilon = epsilon
        best_nums_epochs = nums_epochs
        best_batch_size = batch_sizes
        best_model = copy.deepcopy(model)

    # The best-model metrics are cached above, so nothing is re-predicted here and the
    # prints still work if no run has improved on the initial values yet.
    print("Model MSE: {:.6f} - Best MSE: {:.6f}".format(mse, best_mse))
    print("Model R2 Score: {:.6f} - Best R2 Score: {:.6f}".format(r2, best_r2))



Iterations 1/1 - Hyperparameters:  batch_sizes=64, nums_epochs=100, n_d=16, n_a=16, n_step=5, n_indipendent=3, n_shared=3, gamma=1.3, epsilon=1e-08




epoch 0  | loss: 0.73965 | val_0_mse: 0.17328 |  0:00:10s
epoch 1  | loss: 0.16103 | val_0_mse: 0.1443  |  0:00:20s
epoch 2  | loss: 0.1406  | val_0_mse: 0.13664 |  0:00:29s
epoch 3  | loss: 0.12008 | val_0_mse: 0.13728 |  0:00:39s
epoch 4  | loss: 0.09625 | val_0_mse: 0.10717 |  0:00:49s
epoch 5  | loss: 0.07152 | val_0_mse: 0.07017 |  0:00:59s
epoch 6  | loss: 0.05482 | val_0_mse: 0.04687 |  0:01:09s
epoch 7  | loss: 0.0521  | val_0_mse: 0.04121 |  0:01:20s
epoch 8  | loss: 0.03655 | val_0_mse: 0.03134 |  0:01:30s
epoch 9  | loss: 0.03102 | val_0_mse: 0.02541 |  0:01:41s
epoch 10 | loss: 0.03408 | val_0_mse: 0.02227 |  0:01:52s
epoch 11 | loss: 0.03802 | val_0_mse: 0.02353 |  0:02:02s
epoch 12 | loss: 0.02482 | val_0_mse: 0.02529 |  0:02:11s
epoch 13 | loss: 0.02963 | val_0_mse: 0.02298 |  0:02:20s
epoch 14 | loss: 0.02449 | val_0_mse: 0.03254 |  0:02:29s
epoch 15 | loss: 0.02597 | val_0_mse: 0.01613 |  0:02:39s
epoch 16 | loss: 0.03027 | val_0_mse: 0.03065 |  0:02:48s
epoch 17 | los



Model MSE: 0.007745 - Best MSE: 0.007745
Model R2 Score: 0.965665 - Best R2 Score: 0.965665
