# ML_25M Deep ML

In [1]:
import itertools
import json
import os
import random
import shutil
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from imblearn.over_sampling import SMOTE
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from torch.utils.tensorboard import SummaryWriter
# from tab_transformer_pytorch import TabTransformer
# from pytorch_tabular import TabularModel
# from pytorch_tabular.models import CategoryEmbeddingModelConfig
# from pytorch_tabular.config import (
#     DataConfig,
#     OptimizerConfig,
#     TrainerConfig,
# )

def fix_random(seed: int) -> None:
    """Seed every random number generator used in this notebook.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and the current
    CUDA device), then switches cuDNN to deterministic mode.

    Args:
        seed: the seed shared by all generators.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seed_generator in seeders:
        seed_generator(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    # Deterministic kernels trade speed for reproducibility.
    cudnn.deterministic = True


# Seed all RNGs before anything stochastic happens, then load the raw data.
fix_random(42)
df = pd.read_csv('dataset.csv')

# Pick the PyTorch device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device_name = 'cuda'
elif torch.backends.mps.is_available():
    device_name = 'mps'
else:
    device_name = 'cpu'
device = torch.device(device_name)

print('Using device:', device)


# When True, artefacts of earlier runs (TensorBoard logs, per-run checkpoints,
# the best-model checkpoint and its config) are removed before training.
delete_previous = True

if delete_previous:
    for stale_path in ('runs', 'models', 'best_model.pth', 'best_model_config.json'):
        try:
            if os.path.isdir(stale_path) and not os.path.islink(stale_path):
                # Directory tree: remove it recursively.
                shutil.rmtree(stale_path)
            elif os.path.lexists(stale_path):
                # Regular file or symlink.
                os.remove(stale_path)
        except OSError as err:
            # Best effort: report the failure instead of hiding it, but keep going.
            print(f'Could not remove {stale_path}: {err}')

Using device: mps


# Data preprocessing

In [2]:
# Features are every column except the target column 'rating'.
X = df.drop('rating', axis=1)
Y = df['rating']

# Encode the rating values as consecutive integer labels 0..K-1.
Y = LabelEncoder().fit_transform(Y)

# Hold out 20% for testing, then 10% of the remainder for validation.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

X_train = X_train.to_numpy()
X_val = X_val.to_numpy()
X_test = X_test.to_numpy()

# Optional experiments, disabled by default. Each must be fitted on the
# training split only and then applied to all three splits.

# PCA
# pca = PCA(n_components=2)
# pca.fit(X_train)
# X_train = pca.transform(X_train)
# X_val = pca.transform(X_val)
# X_test = pca.transform(X_test)

# LDA
# lda = LinearDiscriminantAnalysis()
# lda.fit(X_train, Y_train)
# X_train = lda.transform(X_train)
# X_val = lda.transform(X_val)
# X_test = lda.transform(X_test)

# # # smote
# sm = SMOTE(random_state=42)
# X_train, Y_train = sm.fit_resample(X_train, Y_train)


print(f'Number of training samples: {X_train.shape[0]}')
print(f'Number of validation samples: {X_val.shape[0]}')
print(f'Number of testing samples: {X_test.shape[0]}')
print(f'\nNumber of features: {X_train.shape[1]}')


def make_eval_loader(features, labels, batch_size=64):
    """Wrap a feature matrix and label vector in a non-shuffling DataLoader.

    Evaluation does not benefit from shuffling, and shuffling would draw from
    the global torch RNG on every pass, making the training-batch order depend
    on how many validation passes have run.

    Args:
        features: 2-D array-like of numeric features.
        labels: 1-D array-like of integer labels.
        batch_size: number of samples per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding (float32 features, int64 labels).
    """
    dataset = torch.utils.data.TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.long),
    )
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)


val_loader = make_eval_loader(X_val, Y_val)
test_loader = make_eval_loader(X_test, Y_test)

Number of training samples: 9946
Number of validation samples: 1106
Number of testing samples: 2764

Number of features: 1128


In [3]:
# Empty comparison table; one row per evaluated model is appended later.
RESULT_COLUMNS = ['Model', 'Accuracy']
results = pd.DataFrame(columns=RESULT_COLUMNS)

# Neural Network

In [4]:
def get_model(input_size, hidden_size, dropout_prob=0, depth=1, output_size=1):
    """Build a fully connected feed-forward network.

    The network is: an input block (Linear -> ReLU -> Dropout), followed by
    ``depth`` additional hidden blocks of the same shape, followed by a final
    Linear layer with no activation. Note that ``depth`` counts the hidden
    blocks *after* the input block, so the total number of hidden layers is
    ``depth + 1``.

    Args:
        input_size: number of input features.
        hidden_size: width of every hidden layer.
        dropout_prob: dropout probability applied after every hidden ReLU.
        depth: number of hidden blocks added after the input block.
        output_size: width of the final layer. The default of 1 gives a
            single-value regression head; pass the number of classes to get
            one logit per class.

    Returns:
        A ``torch.nn.Sequential`` containing ``3 * (depth + 1) + 1`` modules.
    """
    def hidden_block(in_features):
        # One Linear -> ReLU -> Dropout stage.
        return [
            torch.nn.Linear(in_features, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout_prob),
        ]

    layers = hidden_block(input_size)
    for _ in range(depth):
        layers.extend(hidden_block(hidden_size))

    layers.append(torch.nn.Linear(hidden_size, output_size))

    return torch.nn.Sequential(*layers)


In [5]:
# hyperparameters
hidden_sizes = [32, 64, 128]
nums_epochs = [200]
depth = [3, 4, 5]
batch_sizes = [16, 32, 64, 128]
learning_rate = [0.1, 0.01]
step_size_lr_decay = [10, 20]
momentum = [0.9]
dropout_prob = 0.3
patience = 10

# Materialise the grid as a list. A bare itertools.product is a one-shot
# iterator, so a second run of the search cell would silently loop over nothing.
# Each entry is (hidden_size, depth, num_epochs, batch, lr, step_size, momentum).
hyperparameters = list(itertools.product(hidden_sizes, depth, nums_epochs, batch_sizes, learning_rate, step_size_lr_decay, momentum))
n_comb = len(hyperparameters)
print (f'Number of hyperparameter combinations: {n_comb}')

Number of hyperparameter combinations: 144


In [6]:
# training process
def train_model(model, criterion, optimizer, scheduler, epochs, data_loader, val_loader, device, writer):
    """Train ``model`` with early stopping and return it with its best weights.

    After every epoch the validation loss is computed. Whenever it improves,
    the weights are checkpointed to ``models/best_model.pth``. Training stops
    early once the validation loss has failed to improve for more than
    ``patience`` consecutive epochs (``patience`` is read from module scope).

    Args:
        model: the network to train; updated in place.
        criterion: callable ``criterion(predictions, targets)`` returning a scalar loss tensor.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped once per epoch.
        epochs: maximum number of epochs.
        data_loader: iterable of ``(data, targets)`` training batches.
        val_loader: iterable of ``(data, targets)`` validation batches.
        device: device the batches are moved to.
        writer: object exposing ``add_scalar(tag, value, step)`` for logging.

    Returns:
        ``model``, with the best checkpoint of this call restored if one was written.
    """
    checkpoint_dir = 'models'
    checkpoint_path = os.path.join(checkpoint_dir, 'best_model.pth')
    # Do not rely on the caller having created the checkpoint directory.
    os.makedirs(checkpoint_dir, exist_ok=True)

    n_iter = 0

    best_val_loss = float('inf')
    epochs_since_last_improvement = 0
    # Tracks whether THIS call wrote a checkpoint, so a stale file from an
    # earlier run (possibly a different architecture) is never restored.
    checkpoint_saved = False

    start = time.time()

    for epoch in range(epochs):
        model.train()

        start_epoch = time.time()
        # Last training-batch loss of the epoch; stays NaN if the loader is empty.
        train_loss = float('nan')

        for data, targets in data_loader:
            data, targets = data.to(device), targets.to(device)

            optimizer.zero_grad()

            # Forward pass
            y_pred = model(data)

            # Compute Loss
            loss = criterion(y_pred, targets)

            # Backward pass
            loss.backward()
            optimizer.step()

            train_loss = loss.item()
            writer.add_scalar("Loss/train", train_loss, n_iter)

            n_iter += 1

        scheduler.step()

        # Compute Val Loss (already a Python float after .item())
        labels, _, y_pred = test_model(model, val_loader, device)
        val_loss = criterion(y_pred, labels).item()
        writer.add_scalar("Loss/val", val_loss, epoch)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            epochs_since_last_improvement = 0
        elif epochs_since_last_improvement >= patience:
            break
        else:
            epochs_since_last_improvement += 1

        print('Epoch [{}/{}] - {:.2f} seconds - loss: {:.6f} - val_loss: {:.6f} - patience: {}'.format(epoch+1,
              epochs, time.time() - start_epoch, train_loss, val_loss, epochs_since_last_improvement), end='\r')

    print('\nTraining ended after {:.2f} seconds'.format(time.time() - start))

    # Restore best model. Skipped if the validation loss never improved
    # (e.g. it was NaN from the start or epochs == 0).
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    return model

In [7]:
# evaluation process
def test_model(model, data_loader, device):
    model.eval()
    y_pred = []
    y_test = []

    with torch.no_grad():
        for data, targets in data_loader:
            data, targets = data.to(device), targets.to(device)

            y_pred.append(model(data))
            y_test.append(targets)

    y_pred = torch.cat(y_pred, dim=0)
    y_test = torch.cat(y_test, dim=0)

    y_pred_c = torch.argmax(y_pred, dim=1)

    return y_test, y_pred_c, y_pred


In [8]:
current_iter = 0

best_model = None
best_mse = float('inf')


def regression_mse(y_pred, y_true):
    """Mean squared error between single-value predictions and targets.

    Both tensors are flattened and the targets are cast to the prediction
    dtype. Without this, an (N, 1) prediction against an (N,) integer target
    would broadcast to an (N, N) matrix and mix float and integer dtypes.
    """
    return torch.nn.functional.mse_loss(y_pred.reshape(-1), y_true.reshape(-1).to(y_pred.dtype))


def evaluate_regression(model, loader):
    """Return ``(mse, r2)`` of ``model`` on ``loader`` as Python floats."""
    y_true, _, y_pred = test_model(model, loader, device)
    mse_value = regression_mse(y_pred, y_true).item()
    r2_value = float(r2_score(y_true.cpu().numpy(), y_pred.reshape(-1).cpu().numpy()))
    return mse_value, r2_value


os.makedirs('models', exist_ok=True)

# Resume: if an earlier run left a best model behind, it is the score to beat.
if os.path.exists('best_model.pth') and os.path.exists('best_model_config.json'):
    # read best model config
    with open('best_model_config.json', 'r') as f:
        best_model_config = json.load(f)

    # load best model
    best_model = get_model(X_train.shape[1], best_model_config['hidden_size'], dropout_prob, best_model_config['depth'])
    best_model.load_state_dict(torch.load('best_model.pth', map_location=device))
    best_model.to(device)
    best_model.eval()

    # evaluate best model on the validation split (same split used for selection below)
    best_mse, best_r2 = evaluate_regression(best_model, val_loader)

    print("Best model - MSE: {:.6f}".format(best_mse))

# The training tensors do not depend on the hyperparameters, so build them once.
train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(Y_train, dtype=torch.long))

# Loop variables are named so they do not overwrite the `depth` and `momentum`
# hyperparameter lists defined with the grid.
for hidden_size, net_depth, num_epochs, batch, lr, step_size, sgd_momentum in hyperparameters:
    current_iter += 1

    print("\nIterations {}/{} - Training with hidden_size={}, depth={}, num_epochs={}, batch={}, lr={}, step_size={}, momentum={}".format(
        current_iter, n_comb, hidden_size, net_depth, num_epochs, batch, lr, step_size, sgd_momentum))
    log_name = "dim:"+str(hidden_size)+"_depth:"+str(net_depth)+"_epochs:"+str(num_epochs)+"_batch:" + \
        str(batch)+"_lr:"+str(lr)+"_step_size:" + \
        str(step_size)+"_momentum:"+str(sgd_momentum)

    # A TensorBoard run directory doubles as the "already trained" marker.
    if os.path.exists('runs/'+log_name):
        print("Model already trained, skipping...")
        continue

    # start tensorboard
    writer = SummaryWriter('runs/'+log_name)

    try:
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch, shuffle=True)

        model = get_model(X_train.shape[1], hidden_size, dropout_prob, depth=net_depth)
        model = model.to(device)

        criterion = regression_mse
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=sgd_momentum)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.1)

        # train
        model = train_model(model, criterion, optimizer, scheduler, num_epochs, train_loader, val_loader, device, writer)

        # validate: model selection uses the validation split, so the test
        # split stays untouched until the final evaluation cell.
        mse, r2 = evaluate_regression(model, val_loader)

        writer.add_hparams({'hidden_size': hidden_size, 'depth': net_depth, 'num_epochs': num_epochs, 'batch': batch,'lr': lr, 'step_size': step_size, 'momentum': sgd_momentum}, {'hparam/mse': mse, 'hparam/r2': r2})

        if mse < best_mse:
            best_mse = mse
            best_model = model
            torch.save(model.state_dict(), 'best_model.pth')
            # save config
            with open('best_model_config.json', 'w') as f:
                json.dump({'hidden_size': hidden_size, 'depth': net_depth, 'num_epochs': num_epochs, 'batch': batch,
                           'lr': lr, 'step_size': step_size, 'momentum': sgd_momentum}, f)

        print("Model MSE: {:.6f} - Best MSE: {:.6f}".format(mse, best_mse))

        torch.save(model.state_dict(), 'models/'+log_name+'.pth')
        writer.flush()
    finally:
        # Each run owns its writer; close it even if training fails.
        writer.close()


Iterations 1/144 - Training with hidden_size=32, depth=3, num_epochs=200, batch=16, lr=0.1, step_size=10, momentum=0.9


: 

: 

In [None]:
# Final evaluation of the selected network on the held-out test split.
best_model.eval()
y_test, y_pred_c, y_pred = test_model(best_model, test_loader, device)

true_labels = y_test.cpu()
predicted_labels = y_pred_c.cpu()
print(classification_report(true_labels, predicted_labels, zero_division=0))
accuracy = accuracy_score(true_labels, predicted_labels)

# Append this model's score to the comparison table.
nn_row = pd.DataFrame({'Model': 'NeuralNetwork', 'Accuracy': accuracy}, index=[0])
results = pd.concat([results, nn_row], ignore_index=True)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         9
           2       0.79      0.90      0.84        80
           3       0.80      0.88      0.84       284
           4       0.87      0.87      0.87       804
           5       0.92      0.80      0.86      1159
           6       0.72      0.93      0.81       426
           7       0.00      0.00      0.00         1

    accuracy                           0.85      2764
   macro avg       0.51      0.55      0.53      2764
weighted avg       0.86      0.85      0.85      2764



# TabNet (attentive tabular network)

In [None]:
# Define the TabNet model.
# NOTE(review): parameter descriptions and defaults below follow the
# pytorch-tabnet README; confirm them against the installed version.
tabnet = TabNetClassifier(
    # n_d: width of the decision prediction layer (README default: 8)
    n_d=16,
    # n_a: width of the attention embedding for each mask (README default: 8)
    n_a=16,
    # n_steps: number of sequential decision steps in the architecture (README default: 3)
    n_steps=4,
    # gamma: coefficient for feature reuse in the masks; values close to 1
    # make mask selection less correlated between steps (README default: 1.3)
    gamma=1.5,
    # n_independent: number of independent Gated Linear Unit layers at each step (README default: 2)
    n_independent=2,
    # n_shared: number of shared Gated Linear Unit layers at each step (README default: 2)
    n_shared=2,
    # epsilon: small numerical-stability constant (README default: 1e-15)
    epsilon=1e-15,
    seed=42,  # seed: random seed for reproducibility (README default: 0)
)

# Train the model on the training split, monitoring the validation split.
tabnet.fit(
    X_train=X_train,
    y_train=Y_train,
    eval_set=[(X_val, Y_val)],
    # patience: number of consecutive epochs without improvement on the eval set before early stopping
    patience=10,
    # batch_size: the number of samples per batch
    batch_size=1024,
    # virtual_batch_size: size of the mini-batches used for ghost batch normalization
    virtual_batch_size=128,
    # num_workers: the number of worker processes to use for data loading
    num_workers=0,
    # drop_last: whether to drop the last incomplete batch if the dataset size is not divisible by the batch size
    drop_last=False,
    # max_epochs: the maximum number of epochs to train for (README default: 200)
    max_epochs=100,
)




epoch 0  | loss: 2.20964 | val_0_accuracy: 0.42586 |  0:00:02s
epoch 1  | loss: 1.61665 | val_0_accuracy: 0.40958 |  0:00:05s
epoch 2  | loss: 1.41317 | val_0_accuracy: 0.41139 |  0:00:07s
epoch 3  | loss: 1.285   | val_0_accuracy: 0.4141  |  0:00:10s
epoch 4  | loss: 1.19069 | val_0_accuracy: 0.4123  |  0:00:13s
epoch 5  | loss: 1.09648 | val_0_accuracy: 0.41049 |  0:00:15s
epoch 6  | loss: 1.02561 | val_0_accuracy: 0.42586 |  0:00:18s
epoch 7  | loss: 0.94954 | val_0_accuracy: 0.43219 |  0:00:20s
epoch 8  | loss: 0.916   | val_0_accuracy: 0.42315 |  0:00:23s
epoch 9  | loss: 0.88823 | val_0_accuracy: 0.42767 |  0:00:26s
epoch 10 | loss: 0.87582 | val_0_accuracy: 0.43128 |  0:00:28s
epoch 11 | loss: 0.82223 | val_0_accuracy: 0.51266 |  0:00:31s
epoch 12 | loss: 0.8066  | val_0_accuracy: 0.48101 |  0:00:34s
epoch 13 | loss: 0.78798 | val_0_accuracy: 0.46564 |  0:00:36s
epoch 14 | loss: 0.7668  | val_0_accuracy: 0.45389 |  0:00:39s
epoch 15 | loss: 0.75445 | val_0_accuracy: 0.50723 |  0



In [None]:
# evaluate the model on the held-out test split
y_pred = tabnet.predict(X_test)
# zero_division=0 matches the neural-network evaluation cell and silences the
# undefined-metric warnings for classes that are never predicted.
print(classification_report(Y_test, y_pred, zero_division=0))
accuracy = accuracy_score(Y_test, y_pred)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      0.22      0.33         9
           2       0.82      0.53      0.64        80
           3       0.79      0.59      0.68       284
           4       0.78      0.89      0.83       804
           5       0.84      0.90      0.87      1159
           6       0.90      0.71      0.79       426
           7       0.00      0.00      0.00         1

    accuracy                           0.82      2764
   macro avg       0.60      0.48      0.52      2764
weighted avg       0.82      0.82      0.82      2764



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
