# ML_25M Deep ML

In [1]:
import torch
import time
import os
import copy
import json
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from torch.utils.tensorboard import SummaryWriter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import itertools
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# from tab_transformer_pytorch import TabTransformer
from pytorch_tabnet.tab_model import TabNetClassifier
# from pytorch_tabular import TabularModel
# from pytorch_tabular.models import CategoryEmbeddingModelConfig
# from pytorch_tabular.config import (
#     DataConfig,
#     OptimizerConfig,
#     TrainerConfig,
# )

def fix_random(seed: int) -> None:
    """Fix all the possible sources of randomness.

    Args:
        seed: the seed to use. 
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower

fix_random(42)



Using device: mps


In [None]:
df = pd.read_csv('dataset.csv')

# PyTorch Device
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

print('Using device:', device)

In [None]:
delete_previous = True

if delete_previous:
    try:
        os.system('rm -rf runs')
        os.system('rm -rf models')
        os.system('rm -rf best_model.pth')
        os.system('rm -rf best_model_config.json')
    except:
        pass

# Data preprocessing

In [2]:
X = df.drop('rating', axis=1)
Y = df['rating']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

X_train = X_train.to_numpy()
X_val = X_val.to_numpy()
X_test = X_test.to_numpy()
Y_train = Y_train.to_numpy()
Y_val = Y_val.to_numpy()
Y_test = Y_test.to_numpy()

# PCA
pca = PCA(n_components=0.95)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_val = pca.transform(X_val)
X_test = pca.transform(X_test)

print(f'Number of training samples: {X_train.shape[0]}')
print(f'Number of validation samples: {X_val.shape[0]}')
print(f'Number of testing samples: {X_test.shape[0]}')
print(f'\nNumber of features: {X_train.shape[1]}')

val_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(Y_val, dtype=torch.float32)), batch_size=Y_val.shape[0], shuffle=False)
test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(Y_test, dtype=torch.float32)), batch_size=Y_test.shape[0], shuffle=False)

Number of training samples: 9946
Number of validation samples: 1106
Number of testing samples: 2764

Number of features: 552


In [3]:
results = pd.DataFrame(columns=['Model', 'Accuracy'])

# Neural Network

In [4]:
def get_model(input_size, hidden_size, dropout_prob=0, depth=1):
    model = [
        torch.nn.Linear(input_size, hidden_size),
        torch.nn.ReLU(),
        torch.nn.Dropout(dropout_prob)
    ]

    for i in range(depth):
        model.append(torch.nn.Linear(hidden_size, hidden_size))
        model.append(torch.nn.ReLU())
        model.append(torch.nn.Dropout(dropout_prob))

    model.append(torch.nn.Linear(hidden_size, 1))

    return torch.nn.Sequential(*model)


In [5]:
# hyperparameters
hidden_sizes =  [64, 128, 256]
nums_epochs = [200]
depth = [3, 4, 5]
batch_sizes = [32, 64, 128]
learning_rate = [0.1, 0.01]
step_size_lr_decay = [10, 20]
momentum = [0.9]
dropout_prob = 0.3
patience = 10

hyperparameters = itertools.product(hidden_sizes, depth, nums_epochs, batch_sizes, learning_rate, step_size_lr_decay, momentum)
n_comb = len(hidden_sizes)*len(depth)*len(nums_epochs)*len(batch_sizes)*len(learning_rate)*len(step_size_lr_decay)*len(momentum)
print (f'Number of hyperparameter combinations: {n_comb}')

Number of hyperparameter combinations: 108


In [6]:
def test_model(model, criterion, loader):
    loss = 0
    y_pred = torch.tensor([]).to(device)
    y_true = torch.tensor([]).to(device)
    i = 0

    for _, (x, y) in enumerate(loader):
        i += 1
        x = x.to(device)
        y = y.to(device)
        output = model(x)
        loss += criterion(output.squeeze(), y)
        y_pred = torch.cat((y_pred, output), 0)
        y_true = torch.cat((y_true, y), 0)

    return loss /i, y_pred.squeeze(), y_true


In [7]:
# training process
def train_model(model, criterion, optimizer, scheduler, epochs, data_loader, val_loader, device, writer, log_name):
    n_iter = 0

    best_model = None
    best_val_loss = float('inf')
    epochs_since_last_improvement = 0

    start = time.time()

    for epoch in range(epochs):
        model.train()

        start_epoch = time.time()

        for data, targets in data_loader:
            data, targets = data.to(device), targets.to(device)

            # Forward pass
            y_pred = model(data)

            # Compute Loss
            loss = criterion(y_pred.squeeze(), targets)
            optimizer.zero_grad()
            writer.add_scalar("Loss/train", loss, n_iter)

            # Backward pass
            loss.backward()
            optimizer.step()

            n_iter += 1

        scheduler.step()

        # Compute Val Loss
        val_loss, _, _ = test_model(model, criterion, val_loader)
        writer.add_scalar("Loss/val", val_loss, epoch)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = copy.deepcopy(model)
            epochs_since_last_improvement = 0
        elif epochs_since_last_improvement >= patience:
            break
        else:
            epochs_since_last_improvement += 1

        print('Epoch [{}/{}] - {:.2f} seconds - val_loss: {:.6f} - patience: {}'.format(epoch+1,
              epochs, time.time() - start_epoch, val_loss, epochs_since_last_improvement), end='\r')

    print('\nTraining ended after {:.2f} seconds - Best val_loss: {:.6f}'.format(time.time() - start, best_val_loss))

    return best_model

In [8]:
current_iter = 0

best_model = None
best_mse = float('inf')
criterion = torch.nn.MSELoss()


if os.path.exists('best_model.pth'):
    # read best model config
    with open('best_model_config.json', 'r') as f:
        best_model_config = json.load(f)

    # load best model
    best_model = get_model(X_train.shape[1], best_model_config['hidden_size'], dropout_prob, best_model_config['depth'])
    best_model.load_state_dict(torch.load('best_model.pth'))
    best_model.to(device)
    best_model.eval()

    # evaluate best model
    best_mse, _, _ = test_model(best_model, criterion, test_loader)
    
    print("Best model - MSE: {:.6f}".format(best_mse))

for hidden_size, depth, num_epochs, batch, lr, step_size, momentum in hyperparameters:
    current_iter += 1

    if not os.path.exists('models'):
        os.makedirs('models')

    print("\nIterations {}/{} - Training with hidden_size={}, depth={}, num_epochs={}, batch={}, lr={}, step_size={}, momentum={}".format(
        current_iter, n_comb, hidden_size, depth, num_epochs, batch, lr, step_size, momentum))
    log_name = "dim:"+str(hidden_size)+"_depth:"+str(depth)+"_epochs:"+str(num_epochs)+"_batch:" + \
        str(batch)+"_lr:"+str(lr)+"_step_size:" + \
        str(step_size)+"_momentum:"+str(momentum)

    if os.path.exists('runs/'+log_name):
        print("Model already trained, skipping...")
        continue

    # start tensorboard
    writer = SummaryWriter('runs/'+log_name)

    train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(Y_train, dtype=torch.float32)), batch_size=batch, shuffle=True)

    model = get_model(X_train.shape[1], hidden_size, dropout_prob, depth=depth)
    model = model.to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.1)

    # train
    model = train_model(model, criterion, optimizer, scheduler, num_epochs, train_loader, val_loader, device, writer, log_name)

    # validate model on test set
    mse, _, _ = test_model(model, criterion, test_loader)
    writer.add_hparams({'hidden_size': hidden_size, 'depth': depth, 'num_epochs': num_epochs, 'batch': batch,'lr': lr, 'step_size': step_size, 'momentum': momentum}, {'hparam/mse': mse})

    if mse < best_mse:
        best_mse = mse
        best_model = model
        torch.save(model.state_dict(), 'best_model.pth')
        # save config
        with open('best_model_config.json', 'w') as f:
            json.dump({'hidden_size': hidden_size, 'depth': depth, 'num_epochs': num_epochs, 'batch': batch,
                       'lr': lr, 'step_size': step_size}, f)
            
            
    print("Model MSE: {:.6f} - Best MSE: {:.6f}".format(mse, best_mse))

    torch.save(model.state_dict(), 'models/'+log_name+'.pth')
    writer.flush()

writer.close()


Iterations 1/108 - Training with hidden_size=64, depth=3, num_epochs=200, batch=32, lr=0.1, step_size=10, momentum=0.9
Epoch [25/200] - 2.36 seconds - val_loss: 0.020780 - patience: 10
Training ended after 58.44 seconds - Best val_loss: 0.018834
Model MSE: 0.021877 - Best MSE: 0.021877

Iterations 2/108 - Training with hidden_size=64, depth=3, num_epochs=200, batch=32, lr=0.1, step_size=20, momentum=0.9
Epoch [58/200] - 1.99 seconds - val_loss: 0.019234 - patience: 10
Training ended after 130.06 seconds - Best val_loss: 0.017551
Model MSE: 0.018884 - Best MSE: 0.018884

Iterations 3/108 - Training with hidden_size=64, depth=3, num_epochs=200, batch=32, lr=0.01, step_size=10, momentum=0.9
Epoch [37/200] - 2.05 seconds - val_loss: 0.022351 - patience: 10
Training ended after 80.56 seconds - Best val_loss: 0.020260
Model MSE: 0.021634 - Best MSE: 0.018884

Iterations 4/108 - Training with hidden_size=64, depth=3, num_epochs=200, batch=32, lr=0.01, step_size=20, momentum=0.9
Epoch [52/200

# Neural Network Evalutation

In [2]:
test_loss, y_pred, y_true = test_model(best_model, criterion, test_loader)

y_pred = y_pred.cpu().detach().numpy()
y_true = y_true.cpu().detach().numpy()

mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print("MSE: {:.6f} - R2: {:.6f}".format(test_loss, r2))

NameError: name 'test_model' is not defined

# Transformer

In [1]:
# Define the TabNet model
tabnet = TabNetClassifier(
    # n_d: the dimensionality of the output space of the feature transformer network (default 64)
    n_d=16,
    # n_a: the dimensionality of the output space of the attention network (default 64)
    n_a=16,
    # n_steps: the number of sequential steps in the attention mechanism (default 3)
    n_steps=4,
    # n_steps: the number of sequential steps in the attention mechanism (default 3)
    gamma=1.5,
    # n_independent: the number of independent feature transformer networks to use (default 2)
    n_independent=2,
    # n_shared: the number of shared feature transformer networks to use (default 2)
    n_shared=2,
    # epsilon: a small value to add to the denominator of the feature importance calculation to avoid division by zero (default 1e-15)
    epsilon=1e-15,
    seed=42,  # seed: the random seed to use for reproducibility (default None)
)

# Train the model
tabnet.fit(
    X_train=X_train,
    y_train=Y_train,
    eval_set=[(X_val, Y_val)],
    # patience: the number of epochs to wait without improvement in validation loss before early stopping (default 10)
    patience=10,
    # batch_size: the number of samples per batch (default 1024)
    batch_size=1024,
    # virtual_batch_size: the number of samples per virtual batch (default 128)
    virtual_batch_size=128,
    # num_workers: the number of worker processes to use for data loading (default 0)
    num_workers=0,
    # drop_last: whether to drop the last incomplete batch if the dataset size is not divisible by the batch size (default False)
    drop_last=False,
    # max_epochs: the maximum number of epochs to train for (default 100)
    max_epochs=100,
)


NameError: name 'TabNetClassifier' is not defined

In [None]:
# evaluate the model
y_pred = tabnet.predict(X_test)
print(classification_report(Y_test, y_pred))
accuracy = accuracy_score(Y_test, y_pred)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      0.22      0.33         9
           2       0.82      0.53      0.64        80
           3       0.79      0.59      0.68       284
           4       0.78      0.89      0.83       804
           5       0.84      0.90      0.87      1159
           6       0.90      0.71      0.79       426
           7       0.00      0.00      0.00         1

    accuracy                           0.82      2764
   macro avg       0.60      0.48      0.52      2764
weighted avg       0.82      0.82      0.82      2764



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
