In [45]:
# Launch magic commands to automatically reloads modules
%load_ext autoreload
%autoreload 2

In [63]:
# import necessary packages

import pandas as pd
import numpy as np

# Load datasets

In [64]:
from src.data.sets import load_sets

X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/')

# Base Model

## Dummy Classifier

In [65]:
from sklearn.dummy import DummyClassifier

# Create dummy classifer
dummy = DummyClassifier(strategy='uniform', random_state=123)

# "Train" model
dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=123, strategy='uniform')

In [66]:
# predictions
y_train_preds = dummy.predict(X_train)
y_val_preds = dummy.predict(X_val)

In [68]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Get accuracy score

print('Dummy score:', dummy.score(X_test, y_test))
print('Train accuracy score:', accuracy_score(y_train,y_train_preds))
print('Validation accuracy score:', accuracy_score(y_val,y_val_preds))
print('Train accuracy score:', f1_score(y_train,y_train_preds, average="weighted"))
print('Validation accuracy score:', f1_score(y_val,y_val_preds, average="weighted"))

Dummy score: 0.009550208550745154
Train accuracy score: 0.00966653791117627
Validation accuracy score: 0.009543624470393036
Train accuracy score: 0.011742225123285682
Validation accuracy score: 0.011476281250959265


## Using Null Model

In [69]:
# Import class and predict

from src.models.null import NullModel
base_model = NullModel(target_type="classification")
y_base = base_model.fit_predict(y_train)

In [70]:
from src.models.performance import print_class_perf

print_class_perf(y_preds=y_base, y_actuals=y_train, set_name='Training', average='weighted')

Accuracy Training: 0.07484681022903845
F1 Training: 0.010423894732065047


# Define Architecture

In [71]:
# convert allsets to Pytorch datasets

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [72]:
# instantiate PytorchMultiClass model

from src.models.pytorch import PytorchMultiClass

model = PytorchMultiClass(X_train.shape[1])

### Print Architecture of the model

In [73]:
from src.models.pytorch import get_device

device = get_device()
model.to(device)
print(model)

PytorchMultiClass(
  (layer_1): Linear(in_features=8, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)


# Train Model

In [74]:
# Instantiate Loss Functions

criterion = nn.CrossEntropyLoss()
#criterion = nn.NLLLoss()
#criterion = nn.GaussianNLLLoss()
#criterion = nn.PoissonNLLLoss()

In [75]:
# instatiate optimiser

#optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, amsgrad=True)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.7)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
#optimizer = torch.optim.Adagrad(model.parameters(), lr=0.0001)
#optimizer = torch.optim.Adamax(model.parameters(), lr=0.0001, weight_decay=0.01)
#optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0001)
#optimizer = torch.optim.ASGD(model.parameters(), lr=0.0001)
#optimizer = torch.optim.Adadelta(model.parameters(), lr=0.0001)


In [76]:
def train_classification(train_data, model, criterion, optimizer, batch_size, device, scheduler=None, generate_batch=None):
    """Train a Pytorch multi-class classification model

    Parameters
    ----------
    train_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    optimizer: torch.optim
        Optimizer
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    scheduler : torch.optim.lr_scheduler
        Pytorch Scheduler used for updating learning rate
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """
    
    # Set model to training mode
    model.train()
    train_loss = 0
    train_acc = 0
    
    # Create data loader
    data = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)
    
    #target = torch.empty(32, dtype=torch.long).random_(5)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:

        # Reset gradients
        optimizer.zero_grad()
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Make predictions
        output = model(feature)
        
        # Calculate loss for given batch
        loss = criterion(output, target_class.long())

        # Calculate global loss
        train_loss += loss.item()
        
        # Calculate gradients
        loss.backward()

        # Update Weights
        optimizer.step()
        
        # Calculate global accuracy
        train_acc += (output.argmax(1) == target_class).sum().item()

    # Adjust the learning rate
    if scheduler:
        scheduler.step()

    return train_loss / len(train_data), train_acc / len(train_data)

In [77]:
def test_classification(test_data, model, criterion, batch_size, device, generate_batch=None):
    """Calculate performance of a Pytorch multi-class classification model

    Parameters
    ----------
    test_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """    
    
    # Set model to evaluation mode
    model.eval()
    test_loss = 0
    test_acc = 0
    
    # Create data loader
    data = DataLoader(test_data, batch_size=batch_size, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Set no update to gradients
        with torch.no_grad():
            
            # Make predictions
            output = model(feature)
            
            # Calculate loss for given batch
            loss = criterion(output, target_class.long())

            # Calculate global loss
            test_loss += loss.item()
            
            # Calculate global accuracy
            test_acc += (output.argmax(1) == target_class).sum().item()

    return test_loss / len(test_data), test_acc / len(test_data)

In [78]:
N_EPOCHS = 10
#N_EPOCHS = 5
BATCH_SIZE = 32

In [79]:
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

Epoch: 0
	(train)	|	Loss: 0.1447	|	Acc: 3.0%
	(valid)	|	Loss: 0.1440	|	Acc: 5.3%
Epoch: 1
	(train)	|	Loss: 0.1439	|	Acc: 5.7%
	(valid)	|	Loss: 0.1439	|	Acc: 5.6%
Epoch: 2
	(train)	|	Loss: 0.1436	|	Acc: 6.5%
	(valid)	|	Loss: 0.1433	|	Acc: 7.7%
Epoch: 3
	(train)	|	Loss: 0.1433	|	Acc: 7.6%
	(valid)	|	Loss: 0.1432	|	Acc: 7.8%
Epoch: 4
	(train)	|	Loss: 0.1432	|	Acc: 7.7%
	(valid)	|	Loss: 0.1432	|	Acc: 7.8%
Epoch: 5
	(train)	|	Loss: 0.1432	|	Acc: 7.8%
	(valid)	|	Loss: 0.1432	|	Acc: 7.8%
Epoch: 6
	(train)	|	Loss: 0.1432	|	Acc: 7.9%
	(valid)	|	Loss: 0.1432	|	Acc: 7.8%
Epoch: 7
	(train)	|	Loss: 0.1432	|	Acc: 7.9%
	(valid)	|	Loss: 0.1432	|	Acc: 7.8%
Epoch: 8
	(train)	|	Loss: 0.1432	|	Acc: 7.9%
	(valid)	|	Loss: 0.1432	|	Acc: 7.8%
Epoch: 9
	(train)	|	Loss: 0.1432	|	Acc: 7.9%
	(valid)	|	Loss: 0.1432	|	Acc: 7.8%


## Assessment

In [80]:
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc:.1f}')

	Loss: 0.1432	|	Accuracy: 0.1


## Save Model

In [146]:
torch.save(model, "../models/pytorch_beer_best_model.pt")