[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/comp-hci-lab/BDSI_2021_ML/blob/master/A3%20-%20More%20Neural%20Nets.ipynb)

## BDSI ML 2021

# A3 - More Neural Nets

**Note:** If you wish to use a GPU, select "Runtime → Change runtime type" from the menu and set "Hardware accelerator" to "GPU".

In [None]:
#@title Run this cell to download preprocessed data (features + labels). { display-mode: "form" }
# Install (or upgrade) the `wget` Python package used for the download below.
!pip install -U wget
# Recreate an empty `preprocessed/` directory so stale files are not reused.
!rm -rf preprocessed
!mkdir preprocessed

import wget
# Fetch the preprocessed features + labels archive into `preprocessed/data.npz`.
wget.download('https://github.com/comp-hci-lab/BDSI_2021_ML/raw/master/preprocessed/data.npz', 'preprocessed/data.npz')

In [None]:
# Create the directory that holds per-epoch model checkpoints (no error if it already exists).
!mkdir -p checkpoint

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn import metrics, exceptions
import os, random, shutil

In [None]:
# GPU support
import torch
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('using device:', device)

In [None]:
#@title Run this cell to run preprocessing. { display-mode: "form" }

# Read the three arrays stored in the downloaded archive; the archive is
# closed again as soon as they have been extracted.
with np.load('preprocessed/data.npz') as archive:
    X, y, feature_names = (archive[key] for key in ('X', 'y', 'feature_names'))

def impute_missing_values(X):
    """
    Fill every missing entry (np.nan) with the mean of its feature column,
    where the mean is computed over all rows of X.

    Args:
        X: np.array, shape (N, d). May contain missing values.
    Returns:
        X: np.array, shape (N, d). Contains no missing values.
    """
    from sklearn.impute import SimpleImputer

    # SimpleImputer's default strategy is column-wise mean imputation.
    mean_imputer = SimpleImputer()
    imputed = mean_imputer.fit_transform(X)
    return imputed

def normalize_feature_matrix(X):
    """
    Rescale each feature column independently so its values lie in [0, 1].

    Args:
        X: np.array, shape (N, d).
    Returns:
        X: np.array, shape (N, d). Each column is min-max normalized.
    """
    from sklearn.preprocessing import MinMaxScaler

    # MinMaxScaler's default target range is (0, 1).
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(X)
    return scaled

# Fill in missing values, then rescale every feature column to [0, 1].
# NOTE(review): both transforms are fit on the full dataset before it is
# split, so validation/test statistics influence the preprocessing.
X = impute_missing_values(X)
X = normalize_feature_matrix(X)

# d is the number of input features; it sets the width of the first layer.
_, d = X.shape

# Split data into train (80%), validation (10%) and test (10%).
# The first call keeps 80% for training; the second call divides the
# remaining 20% evenly into validation and test. Both splits are stratified
# on the label and use a fixed random_state for reproducibility.
from sklearn.model_selection import train_test_split
Xtr, X__, ytr, y__ = train_test_split(X,   y,   train_size=0.8, stratify=y,   random_state=0)
Xva, Xte, yva, yte = train_test_split(X__, y__, test_size=0.5, stratify=y__, random_state=0)

print('tr Data Shapes:', Xtr.shape, ytr.shape)
print('va Data Shapes:', Xva.shape, yva.shape)
print('te Data Shapes:', Xte.shape, yte.shape)

import torch
from torch.utils.data import Dataset, DataLoader

class SimpleDataset(Dataset):
    """
    Map-style dataset over a feature matrix and a label vector.

    Indexing returns a `(features, label)` pair of float tensors; the label is
    wrapped in a length-1 tensor. `X[idx]` must be a numpy array because it is
    converted with `torch.from_numpy`.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        # The number of samples is the number of rows in the feature matrix.
        return len(self.X)

    def __getitem__(self, idx):
        features = torch.from_numpy(self.X[idx]).float()
        label = torch.tensor([self.y[idx]]).float()
        return features, label

# Wrap each split (train / validation / test) in a SimpleDataset.
tr, va, te = (
    SimpleDataset(features, labels)
    for features, labels in ((Xtr, ytr), (Xva, yva), (Xte, yte))
)

In [None]:
#@title Run this cell to define helper functions for training, evaluation and checkpointing... { display-mode: "form" }
def _train_epoch(data_loader, model, criterion, optimizer):
    """
    Run one pass over `data_loader`, updating `model` after every batch.

    For each batch the gradients are reset, `criterion` is evaluated on the
    model output, and `optimizer` takes one step on the resulting gradients.
    The model is switched to training mode first. Nothing is returned.
    """
    model.train()
    for batch_X, batch_y in data_loader:
        # Batches must live on the same device (GPU or CPU) as the model.
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        # Discard gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        prediction = model(batch_X)
        batch_loss = criterion(prediction, batch_y)
        batch_loss.backward()
        optimizer.step()

def _evaluate_loader(data_loader, model, criterion):
    """
    Compute the loss and AUROC of `model` over all batches of `data_loader`.

    The caller is responsible for putting the model in eval mode and for
    disabling gradient tracking.

    Returns:
        (loss, score): `loss` is the mean of the per-batch `criterion` values
        (each batch counts equally, whatever its size); `score` is the AUROC
        computed over all samples at once.
    """
    y_true, y_score, batch_losses = [], [], []
    for X, y in data_loader:
        X, y = X.to(device), y.to(device)
        output = model(X)
        # Move results back to the CPU so they can be handled by numpy/sklearn.
        y_true.append(y.cpu().numpy())
        y_score.append(output.cpu().numpy())
        batch_losses.append(criterion(output, y).item())

    y_true, y_score = np.concatenate(y_true), np.concatenate(y_score)
    return np.mean(batch_losses), metrics.roc_auc_score(y_true, y_score)


def _evaluate_epoch(tr_loader, va_loader, model, criterion):
    """
    Evaluate `model` on the training and validation loaders.

    The model is switched to eval mode and gradients are disabled for the
    whole evaluation.

    Returns:
        (train_loss, val_loss, train_score, val_score), where the scores are
        AUROC values.
    """
    model.eval()
    with torch.no_grad():
        train_loss, train_score = _evaluate_loader(tr_loader, model, criterion)
        val_loss, val_score = _evaluate_loader(va_loader, model, criterion)
    return train_loss, val_loss, train_score, val_score

def save_checkpoint(model, epoch, checkpoint_dir):
    """
    Write the model parameters for `epoch` to `checkpoint_dir`.

    The file is named `epoch=<epoch>.checkpoint.pth.tar` and holds a dict
    with the keys 'epoch' and 'state_dict'.
    """
    target = os.path.join(checkpoint_dir, 'epoch={}.checkpoint.pth.tar'.format(epoch))
    torch.save({'epoch': epoch, 'state_dict': model.state_dict()}, target)

import itertools
def restore_checkpoint(model, checkpoint_dir, cuda=False, force=False):
    """
    Restore the PyTorch model from a checkpoint chosen interactively.

    Checkpoints are expected to be named `epoch=<k>.checkpoint.pth.tar` and
    numbered consecutively starting at 1. The user is prompted (via `input`)
    for the epoch to load.

    Args:
        model: module whose parameters are overwritten by the checkpoint.
        checkpoint_dir: directory containing the checkpoint files.
        cuda: if False, checkpoint tensors are mapped to CPU storage so that
            a checkpoint saved from a GPU model can be loaded on CPU.
        force: if True, raise when no checkpoint is available instead of
            returning the untouched model.
    Returns:
        (model, epoch): the model and the epoch it was restored from, or
        (model, 0) when no checkpoint was found and `force` is False.
    Raises:
        FileNotFoundError: no checkpoint was found and `force` is True.
        ValueError: the entered epoch is not an integer or is out of range.
    """
    cp_files = [file_ for file_ in os.listdir(checkpoint_dir)
        if file_.startswith('epoch=') and file_.endswith('.checkpoint.pth.tar')]

    # Find the latest epoch in the consecutive run epoch=1, 2, 3, ...
    # `epoch` stays 0 when not even the first checkpoint exists.
    epoch = 0
    for i in itertools.count(1):
        if 'epoch={}.checkpoint.pth.tar'.format(i) not in cp_files:
            break
        epoch = i

    if epoch == 0:
        print('No saved model parameters found')
        if force:
            raise FileNotFoundError("Checkpoint not found")
        return model, 0

    print("Which epoch to load from? Choose in range [1, {}].".format(epoch))
    inp_epoch = int(input())
    if not 1 <= inp_epoch <= epoch:
        raise ValueError("Invalid epoch number")

    filename = os.path.join(checkpoint_dir,
        'epoch={}.checkpoint.pth.tar'.format(inp_epoch))

    print("Loading from checkpoint {}".format(filename))

    if cuda:
        checkpoint = torch.load(filename)
    else:
        # Load GPU model on CPU
        checkpoint = torch.load(filename,
            map_location=lambda storage, loc: storage)

    try:
        trained_epochs = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
    except Exception:
        # Report the failure, then let the original error propagate.
        print("=> Checkpoint not successfully restored")
        raise
    else:
        print("=> Successfully restored checkpoint (trained for {} epochs)"
            .format(trained_epochs))

    return model, inp_epoch

In [None]:
# Fit the neural network on training data
# and visualize learning progress by displaying the training/validation curves for both loss and AUROC
def _plot_curves(train_values, val_values, ylabel):
    """
    Show one figure comparing a per-epoch metric on train and validation data.

    Args:
        train_values: metric values on the training set, one per evaluation.
        val_values: matching values on the validation set.
        ylabel: label for the y axis.
    """
    plt.subplots(figsize=(5,5))
    epochs = range(len(train_values))
    plt.plot(epochs, train_values, '-', label='Train')
    plt.plot(epochs, val_values, '-', label='Validation')
    plt.xlabel('epoch')
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()


def run(model, criterion=torch.nn.BCELoss(), optimizer=None, batch_size=64, n_epochs=200, learning_rate=1e-3):
    """
    Train `model` on the training set and plot its learning curves.

    The model is evaluated once before training and once after every epoch,
    and a checkpoint is saved to `checkpoint/` after every epoch. Any existing
    `checkpoint/` directory is emptied first. When training is done, two
    figures are shown: AUROC and loss, each for train and validation.

    Args:
        model: the torch module to train; it is moved to `device`.
        criterion: loss function applied to (output, target).
        optimizer: optimizer to use; defaults to Adam over the model
            parameters with `learning_rate`.
        batch_size: batch size of the data loaders.
        n_epochs: number of training epochs.
        learning_rate: learning rate of the default optimizer (ignored when
            `optimizer` is given).
    """
    # Start from an empty checkpoint directory; it may not exist yet.
    if os.path.isdir('checkpoint'):
        shutil.rmtree('checkpoint')
    os.makedirs('checkpoint', exist_ok=True)

    # Seeds are set here, i.e. after the caller already constructed `model`,
    # so they affect randomness during training but not the initial weights.
    torch.random.manual_seed(0)
    np.random.seed(0)
    random.seed(0)

    # Data loaders for training and validation; only training is shuffled.
    tr_loader = DataLoader(tr, batch_size=batch_size, shuffle=True)
    va_loader = DataLoader(va, batch_size=batch_size)

    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    n_learnable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('Number of learnable parameters:', n_learnable)
    print(flush=True)

    #### Start training
    # Move model to the appropriate device
    model = model.to(device)

    # Index 0 holds the metrics of the untrained model.
    outputs = [_evaluate_epoch(tr_loader, va_loader, model, criterion)]

    for epoch in tqdm(range(0, n_epochs)):
        # Train model
        _train_epoch(tr_loader, model, criterion, optimizer)

        # Evaluate model
        outputs.append(_evaluate_epoch(tr_loader, va_loader, model, criterion))

        # Save model parameters (checkpoints are numbered from 1)
        save_checkpoint(model, epoch+1, 'checkpoint/')

    #### Finished training
    # Plot loss and AUROC scores for training and validation
    train_losses, val_losses, train_scores, val_scores = zip(*outputs)
    _plot_curves(train_scores, val_scores, 'AUROC')
    _plot_curves(train_losses, val_losses, 'Loss (binary cross entropy)')

## From last time...

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class Net(nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid."""

    def __init__(self, input_size):
        super().__init__()
        self.fc = nn.Linear(input_size, 1)
        self.init_weights()

    def init_weights(self):
        """
        Draw the weights uniformly from [-1/sqrt(fan_in), 1/sqrt(fan_in)]
        and set the bias to zero.
        """
        fan_in = self.fc.weight.shape[1]
        limit = 1 / math.sqrt(fan_in)
        nn.init.uniform_(self.fc.weight, a=-limit, b=limit)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x):
        logits = self.fc(x)
        return torch.sigmoid(logits)

In [None]:
# Build the logistic-regression baseline on all d input features and train it
# with the default settings of `run`.
model = Net(d)
run(model)

## How to improve the model? 

- Start with *logistic regression*: linear(40, 1) → sigmoid
- Add a hidden layer: linear(40, 64) → relu → linear(64, 1) → sigmoid
- Add one more hidden layer: linear(40, 64) → relu → linear(64, 32) → relu → linear(32, 1) → sigmoid

Change up the hyperparameters (model architecture, training parameters, etc.)
- Train for a longer time (more epochs/iterations). 
- Add more layers. 
- Change (increase) the number of hidden units. 
- Change the batch size. 

And more...
- Use different activation functions. 
- Initialize weights "cleverly". 

Does the model start to overfit?
- Apply early stopping
- Add dropout and/or batch normalization

In [None]:
# Two hidden layers (1024 and 64 units) with dropout after the first one.
# The input width is taken from the data (d features) rather than hard-coded,
# matching how `Net(d)` is constructed.
model2 = nn.Sequential(
    nn.Linear(d, 1024),
    nn.ReLU(inplace=True),
    nn.Dropout(),
    nn.Linear(1024, 64),
    nn.ReLU(inplace=True),
    nn.Linear(64, 1),
    nn.Sigmoid(),
)
run(model2)

## Final evaluation
- Pick your best model based on validation performance
- Get final model performance by evaluating on the test set

In [None]:
# Architecture of the model to evaluate; the input width is taken from the
# data (d features) rather than hard-coded.
# NOTE(review): to restore a checkpoint below, this architecture must match
# the model that was last trained with `run` (e.g. `model2` additionally has
# an nn.Dropout layer, which shifts the layer indices in its state dict).
model = nn.Sequential(
    nn.Linear(d, 1024),
    nn.ReLU(inplace=True),
    nn.Linear(1024, 64),
    nn.ReLU(inplace=True),
    nn.Linear(64, 1),
    nn.Sigmoid(),
)
# Uncomment to load trained parameters; otherwise the model is untrained.
#model, save_epoch = restore_checkpoint(model, 'checkpoint/')

In [None]:
# Define the data loader for the test set (no shuffling; batches of 64).
te_loader = DataLoader(te, batch_size=64)

In [None]:
criterion = torch.nn.BCELoss()

# Keep the model and the batches on the same device, as in training.
model = model.to(device)
model.eval()
with torch.no_grad():
    y_true, y_score, running_loss = [], [], []

    # Do forward pass on the entire test set
    # Save loss and output
    # (Batch variables get their own names so the dataset arrays X and y
    # defined during preprocessing are not overwritten.)
    for X_batch, y_batch in te_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        output = model(X_batch)
        # Convert to CPU numpy arrays before handing them to numpy/sklearn.
        y_true.append(y_batch.cpu().numpy())
        y_score.append(output.cpu().numpy())
        running_loss.append(criterion(output, y_batch).item())
    y_true, y_score = np.concatenate(y_true), np.concatenate(y_score)

# Calculate average test loss (mean of the per-batch losses)
loss = np.mean(running_loss)

# Calculate test AUROC
score = metrics.roc_auc_score(y_true, y_score)

In [None]:
loss, score