<a href="https://colab.research.google.com/github/scaomath/wustl-math450/blob/main/Lectures/Math_450_Notebook_10_(Validation).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Coding lecture 10 of Math 450

## Last couple of weeks
- A complete pipeline of training a machine learning model

## Today
- How to build a bigger and more complex neural network.
- Set up a validation strategy.

In [133]:
import torch
import numpy as np
from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Optimizer
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("dark")

import warnings
warnings.filterwarnings("ignore")

In [134]:
train = datasets.MNIST(root='./', 
                       train=True, 
                       download=True, 
                       transform = transforms.ToTensor());

train_loader = DataLoader(train, batch_size=8) 

class MLP(nn.Module):
    def __init__(self, 
                 input_size: int = 28*28,
                 output_size: int = 10):
        super(MLP, self).__init__() 
        self.linear0 = nn.Linear(input_size, 256)
        self.activation = nn.ReLU()
        self.linear1 = nn.Linear(256, output_size)
        self.dropout = nn.Dropout(0.1) 
        # 10% of the weight does not get updated: dropout
        
    def forward(self, x): 
        x = x.view(x.size(0), -1) 
        x1 = self.linear0(x)
        a1 = self.activation(x1)
        output = self.linear1(a1)

        return output

In [135]:
class SGD(Optimizer): # subclass of Optimizer
    """
    Implements the vanilla SGD simplified 
    from the torch official one for Math 450 WashU
    
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        
    Example:
        >>> optimizer = SGD(model.parameters(), lr=1e-2)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()
    """

    def __init__(self, params, # params: model.parameters()
                       lr: float = 1e-3, # input: type = value
                 ): 
        defaults = dict(lr=lr) 
        # add a default attribute that can be accessed
        super(SGD, self).__init__(params, defaults)

    def step(self, closure=None): 
      '''
      step(): w_{k+1} = w_k - alpha*grad f(w_k)
      '''  
      for group in self.param_groups:
          for param in group['params']:
              if param.grad is None:
                  continue
              grad_param = param.grad.data
              
              param.data = param.data - group['lr']*grad_param
      return loss

In [136]:
model = MLP() # initialize the model
loss_func = nn.CrossEntropyLoss() # set up the loss
# crossentropyloss is for the case of a balanced classification problem
epochs = 2
learning_rate = 1e-3
optimizer = SGD(model.parameters(), lr=learning_rate)

# How to build a bigger net?

In [None]:
class MLP(nn.Module):
    def __init__(self, 
                 input_size: int = 28*28,
                 output_size: int = 10):
        super(MLP, self).__init__() 
        self.linear0 = nn.Linear(input_size, 256)
        self.linear1 = nn.Linear(256, 128)
        self.linear2 = nn.Linear(128, 64)
        self.linear3 = nn.Linear(64, output_size)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1) 
        # 10% of the weight does not get updated: dropout
        
    def forward(self, x): 
        x = x.view(x.size(0), -1) 
        x1 = self.linear0(x) # for linear layer, we have to use different names
        a1 = self.activation(x1) # activation we can just define a single obj
        x2 = self.linear1(a1)
        a2 = self.activation(x2)
        x3 = self.linear2(a2)
        a3 = self.activation(x3)
        output = self.linear3(a3)

        return output

# `nn.Modulelist`

In [None]:
# list
lst = []
print(lst)

In [None]:
lst.append('math450')
print(lst)

In [None]:
lst.append(10.2)
print(lst)

In [None]:
lst1 = ['math 450 student ' + str(i) for i in range(5)]
print(lst1)

In [None]:
# if we just wanna copy something n times
x = np.array([10, 20, -5])
print(x.repeat(5))

In [None]:
np.vstack([x, x, x, x, x])

In [None]:
[x for _ in range(5)] # makes 5 copies of x
# but this is a list of arrays

In [None]:
np.asarray([x for _ in range(5)]) 
# convert the input as an array

## what is `nn.Modulelist()`?
It has operations like list, but it is for `nn` modules.

In [None]:
layers = nn.ModuleList()

In [None]:
layers # no weight associated with this "layers" module

In [None]:
layers.append(nn.Linear(128, 64))
print(layers)

In [None]:
layers.append(nn.SiLU()) # Swish activation
print(layers)

In [None]:
# access this modulelist using indices
print(layers[0])

In [None]:
layer.parameters()

In [None]:
# this is an iterable
for layer in layers:
  for param in layer.parameters():
    print(param.size())

In [None]:
['hahaha' for _ in range(10)]

In [None]:
layers = nn.ModuleList([nn.Linear(20, 20) for _ in range(10)])

In [None]:
layers

In [None]:
block = nn.Sequential(
    nn.Linear(20, 20),
    nn.ReLU(),
    nn.Linear(20,20)
)

In [None]:
block

In [None]:
bigger_block = nn.ModuleList([block for _ in range(5)])

print(bigger_block)

## advanced: memory allocation

In [None]:
# standard practice
from copy import deepcopy

In [None]:
blocks = nn.ModuleList([deepcopy(block) for _ in range(5)])

In [None]:
print(blocks)

In [None]:
# how to add tuples
(10, 10) + (80, )

In [137]:
class MLP(nn.Module):
    def __init__(self, 
                 input_size: int = 28*28,
                 hidden_size: tuple = (128, 64),
                 output_size: int = 10):
        super(MLP, self).__init__() 
        self.sizes = (input_size, ) + hidden_size + (output_size, )
        self.layers = nn.ModuleList()
        for k in range(1, len(self.sizes)):
          self.layers.append(nn.Linear(self.sizes[k-1], self.sizes[k]))
          self.layers.append(nn.ReLU())

    def forward(self, x): 
        x = x.view(x.size(0), -1)
        for layer in self.layers:
          x = layer(x)
          # print(x.size())
        return x

In [138]:
model = MLP()
inp = torch.randn(64, 784) # (batch_size, 784)
y = model(inp)


In [139]:
model.layers

ModuleList(
  (0): Linear(in_features=784, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
  (5): ReLU()
)

In [None]:
# pipeline
for epoch in range(epochs):
    
    model.train() # formalism, useful when we have dropout
    
    loss_vals = []
    
    with tqdm(total=len(train_loader)) as pbar: # progress bar
      for data, targets in train_loader:
          
        # forward pass
        outputs = model(data)
        
        # loss function
        loss = loss_func(outputs, targets)
        
        # record loss function values .item()
        loss_vals.append(loss.item())
        
        # clean the gradient from last iteration
        # param.grad is not zero in last iteration
        optimizer.zero_grad()
        
        # backprop
        # autograd
        loss.backward()
        
        # stochastic gradient descent
        # no with torch.no_grad(): block, param operation is using .data
        optimizer.step()
        
        # check accuracy

        # tqdm template
        desc = f"epoch: [{epoch+1}/{epochs}] loss: {np.mean(loss_vals):.4f}"
        pbar.set_description(desc)
        pbar.update()

# How to validate?

In order to make an informed choice, we need a way to *validate* that our model and our hyperparameters are a good fit to the data.
While this may sound simple, there are some pitfalls that you must avoid to do this effectively.


Model validation is very simple: making use of "holdout" validation sets and cross-validation for more robust model evaluation. We hold back some subset of the data from the training of the model, and then use this holdout set to check the model performance. 
This splitting can be done using the ``train_test_split`` utility in Scikit-Learn:

## Reference:
- Python data science handbook

In [140]:
from sklearn.model_selection import train_test_split

In [147]:
X = train.data.float()[:10000]
y = train.targets[:10000]
print(X.size(), y.size())

torch.Size([10000, 28, 28]) torch.Size([10000])


In [148]:
X_tr, X_val, y_tr, y_val = \
train_test_split(X, y, random_state=0, train_size=0.8)
# random_state = seed

In [149]:
print(X_tr.size(), X_val.size())

torch.Size([8000, 28, 28]) torch.Size([2000, 28, 28])


In [152]:
train_set = TensorDataset(X_tr, y_tr)
train_loader = DataLoader(train_set, batch_size=32)

valid_set = TensorDataset(X_val, y_val)
val_loader = DataLoader(valid_set, batch_size=32)

In [153]:
sample = next(iter(train_loader))

In [154]:
sample[0].size(), sample[1].size()

(torch.Size([32, 28, 28]), torch.Size([32]))

In [155]:
sample[0].dtype

torch.float32

In [156]:
# pipeline
for epoch in range(epochs):
    
    model.train() # formalism, useful when we have dropout
    
    loss_vals = []
    acc_on_valid = []
    
    with tqdm(total=len(train_loader)) as pbar: # progress bar
      for data, targets in train_loader:
        
        # forward pass
        outputs = model(data)
        
        # loss function
        loss = loss_func(outputs, targets)
        
        # record loss function values .item()
        loss_vals.append(loss.item())
        
        # clean the gradient from last iteration
        # param.grad is not zero in last iteration
        optimizer.zero_grad()
        
        # backprop
        # autograd
        loss.backward()
        
        # stochastic gradient descent
        # no with torch.no_grad(): block, param operation is using .data
        optimizer.step()
        
        # check accuracy (add validation here)
        with torch.no_grad():
           for x, y in val_loader:
             yhat = model(x)
             yhat = yhat.argmax(dim=-1)
             acc = (yhat == y).float().mean()
             acc_on_valid.append(acc)

        # tqdm template
        desc = f"epoch: [{epoch+1}/{epochs}] loss: {np.mean(loss_vals):.2f}"
        desc += f"accuracy on validation: {np.mean(acc_on_valid):.2f}"
        pbar.set_description(desc)
        pbar.update()

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

KeyboardInterrupt: ignored