# TTIC 31020 Introduction to Statistical Machine Learning: Neural Networks
---
Collborated with Sam Zheng, Haichuan Wang

In this notebook you will perform classification on the Fashion MNIST dataset with neural networks. Your task is (mostly) to implement the forward and backward methods for different layers (forward methods compute a layer's output given its input, while backward methods compute gradients for its parameters and its input given the gradient of its output).

After filling the missing code, try to achieve the best performance by changing the hyperparameters. Neural networks are typically more hyperparameter-sensitive than other methods you've seen in the past homeworks, so good hyperparameter tuning is crucial to get good results.

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
%load_ext autoreload

%autoreload 2

%cd '/content/drive/MyDrive/Colab Notebooks/TTIC 31020/hw5-dl'

import numpy as np
from utils import *
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/content/drive/MyDrive/Colab Notebooks/TTIC 31020/hw5-dl


In [14]:
# superclass of neural network "modules" (layers)
class Module:
    """
    Module is a super class. It could be a single layer, or a multilayer perceptron.
    """
    
    def __init__(self):
        self.train = True
        return
    
    def forward(self, _input):
        """
        z = f(a); a is the input, and h is the output.
        
        Inputs:
        _input: a
        
        Returns:
        output z
        """
        pass
    
    def backward(self, _input, _gradOutput):
        """
        Compute:
        gradient w.r.t. _input
        gradient w.r.t. trainable parameters
        
        Inputs (in lecture notation):
        _input: a 
        _gradOutput: dL/dz
        
        Returns:
        gradInput: dL/dz
        """
        pass
        
    def parameters(self):
        """
        Return the value of trainable parameters and its corresponding gradient (Used for grandient descent)
        
        Returns:
        params, gradParams
        """
        pass
    
    def training(self):
        """
        Turn the module into training mode.(Only useful for Dropout layer)
        Ignore it if you are not using Dropout.
        """
        self.train = True
        
    def evaluate(self):
        """
        Turn the module into evaluate mode.(Only useful for Dropout layer)
        Ignore it if you are not using Dropout.
        """
        self.train = False

In [15]:
# a class representing a sequence of modules (a layered network)
class Sequential(Module):
    """
    Sequential provides a way to plug layers together in a feed-forward manner.
    """
    def __init__(self):
        Module.__init__(self)
        self.layers = [] # layers contain all the layers in order
    
    def add(self, layer):
        self.layers.append(layer) # Add another layer at the end
    
    def size(self):
        return len(self.layers) # How many layers.
    
    def forward(self, _input):
        """
        Feed forward through all the layers, and return the output of the last layer
        """
        for i in range(self.size()):
            # The output of (i-1)-th layer is the _input of i-th layer
            _input = self.layers[i].forward(_input)
        self._output = _input
        return self._output
    
    def backward(self, _gradOutput):
        """
        Backpropagate through all the layers using chain rule.
        """
        for i in reversed(range(self.size())):
            # The (i-1)-th layer receives the error from the i-th layer
            _gradOutput = self.layers[i].backward(_gradOutput)
        return _gradOutput
    
    def parameters(self):
        """
        Return trainable parameters and its corresponding gradient in a list
        """
        params = []
        gradParams = []
        for m in self.layers:
            p, g = m.parameters()
            if p is not None:
                for _p, _g in zip(p,g):
                    params.append(_p)
                    gradParams.append(_g)
        return params, gradParams

    def training(self):
        """
        Turn all the layers into training mode
        """
        Module.training(self)
        for m in self.layers:
            m.training()
    
    def evaluate(self):
        """
        Turn all the layers into evaluate mode
        """
        Module.evaluate(self)
        for m in self.layers:
            m.evaluate()
        

In [16]:
class FullyConnected(Module):
    """
    Fully connected layer (parameters include a matrix of weights a vector of biases)
    """
    def __init__(self, inputSize, outputSize):
        Module.__init__(self)
        
        # ADD CODE here to initialize the weights and biases
        self.weight = np.random.rand(outputSize, inputSize) * 0.2 - 0.1
        self.bias = np.random.rand(outputSize) * 0.2 - 0.1
        
        self.gradWeight = np.ndarray((inputSize, outputSize))
        self.gradBias = np.ndarray(outputSize)
        
    def forward(self, _input):
        """
        output = W * input + b
        """
        N = len(_input)
        self._input = _input
        self._output = np.matmul(self.weight, _input.T).T # ADD CODE here, 3 x 8
        for i in range(N):
            self._output[i] += self.bias

        # print("Fully forward self._output", self._output)
        return self._output
    
    def backward(self, _gradOutput):
        """
        gradWeight = gradOutput * input
        gradBias = gradOutput * vec(1)
        gradInput =  Weight * gradOutput
        """
        self.gradWeight.fill(0)
        self.gradBias.fill(0)
        self.gradWeight += np.matmul(self._input.T, _gradOutput) # ADD CODE to compute the gradient for the layer's weight
        self.gradBias += np.matmul(_gradOutput.T, np.ones(len(_gradOutput))) # ADD CODE to compute the gradient for the layer's bias
        self._gradInput = np.matmul(_gradOutput, self.weight)  # ADD CODE to compute the gradient for the layer's input

        return self._gradInput
        
    def parameters(self):
        """
        Return weight and bias and their g
        """
        return [self.weight, self.bias], [self.gradWeight, self.gradBias]

In [17]:
class ReLU(Module):
    """
    ReLU activation, not trainable.
    """
    def __init__(self):
        Module.__init__(self)
        return
    
    def forward(self, _input):
        """
        output = max(0, input)
        """
        self._input = _input
        self._output = _input
        N = len(_input)
        for i in range(N):
            self._output[i] = np.maximum(np.zeros(len(_input[i])), _input[i]) # ADD CODE to compute the layer's output
        return self._output
    
    def backward(self, _gradOutput):
        """
        gradInput = gradOutput * mask
        mask = _input > 0
        """
        mask = self._input > 0
        self._gradInput =  _gradOutput * mask # ADD CODE to compute the gradient for the layer's input
        return self._gradInput
        
    def parameters(self):
        """
        No trainable parametersm, return None
        """
        return None, None

In [18]:
class Dropout(Module):
    """
    A dropout layer
    """
    def __init__(self, p = 0.5):
        Module.__init__(self)
        self.p = p #self.p is the drop rate, if self.p is 0, then it's a identity layer
        
    def forward(self, _input):
        self._output = _input
        if self.p > 0:
            if self.train:
                # Randomize a mask from bernoulli distrubition
                self.mask = np.random.binomial(1, 1 - self.p, _input.shape).astype('float64')
                # Scale the mask
                self.mask /= 1 - self.p
                self._output *= self.mask
        return self._output
    
    def backward(self, _gradOutput):
        self._gradInput = _gradOutput
        if self.train:
            if self.p > 0:
                self._gradInput = self.mask* self._gradInput 
        return self._gradInput
    
    def parameters(self):
        """
        No trainable parameters.
        """
        return None, None

In [19]:
class SoftMaxLoss(object):
    def __init__(self):
        return
        
    def forward(self, _input, _label):
        """
        Softmax and cross entropy loss layer. Should return a scalar, since it's a
        loss. (It's almost identical to what we had in Pset 2)

        Inputs:
        _input: N x C
        _labels: N x C, one-hot

        Returns: loss (scalar)
        """
        self._input = _input - _input.max(1)[:, np.newaxis]
        self._logprob = self._input - np.log(np.exp(self._input).sum(1)[:, np.newaxis])
        
        self._output = np.mean(np.sum(-self._logprob * _label, 1))
        return self._output
    
    def backward(self, _label):
        # score = np.exp(self._input)
        # N = len(self._input)
        # C = len(self._input[0])
        # self._gradInput = np.zeros((N, C))
        # for i in range(N):
        #     c = np.argmax(_label[i])
        #     self._gradInput[i] = 1 / sum(score[i]) * score[i] / N # ADD CODE to compute the gradient for the layer's input
        #     self._gradInput[i][c] -= 1 / N
        self._gradInput = (np.exp(self._logprob) - _label) / len(self._input)
        return self._gradInput

In [20]:
# Test softmaxloss, the relative error should be small enough
def test_sm():
    crit = SoftMaxLoss()
    gt = np.zeros((3, 10))
    gt[np.arange(3), np.array([1,2,3])] = 1
    x = np.random.random((3,10))
    def test_f(x):
        return crit.forward(x, gt)

    print(crit.forward(x, gt))
    gradInput = crit.backward(gt)
    gradInput_num = numeric_gradient(test_f, x, 1, 1e-6)
    print(relative_error(gradInput, gradInput_num, 1e-8))
    
test_sm()

2.345378329122795
4.9687117176946155e-09


In [21]:
# Test modules, all the relative errors should be small enough (on the order of 1e-6 or smaller)
def test_module(model):
    model.evaluate()

    crit = TestCriterion()
    gt = np.random.random((3,10)) # what is gt and x
    x = np.random.random((3,10)) # what is gt and x
    def test_f(x):
        return crit.forward(model.forward(x), gt)

    test_f(x)
    gradInput = model.backward(crit.backward(gt))
    gradInput_num = numeric_gradient(test_f, x, 1, 1e-6)
    print(relative_error(gradInput, gradInput_num, 1e-8))

# Test fully connected
model = FullyConnected(10, 8)
print('testing FullyConnected')
test_module(model)

# Test ReLU
model = ReLU()
print('testing ReLU')
test_module(model)

# Test Dropout
model = Dropout()
print('testing Dropout')
test_module(model)


testing FullyConnected
3.4881282650908704e-09
testing ReLU
5.962448038856628e-10
testing Dropout
7.360228254783035e-10


In [22]:
model = Sequential()
#ADD CODE to add layers using the add attribute of sequential 
#to construct 2-layer Neural Network with hidden size 10
model.add(FullyConnected(10, 8))
model.add(ReLU())
model.add(FullyConnected(8, 5))

In [23]:
# Test your neural network
print('testing 2-layer model')
test_module(model)

testing 2-layer model
3.7584494041918615e-08


In [24]:
def sgd(x, dx, lr, weight_decay = 0):
    num_layers = len(x) // 2
    for i in range(num_layers):
        x[2*i] -= lr * (dx[2*i].T + 2 * weight_decay * x[2*i])
        x[2*i + 1] -= lr * dx[2*i + 1]

In [25]:
# Test gradient descent, the loss should be lower and lower
model = Sequential()
model.add(FullyConnected(10, 8))
model.add(ReLU())
model.add(FullyConnected(8, 5))

trainX = np.random.random((3,10))

crit = TestCriterion()

params, gradParams = model.parameters()

it = 0
state = None
while True:
    # print("==== Iter", it)
    output = model.forward(trainX)
    loss = crit.forward(output, None)
    if it % 100 == 0:
        print("loss", loss)
    doutput = crit.backward(None)
    model.backward(doutput)
    
    sgd(params, gradParams, 0.01)
    # print("params after sgd", params[0][0][0])
    params1, gradParams1 = model.parameters()
    # print("params in the model", params1[0][0][0])
    
    if it > 1000:
        break
    it += 1

loss 0.27641112127585904
loss 0.02191976659445667
loss 0.022571379324979324
loss 0.023062557513587597
loss 0.02342705905966593
loss 0.023691929205351403
loss 0.02342610590521886
loss 0.02975577013499246
loss 0.036383738739548484
loss 0.03654155092476587
loss 0.036659122030493575


Now we start to work on Fashion MNIST.

In [26]:
import FMNIST_utils

# We only consider large set this time
print("Load large trainset.")
Xlarge,Ylarge = FMNIST_utils.load_data("Tr")
print(Xlarge.shape)
print(Ylarge.shape)
if Xlarge.max() > 1: Xlarge = Xlarge/255

print("Load valset.")
Xval,Yval = FMNIST_utils.load_data("Vl")
print(Xval.shape)
print(Yval.shape)
if Xval.max() > 1: Xval = Xval/255

Load large trainset.
(50000, 784)
(50000, 10)
Load valset.
(5000, 784)
(5000, 10)


In [27]:
def predict(X, model):
    """
    Evaluate the soft predictions of the model.
    Input:
    X : N x d array (no unit terms)
    model : a multi-layer perceptron
    Output:
    yhat : N x C array
        yhat[n][:] contains the score over C classes for X[n][:]
    """
    return model.forward(X)

def error_rate(X, Y, model):
    """
    Compute error rate (between 0 and 1) for the model
    """
    model.evaluate()
    res = 1 - (model.forward(X).argmax(-1) == Y.argmax(-1)).mean()
    model.training()
    return res

from copy import deepcopy

def runTrainVal(X,Y,model,Xval,Yval,trainopt):
    """
    Run the train + evaluation on a given train/val partition
    trainopt: various (hyper)parameters of the training procedure
    During training, choose the model with the lowest validation error. (early stopping)
    Assumes (global) variable crit containing the loss (training "criterion" to be minimized)
    """
    
    params, gradParams = model.parameters()
    
    eta = trainopt['eta']
    
    N = X.shape[0] # number of data points in X
    
    # Save the model with lowest validation error
    minValError = np.inf
    saved_model = None # Save the best model accoring to validation error
    
    shuffled_idx = np.random.permutation(N)
    start_idx = 0
    for iteration in range(trainopt['maxiter']):
        if iteration % int(trainopt['eta_frac'] * trainopt['maxiter']) == 0:
            eta *= trainopt['etadrop']
        # form the next mini-batch
        stop_idx = min(start_idx + trainopt['batch_size'], N)
        batch_idx = range(N)[int(start_idx):int(stop_idx)]
        
        s_idx = shuffled_idx[batch_idx]
        
        bX = X[s_idx,:]
        bY = Y[s_idx,:]

        score = model.forward(bX)
        loss = crit.forward(score, bY)
        # note: this computes loss on the *batch* only, not on the entire training set!
        
        dscore = crit.backward(bY)
        model.backward(dscore)
        
        sgd(params, gradParams, eta, weight_decay = trainopt['lambda'])

        start_idx = stop_idx % N
        
        if (iteration % trainopt['display_iter']) == 0:
            #compute train and val error; multiply by 100 for readability (make it percentage points)
            trainError = 100 * error_rate(X, Y, model)
            valError = 100 * error_rate(Xval, Yval, model)
            print('{:8} batch loss: {:.3f} train error: {:.3f} val error: {:.3f}'.format(iteration, loss, trainError, valError))
            
            # early stopping: save the best model snapshot so far (i.e., model with lowest val error)
            if valError < minValError:
                saved_model = deepcopy(model)
                minValError = valError
        
    return saved_model, minValError, trainError

In [28]:
def build_model(input_size, hidden_size, output_size, activation_func = 'ReLU', dropout = 0):
    """
    Build a 2-layer model:
    input_size: the dimension of input data
    hidden_size: the dimension of hidden vector, hidden_size == 0 means only one layer
    output_size: the output size of final layer.
    activation_func: ReLU, sigmoid (defined above), Tanh (you'd have to define), etc. 
    dropout: the dropout rate: if dropout == 0, this is equivalent to no dropout
    """
    model = Sequential()
    
    if type(hidden_size) is int:
        hidden_size = [hidden_size] # ensure it's a list
    
    prev_size=input_size
    
    # add hidden layer(s) as requested
    if hidden_size[0] == 0: # no hidden layer
        pass
    
    else:
        for l in range(len(hidden_size)):
            # ADD CODE to add a fully connected layer 
            model.add(FullyConnected(prev_size, hidden_size[l]))
            
            prev_size=hidden_size[l]

            # ADD CODE to add a Relu
            model.add(ReLU())
                 
            if dropout > 0:
                model.add(Dropout(p = dropout))
            
                
                
    # ADD CODE to add output layer  (which is a fully connected layer)
    model.add(FullyConnected(prev_size, output_size))
    
    return model

In [31]:
trainopt = {
    'eta': 1e-3,   # initial learning rate
    'maxiter': 20000,   # max number of iterations (updates) of SGD
    'display_iter': 5000,  # display batch loss every display_iter updates
    'batch_size': 128,  
    'etadrop': .5, # when dropping eta, multiply it by this number (e.g., .5 means halve it)
    'eta_frac': .4,  # drop eta after every eta_frac*maxiter
    'update': 'sgd'
}
NFEATURES = Xlarge.shape[1]

# we will maintain a record of models trained for different values of lambda
# these will be indexed directly by lambda value itself
trained_models = dict()

lambda_ = 0.01
hidden_size_ = [20]
trainopt['lambda'] = lambda_

In [32]:
# Tuning hidden layer width and dpeth

for hidden_size_ in [[5], [20], [100], [200], [50], [50, 50], [50, 50, 50], [50, 50, 50, 50]]:
    model = build_model(NFEATURES, hidden_size_, 10, dropout = 0.5) 
    print("=== hidden_size", hidden_size_)
    trained_model,valErr,trainErr = runTrainVal(Xlarge, Ylarge, model, Xval, Yval, trainopt)
    trained_models[lambda_] = {'model': trained_model, "val_err": valErr, "train_err": trainErr }
    print('train set model [ h = ',end='')
    for l in range(len(hidden_size_)):
        print('%d '%hidden_size_[l],end='')
    print(' ], lambda= %.4f ] --> train error: %.2f, val error: %.2f' % (lambda_, trainErr, valErr))

# Observation 1: the larger the hidden layer width, the better the performance.
# This might be that FMNIST data set has many features. Thus, more units allows
# each feature to be represented and tuned.
# Yet, too much hidden units might lead to over-fitting. 
# And increasing the size of one layer alone doesn't change the number of activation function.

# Observation 2: depth of the network doesn't enhance the performance as much.
# A shallow but wide network can approximate any function.
# Adding depth allows the model to tune features at different level of abstractions.
# For this FMNIST dataset, perhaps there are not many levels of abstractions.
# Therefore, width is more effective than depth. 

=== hidden_size [5]
       0 batch loss: 2.305 train error: 91.056 val error: 91.180
    5000 batch loss: 2.159 train error: 76.266 val error: 76.840
   10000 batch loss: 2.032 train error: 63.268 val error: 64.220
   15000 batch loss: 2.041 train error: 58.972 val error: 60.040
train set model [ h = 5  ], lambda= 0.0100 ] --> train error: 58.97, val error: 60.04
=== hidden_size [20]
       0 batch loss: 2.332 train error: 86.644 val error: 87.420
    5000 batch loss: 1.759 train error: 51.280 val error: 51.940
   10000 batch loss: 1.620 train error: 40.002 val error: 39.760
   15000 batch loss: 1.427 train error: 37.330 val error: 37.660
train set model [ h = 20  ], lambda= 0.0100 ] --> train error: 37.33, val error: 37.66
=== hidden_size [100]
       0 batch loss: 2.393 train error: 86.934 val error: 86.940
    5000 batch loss: 1.452 train error: 35.566 val error: 35.880
   10000 batch loss: 1.245 train error: 32.576 val error: 32.460
   15000 batch loss: 1.150 train error: 31.506 va

In [34]:
# Best model
# The most influential parameters are: eta_frac, lambda, hidden_size_,
# and initial weight (-0.1 to 0.1, better than 0.001)

trainopt = {
    'eta': 1e-3,   # initial learning rate
    'maxiter': 20000,   # max number of iterations (updates) of SGD
    'display_iter': 5000,  # display batch loss every display_iter updates
    'batch_size': 128,  
    'etadrop': .5, # when dropping eta, multiply it by this number (e.g., .5 means halve it)
    'eta_frac': .4,  # drop eta after every eta_frac*maxiter
    'update': 'sgd',
    'lambda': 0.01
}

hidden_size_ = [300]

model = build_model(NFEATURES, hidden_size_, 10, dropout = 0.5) 
print("=== hidden_size", hidden_size_)
trained_model,valErr,trainErr = runTrainVal(Xlarge, Ylarge, model, Xval, Yval, trainopt)
trained_models[lambda_] = {'model': trained_model, "val_err": valErr, "train_err": trainErr }
print('train set model [ h = ',end='')
for l in range(len(hidden_size_)):
    print('%d '%hidden_size_[l],end='')
print(' ], lambda= %.4f ] --> train error: %.2f, val error: %.2f' % (lambda_, trainErr, valErr))

=== hidden_size [300]
       0 batch loss: 2.646 train error: 95.232 val error: 95.540
    5000 batch loss: 1.111 train error: 30.224 val error: 29.920
   10000 batch loss: 0.901 train error: 26.180 val error: 26.160
   15000 batch loss: 0.897 train error: 24.878 val error: 24.780
train set model [ h = 300  ], lambda= 0.0100 ] --> train error: 24.88, val error: 24.78


In [35]:
#Generate a Kaggle submission file using best_trained_model which you should set based on your experiments
kaggleX = FMNIST_utils.load_data('kaggle')
if kaggleX.max() > 1: kaggleX = kaggleX/255
kaggleYhat = predict(kaggleX, trained_model).argmax(-1)
save_submission('submission-fmnist.csv', kaggleYhat)

Saved: submission-fmnist.csv
