## Project 3: Digit Recognition

Good programmers can use neural nets. Great programmers can make them. This section will guide you through the implementation of a simple neural net with an architecture as shown in the figure below. You will implement the net from scratch (you will probably never do this again, don't worry) so that you later feel confident about using libraries. We provide some skeleton code in neural_nets.py for you to fill in.

![neural_net](../Media/images_neuralnet.png)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch

  from .autonotebook import tqdm as notebook_tqdm


### 3. Activation Functions

#### Rectified Linear Unit

In [3]:
def rectified_linear_unit(x):
    """ Returns the ReLU of x, or the maximum between 0 and x."""
    return np.maximum(x, 0)

#### Taking the Derivative

In [62]:
def rectified_linear_unit_derivative(x):
    """ Returns the derivative of ReLU."""

    x[x > 0] = 1
    x[x <= 0] = 0

    # ReLu returns 1 for all positive values and 0 for all negative values
    # (Returns 0 when the value is equal to 0 as well)
    return x

------
### 4. Training the Network

In [135]:
def output_layer_activation(x):
    """ Linear function, returns input as is. """
    return x

def output_layer_activation_derivative(x):
    """ Returns the derivative of a linear function: 1. """
    return 1


class NeuralNetwork():
    """
        Contains the following functions:
            -train: tunes parameters of the neural network based on error obtained from forward propagation.
            -predict: predicts the label of a feature vector based on the class's parameters.
            -train_neural_network: trains a neural network over all the data points for the specified number of epochs during initialization of the class.
            -test_neural_network: uses the parameters specified at the time in order to test that the neural network classifies the points given in testing_points within a margin of error.
    """

    def __init__(self):

        # DO NOT CHANGE PARAMETERS (Initialized to floats instead of ints)
        self.input_to_hidden_weights = np.matrix('1. 1.; 1. 1.; 1. 1.')  # (3,2)
        self.hidden_to_output_weights = np.matrix('1. 1. 1.')
        self.biases = np.matrix('0.; 0.; 0.')
        self.learning_rate = .001
        self.epochs_to_train = 10
        self.training_points = [((2,1), 10), ((3,3), 21), ((4,5), 32), ((6, 6), 42)]
        self.testing_points = [(1,1), (2,2), (3,3), (5,5), (10,10)]


    # ============================================================

    def train(self, x1 : float, x2: float, y):

        ### Forward propagation ###
        input_values = np.matrix([[x1],[x2]]) # 2 by 1

        # Calculate the input and activation of the hidden layer
        hidden_layer_weighted_input = np.dot(self.input_to_hidden_weights, input_values) + self.biases  # (3,2) * (2x1) = (3,1) + (3,1) = (3,1)
        hidden_layer_activation = rectified_linear_unit(hidden_layer_weighted_input)                    # (3,1)

        output = np.dot(self.hidden_to_output_weights, hidden_layer_activation)                         # (1,3) * (3,1) = (1,1)
        activated_output = output_layer_activation(output)

        ### Backpropagation ###

        # Compute gradients
        output_layer_error = (y - activated_output)                                                                                 # Derivative of cost function
        hidden_layer_error = self.hidden_to_output_weights.T * output_layer_activation_derivative(output) *  output_layer_error     # (3 by 1 matrix)

        bias_gradients = hidden_layer_error * 1                                                     # Derivative of Z with respect of the bias is 1 (Z = W*a + b) 
        hidden_to_output_weight_gradients = np.dot(output_layer_error, hidden_layer_activation.T)   # Derivative of Z with respect of the weights is the weighted input of the layer (Z' = a)
        input_to_hidden_weight_gradients = np.dot(hidden_layer_error, input_values.T)
        
        # print(bias_gradients.shape)
        # print(input_to_hidden_weight_gradients.shape)
        # print(hidden_to_output_weight_gradients.shape)
        # print("======================")

        # Use gradients to adjust weights and biases using gradient descent
        self.biases = self.biases - self.learning_rate * bias_gradients
        self.input_to_hidden_weights = self.input_to_hidden_weights - self.learning_rate * input_to_hidden_weight_gradients
        self.hidden_to_output_weights = self.hidden_to_output_weights - self.learning_rate * hidden_to_output_weight_gradients

    # ============================================================

    def predict(self, x1, x2):

        input_values = np.matrix([[x1],[x2]])
        print(input_values.shape)
        print(self.input_to_hidden_weights.shape)

        # Compute output for a single input(should be same as the forward propagation in training)
        hidden_layer_weighted_input = np.dot(self.input_to_hidden_weights, input_values) + self.biases
        hidden_layer_activation = rectified_linear_unit(hidden_layer_weighted_input)
        output = np.dot(self.hidden_to_output_weights, hidden_layer_activation)

        activated_output = output_layer_activation(output)
        return activated_output.item()

    # Run this to train your neural network once you complete the train method
    def train_neural_network(self):

        for epoch in range(self.epochs_to_train):
            for x,y in self.training_points:
                self.train(x[0], x[1], y)

    # Run this to test your neural network implementation for correctness after it is trained
    def test_neural_network(self):

        for point in self.testing_points:
            print("Point,", point, "Prediction,", self.predict(point[0], point[1]))
            if abs(self.predict(point[0], point[1]) - 7*point[0]) < 0.1:
                print("Test Passed")
            else:
                print("Point ", point[0], point[1], " failed to be predicted correctly.")
                return

In [136]:
x = NeuralNetwork()
x.train_neural_network()
x.test_neural_network()

(2, 1)
(3, 2)
Point, (1, 1) Prediction, -inf
(2, 1)
(3, 2)
Point  1 1  failed to be predicted correctly.


### 8. Fully-Connected Neural Networks

#### Training and Testing Accuracy Over Time

In [153]:
import _pickle as cPickle, gzip
import numpy as np
from tqdm import tqdm
import torch
import torch.autograd as autograd
import torch.nn.functional as F
import torch.nn as nn
import sys

sys.path.append("..")
from mnist.utils import *
from mnist.part2_mnist.train_utils import batchify_data, run_epoch, train_model

# Specify seed for deterministic behavior, then shuffle. 
# Do not change seed for official submissions to edx
np.random.seed(12321)  # for reproducibility
torch.manual_seed(12321)  # for reproducibility

# Load the dataset
num_classes = 10
X_train, y_train, X_test, y_test = get_MNIST_data()

# Split into train and dev
dev_split_index = int(9 * len(X_train) / 10)
X_dev = X_train[dev_split_index:]
y_dev = y_train[dev_split_index:]
X_train = X_train[:dev_split_index]
y_train = y_train[:dev_split_index]

permutation = np.array([i for i in range(len(X_train))])
np.random.shuffle(permutation)
X_train = [X_train[i] for i in permutation]
y_train = [y_train[i] for i in permutation]

# Split dataset into batches
batch_size = 32
train_batches = batchify_data(X_train, y_train, batch_size)
dev_batches = batchify_data(X_dev, y_dev, batch_size)
test_batches = batchify_data(X_test, y_test, batch_size)

#################################
## Model specification TODO
model = nn.Sequential(
            nn.Linear(784, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 10),
        )
lr=0.1
momentum=0
##################################

train_model(train_batches, dev_batches, model, lr=lr, momentum=momentum)

## Evaluate the model on test data
loss, accuracy = run_epoch(test_batches, model.eval(), None)

print ("Loss on test set:"  + str(loss) + " Accuracy on test set: " + str(accuracy))

# ===========================
# INITIAL ARCHITECTURE

#                                        TEST
#                       ----------------------------------------
# Baseline:             Accuracy = 0.920472 / Loss = 0.267226
# Batch Size 64:        Accuracy = 0.931490 / Loss = 0.24238465
# Learning Rate 0.01:   Accuracy = 0.920673 / Loss = 0.278865
# Momentum 0.9:         Accuracy = 0.859375 / Loss = 0.541848
# Leaky ReLU:           Accuracy = 0.920773 / Loss = 0.2689

# ============================
# HIDDEN REPRESENTATION WITH 128 NEURONS

#                           VALIDATION              TEST
#                       ----------------------------------------
# Baseline:             Accuracy = 0.978275   /   0.977163
# Batch Size 64:        Accuracy = 0.976983   /   0.97435
# Learning Rate 0.01:   Accuracy = 0.955047   /   0.942708
# Momentum 0.9:         Accuracy = 0.963402   /   0.962139
# Leaky ReLU:           Accuracy = 0.978944   /   0.977263

-------------
Epoch 1:



100%|██████████| 1687/1687 [00:00<00:00, 1855.88it/s]


Train loss: 0.366998 | Train accuracy: 0.897025


100%|██████████| 187/187 [00:00<00:00, 5053.80it/s]


Val loss:   0.179281 | Val accuracy:   0.947527
-------------
Epoch 2:



100%|██████████| 1687/1687 [00:00<00:00, 1815.93it/s]


Train loss: 0.175322 | Train accuracy: 0.948818


100%|██████████| 187/187 [00:00<00:00, 4794.60it/s]


Val loss:   0.126170 | Val accuracy:   0.966076
-------------
Epoch 3:



100%|██████████| 1687/1687 [00:00<00:00, 1819.84it/s]


Train loss: 0.123239 | Train accuracy: 0.965230


100%|██████████| 187/187 [00:00<00:00, 4794.81it/s]


Val loss:   0.104606 | Val accuracy:   0.970922
-------------
Epoch 4:



100%|██████████| 1687/1687 [00:00<00:00, 1757.29it/s]


Train loss: 0.095654 | Train accuracy: 0.973085


100%|██████████| 187/187 [00:00<00:00, 4921.19it/s]


Val loss:   0.092678 | Val accuracy:   0.973095
-------------
Epoch 5:



100%|██████████| 1687/1687 [00:00<00:00, 1783.30it/s]


Train loss: 0.077786 | Train accuracy: 0.977882


100%|██████████| 187/187 [00:00<00:00, 4794.69it/s]


Val loss:   0.084781 | Val accuracy:   0.975434
-------------
Epoch 6:



100%|██████████| 1687/1687 [00:00<00:00, 1810.08it/s]


Train loss: 0.065020 | Train accuracy: 0.981884


100%|██████████| 187/187 [00:00<00:00, 5054.16it/s]


Val loss:   0.079922 | Val accuracy:   0.977106
-------------
Epoch 7:



100%|██████████| 1687/1687 [00:00<00:00, 1821.82it/s]


Train loss: 0.055229 | Train accuracy: 0.984903


100%|██████████| 187/187 [00:00<00:00, 4794.78it/s]


Val loss:   0.076733 | Val accuracy:   0.976604
-------------
Epoch 8:



100%|██████████| 1687/1687 [00:00<00:00, 1779.53it/s]


Train loss: 0.047383 | Train accuracy: 0.987478


100%|██████████| 187/187 [00:00<00:00, 4794.75it/s]


Val loss:   0.074332 | Val accuracy:   0.977607
-------------
Epoch 9:



100%|██████████| 1687/1687 [00:00<00:00, 1753.63it/s]


Train loss: 0.040840 | Train accuracy: 0.989256


100%|██████████| 187/187 [00:00<00:00, 4794.66it/s]


Val loss:   0.072652 | Val accuracy:   0.978610
-------------
Epoch 10:



100%|██████████| 1687/1687 [00:00<00:00, 1759.12it/s]


Train loss: 0.035235 | Train accuracy: 0.991146


100%|██████████| 187/187 [00:00<00:00, 5342.92it/s]


Val loss:   0.070931 | Val accuracy:   0.978944


100%|██████████| 312/312 [00:00<00:00, 5114.76it/s]

Loss on test set:0.07416471567711806 Accuracy on test set: 0.9772636217948718





### 9. Convolutional Neural Networks

#### Convolutional Neural Networks

In [4]:
import _pickle as c_pickle, gzip
import numpy as np
from tqdm import tqdm
import torch
import torch.autograd as autograd
import torch.nn.functional as F
import torch.nn as nn
import sys

sys.path.append("..")
from mnist.utils  import *
from mnist.part2_mnist.train_utils import batchify_data, run_epoch, train_model, Flatten

# Specify seed for deterministic behavior, then shuffle. Do not change seed for official submissions to edx
np.random.seed(12321)  # for reproducibility
torch.manual_seed(12321)

# Load the dataset
num_classes = 10
X_train, y_train, X_test, y_test = get_MNIST_data()

# We need to rehape the data back into a 1x28x28 image
X_train = np.reshape(X_train, (X_train.shape[0], 1, 28, 28))
X_test = np.reshape(X_test, (X_test.shape[0], 1, 28, 28))

# Split into train and dev
dev_split_index = int(9 * len(X_train) / 10)
X_dev = X_train[dev_split_index:]
y_dev = y_train[dev_split_index:]
X_train = X_train[:dev_split_index]
y_train = y_train[:dev_split_index]

permutation = np.array([i for i in range(len(X_train))])
np.random.shuffle(permutation)
X_train = [X_train[i] for i in permutation]
y_train = [y_train[i] for i in permutation]

# Split dataset into batches
batch_size = 32
train_batches = batchify_data(X_train, y_train, batch_size)
dev_batches = batchify_data(X_dev, y_dev, batch_size)
test_batches = batchify_data(X_test, y_test, batch_size)

#################################
## Model specification TODO
model = nn.Sequential(
            nn.Conv2d(1, 32, (3, 3)),         # Channels: 1 (Monochrome image). 32 Image channels
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),             # Image reduced from 28x28 to 
            nn.Conv2d(32, 64, (3, 3)),        # The last Conv2d layer outputs 32 image channels. Here they are expanded to 64 channels
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.Flatten(),
            nn.Linear(1600, 128),              # Input
            nn.Dropout(0.5),
            nn.Linear(128, 10),
        )
##################################

train_model(train_batches, dev_batches, model, nesterov=True)

## Evaluate the model on test data
loss, accuracy = run_epoch(test_batches, model.eval(), None)

print ("Loss on test set:"  + str(loss) + " Accuracy on test set: " + str(accuracy))


-------------
Epoch 1:



100%|██████████| 1687/1687 [00:15<00:00, 107.61it/s]


Train loss: 0.244268 | Train accuracy: 0.923477


100%|██████████| 187/187 [00:00<00:00, 258.29it/s]


Val loss:   0.060793 | Val accuracy:   0.983122
-------------
Epoch 2:



100%|██████████| 1687/1687 [00:15<00:00, 107.84it/s]


Train loss: 0.078232 | Train accuracy: 0.976400


100%|██████████| 187/187 [00:00<00:00, 259.72it/s]


Val loss:   0.043116 | Val accuracy:   0.988302
-------------
Epoch 3:



100%|██████████| 1687/1687 [00:15<00:00, 106.98it/s]


Train loss: 0.057196 | Train accuracy: 0.983088


100%|██████████| 187/187 [00:00<00:00, 247.68it/s]


Val loss:   0.041414 | Val accuracy:   0.986798
-------------
Epoch 4:



100%|██████████| 1687/1687 [00:16<00:00, 104.53it/s]


Train loss: 0.044947 | Train accuracy: 0.986477


100%|██████████| 187/187 [00:00<00:00, 261.91it/s]


Val loss:   0.034902 | Val accuracy:   0.988302
-------------
Epoch 5:



100%|██████████| 1687/1687 [00:15<00:00, 106.91it/s]


Train loss: 0.039600 | Train accuracy: 0.987737


100%|██████████| 187/187 [00:00<00:00, 251.34it/s]


Val loss:   0.033554 | Val accuracy:   0.989639
-------------
Epoch 6:



100%|██████████| 1687/1687 [00:16<00:00, 103.63it/s]


Train loss: 0.033319 | Train accuracy: 0.989497


100%|██████████| 187/187 [00:00<00:00, 260.81it/s]


Val loss:   0.035708 | Val accuracy:   0.989138
-------------
Epoch 7:



100%|██████████| 1687/1687 [00:15<00:00, 107.38it/s]


Train loss: 0.028448 | Train accuracy: 0.991053


100%|██████████| 187/187 [00:00<00:00, 264.87it/s]


Val loss:   0.036412 | Val accuracy:   0.988803
-------------
Epoch 8:



100%|██████████| 1687/1687 [00:15<00:00, 108.00it/s]


Train loss: 0.024920 | Train accuracy: 0.991979


100%|██████████| 187/187 [00:00<00:00, 262.27it/s]


Val loss:   0.032701 | Val accuracy:   0.990976
-------------
Epoch 9:



100%|██████████| 1687/1687 [00:16<00:00, 103.97it/s]


Train loss: 0.022843 | Train accuracy: 0.992590


100%|██████████| 187/187 [00:00<00:00, 256.87it/s]


Val loss:   0.036936 | Val accuracy:   0.990809
-------------
Epoch 10:



100%|██████████| 1687/1687 [00:15<00:00, 106.87it/s]


Train loss: 0.020617 | Train accuracy: 0.993387


100%|██████████| 187/187 [00:00<00:00, 250.67it/s]


Val loss:   0.036291 | Val accuracy:   0.991477


100%|██████████| 312/312 [00:01<00:00, 259.35it/s]

Loss on test set:0.02825509168977046 Accuracy on test set: 0.9902844551282052





### 10. Overlapping, multi-digit MNIST

#### Fully Connected Network

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mnist.part2_twodigit.train_utils import batchify_data, run_epoch, train_model, Flatten
import mnist.part2_twodigit.utils_multiMNIST as U
path_to_data_dir = './mnist/Datasets/'
use_mini_dataset = True

batch_size = 64
nb_classes = 10
nb_epoch = 30
num_classes = 10
img_rows, img_cols = 42, 28 # input image dimensions

class MLP(nn.Module):

    def __init__(self, input_dimension):
        super(MLP, self).__init__()
        self.flatten = Flatten()
        # TODO initialize model layers here

    def forward(self, x):
        xf = self.flatten(x)

        # TODO use model layers to predict the two digits

        return out_first_digit, out_second_digit

def main():
    X_train, y_train, X_test, y_test = U.get_data(path_to_data_dir, use_mini_dataset)

    # Split into train and dev
    dev_split_index = int(9 * len(X_train) / 10)
    X_dev = X_train[dev_split_index:]
    y_dev = [y_train[0][dev_split_index:], y_train[1][dev_split_index:]]
    X_train = X_train[:dev_split_index]
    y_train = [y_train[0][:dev_split_index], y_train[1][:dev_split_index]]

    permutation = np.array([i for i in range(len(X_train))])
    np.random.shuffle(permutation)
    X_train = [X_train[i] for i in permutation]
    y_train = [[y_train[0][i] for i in permutation], [y_train[1][i] for i in permutation]]

    # Split dataset into batches
    train_batches = batchify_data(X_train, y_train, batch_size)
    dev_batches = batchify_data(X_dev, y_dev, batch_size)
    test_batches = batchify_data(X_test, y_test, batch_size)

    # Load model
    input_dimension = img_rows * img_cols
    model = MLP(input_dimension) # TODO add proper layers to MLP class above

    # Train
    train_model(train_batches, dev_batches, model)

    ## Evaluate the model on test data
    loss, acc = run_epoch(test_batches, model.eval(), None)
    print('Test loss1: {:.6f}  accuracy1: {:.6f}  loss2: {:.6f}   accuracy2: {:.6f}'.format(loss[0], acc[0], loss[1], acc[1]))

np.random.seed(12321)  # for reproducibility
torch.manual_seed(12321)  # for reproducibility
main()

  'x': torch.tensor(x_data[i:i + batch_size],


ValueError: optimizer got an empty parameter list