# Neural Networks

Table of contents

✔ Chapter 1.  Fully connected layer

To install further python libraries, type

`!pip install --target=$my_path [LIBRARY_NAME]`

# Chapter 1-1. Implement from scratch and train/test against fake data


In [1]:
# Base class
class Layer:
    """Abstract interface shared by every layer of the network.

    Concrete layers override both methods; the versions here only raise
    ``NotImplementedError``.
    """

    def __init__(self):
        # Placeholders for the most recent input and output of the layer.
        self.input = self.output = None

    def forward_pass(self, input):
        """Compute the output Y of the layer for a given input X."""
        raise NotImplementedError()

    def backward_propagation(self, output_error, learning_rate):
        """Compute dJ/dw and dJ/db and update the layer parameters."""
        raise NotImplementedError()

In [2]:
import numpy as np

# inherit from base class Layer
class FCLayer(Layer):
    """Fully connected layer computing ``output = input . weights + bias``.

    ``weights`` has shape (input_size, output_size) and ``bias`` has shape
    (1, output_size); both are drawn uniformly from [-0.5, 0.5).
    """

    def __init__(self, input_size, output_size):
        """Create the layer.

        input_size: number of input neurons.
        output_size: number of output neurons.
        """
        # Initialise the input/output placeholders declared by the base class.
        super().__init__()
        self.weights = np.random.rand(input_size, output_size) - 0.5
        self.bias = np.random.rand(1, output_size) - 0.5

    def forward_pass(self, input_data):
        """Store ``input_data`` and return ``input_data . weights + bias``."""
        self.input = input_data
        self.output = np.dot(self.input, self.weights) + self.bias
        return self.output

    def backward_propagation(self, loss_d, learning_rate):
        """Update weights and bias, then return the gradient for the previous layer.

        loss_d: dJ/dz, the loss gradient with respect to this layer's output.
        learning_rate: step size of the gradient-descent update.

        Returns dJ/dz . W^T, computed with the weights as they were before
        the update.
        """
        # Must be computed before the weights are modified below.
        next_input = np.dot(loss_d, self.weights.T)

        # dJ/dW = X^T . dJ/dz
        weights_error = np.dot(self.input.T, loss_d)
        # dJ/db sums dJ/dz over the sample axis so that it always has the
        # bias shape (1, output_size). For a single-row loss_d this equals
        # loss_d itself; for several rows it avoids a broadcasting error in
        # the in-place update.
        bias_error = np.sum(np.atleast_2d(loss_d), axis=0, keepdims=True)

        # Gradient-descent step.
        self.weights -= learning_rate * weights_error
        self.bias -= learning_rate * bias_error
        return next_input

In [3]:
# inherit from base class Layer
class ActivationLayer(Layer):
    """Layer that applies an activation function to its input.

    It has no trainable parameters.
    """

    def __init__(self, activation, activation_prime):
        """Create the layer.

        activation: callable applied to the input in the forward pass.
        activation_prime: callable giving the derivative of ``activation``.
        """
        # Initialise the input/output placeholders declared by the base class.
        super().__init__()
        self.activation = activation
        self.activation_prime = activation_prime

    def forward_pass(self, input_data):
        """Store ``input_data`` and return the activated input."""
        self.input = input_data
        self.output = self.activation(self.input)
        return self.output

    def backward_propagation(self, loss_d, learning_rate):
        """Return activation'(input) * dJ/da for the next backpropagation step.

        ``learning_rate`` is accepted for interface compatibility but is not
        used, because there is nothing to update.
        """
        return self.activation_prime(self.input) * loss_d

In [4]:
import numpy as np

# activation function and its derivative
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), element-wise.

    The value is computed from exp(-|x|), which always lies in (0, 1], so
    large-magnitude inputs do not overflow:
        x >= 0:  1 / (1 + exp(-x))
        x <  0:  exp(x) / (1 + exp(x))
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    # Both branches share the denominator 1 + exp(-|x|); only the numerator differs.
    numerator = np.where(x >= 0, 1.0, decay)
    return numerator / (1.0 + decay)

def sigmoid_prime(x):
    """Return the derivative of the sigmoid, s(x) * (1 - s(x)), element-wise."""
    # Evaluate the sigmoid once and reuse it for both factors.
    s = sigmoid(x)
    return s * (1 - s)

def tanh(x):
    """Return the hyperbolic tangent of ``x``, element-wise."""
    return np.tanh(x)

def tanh_prime(x):
    """Return the derivative of tanh, 1 - tanh(x)^2, element-wise."""
    squared = np.tanh(x) ** 2
    return 1 - squared

In [5]:
import numpy as np

# loss function and its derivative
def mse(y_true, y_pred):
    """Return the mean of the squared differences between target and prediction."""
    residual = y_true - y_pred
    return np.mean(residual ** 2)

def mse_prime(y_true, y_pred):
    """Return the gradient of the MSE with respect to ``y_pred``.

    The result is 2 * (y_pred - y_true) divided by the number of elements
    of ``y_true``.
    """
    difference = y_pred - y_true
    return 2 * difference / y_true.size

In [6]:
class Network:
    """Sequential stack of layers trained with per-sample gradient descent."""

    def __init__(self):
        self.layers = []
        # Loss function and its derivative; set through compute_loss().
        self.loss = None
        self.loss_prime = None

    def add(self, layer):
        """Append ``layer`` to the end of the network."""
        self.layers.append(layer)

    def compute_loss(self, loss, loss_prime):
        """Register the loss function and its derivative used by fit()."""
        self.loss = loss
        self.loss_prime = loss_prime

    def _forward(self, sample):
        """Run one sample through every layer in order and return the output."""
        for layer in self.layers:
            sample = layer.forward_pass(sample)
        return sample

    def predict(self, input_data):
        """Return a list with the network output for each sample of ``input_data``."""
        return [self._forward(sample) for sample in input_data]

    def fit(self, x_train, y_train, epochs, learning_rate):
        """Train the network and print the mean loss after every epoch.

        Each epoch visits every sample once, in order, and the layers update
        their parameters after each individual sample.

        x_train, y_train: indexable collections of inputs and targets.
        epochs: number of passes over the training set.
        learning_rate: step size forwarded to every layer.

        Raises ValueError if ``x_train`` is empty (the mean loss would be
        undefined).
        """
        samples = len(x_train)
        if samples == 0:
            raise ValueError("x_train must contain at least one sample")

        for epoch in range(epochs):
            err = 0
            for j in range(samples):
                # Forward pass. The last layer is an activation layer, so
                # its output is denoted a (an FC layer outputs z = wx + b).
                a = self._forward(x_train[j])

                # Accumulate the loss (for display purposes only).
                err += self.loss(y_train[j], a)

                # Backward pass: start from dJ/da and let each layer turn it
                # into the gradient expected by the layer before it.
                loss_d = self.loss_prime(y_train[j], a)
                for layer in reversed(self.layers):
                    loss_d = layer.backward_propagation(loss_d, learning_rate)

            # Average error over all samples of this epoch.
            err /= samples
            print('epoch %d/%d   error=%f' % (epoch + 1, epochs, err))

In [7]:
# XOR training set: four samples, each stored as a (1, 2) row vector,
# with a (1, 1) target per sample
x_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).reshape(4, 1, 2)
y_train = np.array([0, 1, 1, 0]).reshape(4, 1, 1)
for split in (x_train, y_train):
    print(split.shape)


(4, 1, 2)
(4, 1, 1)


In [8]:
# Build a 2-3-1 network with sigmoid activations
net = Network()
for layer in (
    FCLayer(2, 3),                              # input (1,2), output (1,3)
    ActivationLayer(sigmoid, sigmoid_prime),    # element-wise application
    FCLayer(3, 1),                              # input (1,3), output (1,1)
    ActivationLayer(sigmoid, sigmoid_prime),    # element-wise application
):
    net.add(layer)

# Register the MSE loss with its derivative, then train
net.compute_loss(loss=mse, loss_prime=mse_prime)
net.fit(x_train, y_train, epochs=1000, learning_rate=0.1)

# Evaluate on the training inputs themselves
out = net.predict(x_train)
print(out)

epoch 1/1000   error=0.263809
epoch 2/1000   error=0.262578
epoch 3/1000   error=0.261530
epoch 4/1000   error=0.260640
epoch 5/1000   error=0.259887
epoch 6/1000   error=0.259250
epoch 7/1000   error=0.258713
epoch 8/1000   error=0.258261
epoch 9/1000   error=0.257881
epoch 10/1000   error=0.257562
epoch 11/1000   error=0.257294
epoch 12/1000   error=0.257069
epoch 13/1000   error=0.256881
epoch 14/1000   error=0.256723
epoch 15/1000   error=0.256591
epoch 16/1000   error=0.256480
epoch 17/1000   error=0.256387
epoch 18/1000   error=0.256310
epoch 19/1000   error=0.256245
epoch 20/1000   error=0.256190
epoch 21/1000   error=0.256145
epoch 22/1000   error=0.256107
epoch 23/1000   error=0.256075
epoch 24/1000   error=0.256048
epoch 25/1000   error=0.256025
epoch 26/1000   error=0.256006
epoch 27/1000   error=0.255990
epoch 28/1000   error=0.255977
epoch 29/1000   error=0.255965
epoch 30/1000   error=0.255956
epoch 31/1000   error=0.255947
epoch 32/1000   error=0.255941
epoch 33/1000   e

The error barely decreases: it plateaus at about 0.256. Let's change the activation function to tanh.

In [9]:
# Same 2-3-1 network, but with tanh as the activation function
net = Network()
for layer in (
    FCLayer(2, 3),                          # input (1,2), output (1,3)
    ActivationLayer(tanh, tanh_prime),      # changed activation function
    FCLayer(3, 1),                          # input (1,3), output (1,1)
    ActivationLayer(tanh, tanh_prime),      # changed activation function
):
    net.add(layer)

# Register the MSE loss with its derivative, then train
net.compute_loss(loss=mse, loss_prime=mse_prime)
net.fit(x_train, y_train, epochs=1000, learning_rate=0.1)

# Evaluate on the training inputs themselves
out = net.predict(x_train)
print(out)

epoch 1/1000   error=0.424803
epoch 2/1000   error=0.324308
epoch 3/1000   error=0.315148
epoch 4/1000   error=0.311753
epoch 5/1000   error=0.309534
epoch 6/1000   error=0.307753
epoch 7/1000   error=0.306207
epoch 8/1000   error=0.304811
epoch 9/1000   error=0.303522
epoch 10/1000   error=0.302313
epoch 11/1000   error=0.301164
epoch 12/1000   error=0.300061
epoch 13/1000   error=0.298993
epoch 14/1000   error=0.297950
epoch 15/1000   error=0.296926
epoch 16/1000   error=0.295914
epoch 17/1000   error=0.294908
epoch 18/1000   error=0.293904
epoch 19/1000   error=0.292898
epoch 20/1000   error=0.291886
epoch 21/1000   error=0.290867
epoch 22/1000   error=0.289837
epoch 23/1000   error=0.288794
epoch 24/1000   error=0.287737
epoch 25/1000   error=0.286663
epoch 26/1000   error=0.285572
epoch 27/1000   error=0.284462
epoch 28/1000   error=0.283332
epoch 29/1000   error=0.282182
epoch 30/1000   error=0.281012
epoch 31/1000   error=0.279821
epoch 32/1000   error=0.278608
epoch 33/1000   e

Now it works out!

# Chapter 1-2. train/test against MNIST data

Use MNIST dataset


*   Source: keras



In [10]:
from keras.datasets import mnist
from keras.utils import np_utils

# Load MNIST through Keras and unpack the train and test splits
mnist_train, mnist_test = mnist.load_data()
x_train, y_train = mnist_train
x_test, y_test = mnist_test

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [11]:
# Training data (60000 samples): flatten each 28x28 image into a
# (1, 784) float32 row vector
num_train = x_train.shape[0]
x_train = x_train.reshape(num_train, 1, 28 * 28).astype('float32')
print(x_train[0])   # before scaling
# Normalize: divide every pixel value by 255
x_train /= 255
print(x_train[0])   # after scaling
# One-hot encoding: turn each label in [0, 9] into a vector of size 10
# (e.g. 3 -> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0])
print(y_train[0])
y_train = np_utils.to_categorical(y_train)
print(y_train[0])

[[  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   3.  18.
   18.  18. 126. 136. 175.  26. 166. 255. 247. 127.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.  30.  36.  94. 154. 170. 253.
  253. 253. 253. 253. 225. 172. 253. 242. 195.  64.   0.   0.   

In [12]:
# Test data: flatten to (1, 784) float32 row vectors, divide by 255,
# and one-hot encode the labels
num_test = x_test.shape[0]
x_test = x_test.reshape(num_test, 1, 28 * 28).astype('float32')
x_test /= 255
y_test = np_utils.to_categorical(y_test)

In [13]:
class Network:
    """Sequential stack of layers trained on a random subset of samples per epoch."""

    def __init__(self):
        self.layers = []
        # Loss function and its derivative; set through compute_loss().
        self.loss = None
        self.loss_prime = None

    def add(self, layer):
        """Append ``layer`` to the end of the network."""
        self.layers.append(layer)

    def compute_loss(self, loss, loss_prime):
        """Register the loss function and its derivative used by fit()."""
        self.loss = loss
        self.loss_prime = loss_prime

    def _forward(self, sample):
        """Run one sample through every layer in order and return the output."""
        for layer in self.layers:
            sample = layer.forward_pass(sample)
        return sample

    def predict(self, input_data):
        """Return a list with the network output for each sample of ``input_data``.

        The sample dimension must come first.
        """
        return [self._forward(sample) for sample in input_data]

    def fit(self, x_train, y_train, epochs, learning_rate, batch_size):
        """Train the network and print the mean loss after every epoch.

        Each epoch shuffles the sample indices and trains on the first
        ``batch_size`` of them (all of them if the data set is smaller).
        The layers update their parameters after each individual sample.

        x_train, y_train: indexable collections of inputs and targets.
        epochs: number of training rounds.
        learning_rate: step size forwarded to every layer.
        batch_size: number of randomly chosen samples used per epoch.

        Raises ValueError if ``batch_size`` is not positive or ``x_train``
        is empty (the mean loss would be undefined).
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        if len(x_train) == 0:
            raise ValueError("x_train must contain at least one sample")

        indices = np.arange(len(x_train))

        for epoch in range(epochs):
            err = 0
            # Draw this epoch's random subset of samples.
            np.random.shuffle(indices)
            batch = indices[:batch_size]
            for j in batch:
                # Forward pass. The last layer is an activation layer, so
                # its output is denoted a (an FC layer outputs z = wx + b).
                a = self._forward(x_train[j])

                # Accumulate the loss (for display purposes only).
                err += self.loss(y_train[j], a)

                # Backward pass: start from dJ/da and let each layer turn it
                # into the gradient expected by the layer before it.
                loss_d = self.loss_prime(y_train[j], a)
                for layer in reversed(self.layers):
                    loss_d = layer.backward_propagation(loss_d, learning_rate)

            # Average over the samples actually processed; this is fewer
            # than batch_size when the data set is smaller than the batch.
            err /= len(batch)
            print('epoch %d/%d   error=%f' % (epoch + 1, epochs, err))

In [14]:
# Baseline MNIST model: 784 -> 100 -> 50 -> 10, tanh after every FC layer
net = Network()
for layer in (
    FCLayer(28*28, 100),                    # input (1, 28*28), output (1, 100)
    ActivationLayer(tanh, tanh_prime),
    FCLayer(100, 50),                       # input (1, 100), output (1, 50)
    ActivationLayer(tanh, tanh_prime),
    FCLayer(50, 10),                        # input (1, 50), output (1, 10)
    ActivationLayer(tanh, tanh_prime),
):
    net.add(layer)

net.compute_loss(loss=mse, loss_prime=mse_prime)
net.fit(x_train, y_train, epochs=500, learning_rate=0.1, batch_size=256)


epoch 1/500   error=0.478091
epoch 2/500   error=0.231900
epoch 3/500   error=0.151132
epoch 4/500   error=0.129323
epoch 5/500   error=0.126178
epoch 6/500   error=0.116678
epoch 7/500   error=0.108930
epoch 8/500   error=0.107085
epoch 9/500   error=0.097637
epoch 10/500   error=0.102030
epoch 11/500   error=0.098433
epoch 12/500   error=0.088355
epoch 13/500   error=0.097030
epoch 14/500   error=0.089046
epoch 15/500   error=0.087939
epoch 16/500   error=0.089958
epoch 17/500   error=0.074602
epoch 18/500   error=0.080989
epoch 19/500   error=0.085913
epoch 20/500   error=0.083337
epoch 21/500   error=0.073696
epoch 22/500   error=0.072018
epoch 23/500   error=0.073516
epoch 24/500   error=0.064846
epoch 25/500   error=0.062119
epoch 26/500   error=0.067348
epoch 27/500   error=0.063862
epoch 28/500   error=0.059175
epoch 29/500   error=0.062418
epoch 30/500   error=0.065417
epoch 31/500   error=0.055046
epoch 32/500   error=0.058623
epoch 33/500   error=0.051688
epoch 34/500   erro

In [15]:
# Accuracy: fraction of test samples whose arg-max prediction matches the label
out = net.predict(x_test)
tp = sum(
    1
    for pred, true in zip(out, y_test)
    if np.argmax(pred) == np.argmax(true)
)
print(tp / len(x_test))

0.9299


# [Extra Credit] Could you improve the performance? 
Any strategy is acceptable, such as a different activation function, learning rate, or loss function; adding more layers and neurons; or changing the number of epochs and the batch size.
**(Deadline:12/16 FRI 23:59)**



### Submitters are as follows.
 *Note that hands-on assignments can be done collaboratively (**up to 2 students**)*

    Name: JungWon Kim
    Student ID: 2076084

    Name:
    Student ID:

TO-DO: Using the Network class provided above, improve the initial performance we got. Note that the model obtained 92.1% accuracy on average using five experiments (91.2%, 90.3%, 93.47%, 92.65%, 92.82%).
- Feel free to add and modify all the codes provided above
- Run the five experiments and report the average accuracy
- Discuss what changes/additions you made to improve the performance of the neural network model   

## Modification 1
- added one more layer 

In [16]:
# Modification 1 model: 784 -> 100 -> 50 -> 100 -> 10, tanh after every FC layer
net = Network()
for layer in (
    FCLayer(28*28, 100),                    # input (1, 28*28), output (1, 100)
    ActivationLayer(tanh, tanh_prime),
    FCLayer(100, 50),                       # input (1, 100), output (1, 50)
    ActivationLayer(tanh, tanh_prime),
    FCLayer(50, 100),                       # input (1, 50), output (1, 100)
    ActivationLayer(tanh, tanh_prime),
    FCLayer(100, 10),                       # input (1, 100), output (1, 10)
    ActivationLayer(tanh, tanh_prime),
):
    net.add(layer)

net.compute_loss(loss=mse, loss_prime=mse_prime)
net.fit(x_train, y_train, epochs=500, learning_rate=0.1, batch_size=256)


epoch 1/500   error=0.541185
epoch 2/500   error=0.340344
epoch 3/500   error=0.268075
epoch 4/500   error=0.234475
epoch 5/500   error=0.225783
epoch 6/500   error=0.212241
epoch 7/500   error=0.205108
epoch 8/500   error=0.180319
epoch 9/500   error=0.186460
epoch 10/500   error=0.177996
epoch 11/500   error=0.170437
epoch 12/500   error=0.165937
epoch 13/500   error=0.151570
epoch 14/500   error=0.144096
epoch 15/500   error=0.128690
epoch 16/500   error=0.143962
epoch 17/500   error=0.135159
epoch 18/500   error=0.141370
epoch 19/500   error=0.132055
epoch 20/500   error=0.130836
epoch 21/500   error=0.131343
epoch 22/500   error=0.114002
epoch 23/500   error=0.115947
epoch 24/500   error=0.112788
epoch 25/500   error=0.120190
epoch 26/500   error=0.117711
epoch 27/500   error=0.113246
epoch 28/500   error=0.117864
epoch 29/500   error=0.104924
epoch 30/500   error=0.106759
epoch 31/500   error=0.096025
epoch 32/500   error=0.105821
epoch 33/500   error=0.101670
epoch 34/500   erro

In [17]:
# Accuracy: fraction of test samples whose arg-max prediction matches the label
out = net.predict(x_test)
tp = sum(
    1
    for pred, true in zip(out, y_test)
    if np.argmax(pred) == np.argmax(true)
)
print(tp / len(x_test))

0.9275


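The accuracy is 92.75%: slightly above the 92.1% five-run baseline average quoted above, but slightly below the 92.99% of the single baseline run in this notebook, so the extra layer gives no clear improvement.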

## Modification 2

Increased the batch size from 256 to 512.

In [18]:
# Modification 2 model: baseline architecture, trained with batch_size=512
net = Network()
for layer in (
    FCLayer(28*28, 100),                    # input (1, 28*28), output (1, 100)
    ActivationLayer(tanh, tanh_prime),
    FCLayer(100, 50),                       # input (1, 100), output (1, 50)
    ActivationLayer(tanh, tanh_prime),
    FCLayer(50, 10),                        # input (1, 50), output (1, 10)
    ActivationLayer(tanh, tanh_prime),
):
    net.add(layer)

net.compute_loss(loss=mse, loss_prime=mse_prime)
net.fit(x_train, y_train, epochs=500, learning_rate=0.1, batch_size=512)


epoch 1/500   error=0.343800
epoch 2/500   error=0.142834
epoch 3/500   error=0.120163
epoch 4/500   error=0.114011
epoch 5/500   error=0.105291
epoch 6/500   error=0.098479
epoch 7/500   error=0.085989
epoch 8/500   error=0.089401
epoch 9/500   error=0.081564
epoch 10/500   error=0.079704
epoch 11/500   error=0.074044
epoch 12/500   error=0.076392
epoch 13/500   error=0.072945
epoch 14/500   error=0.064415
epoch 15/500   error=0.067233
epoch 16/500   error=0.060313
epoch 17/500   error=0.064131
epoch 18/500   error=0.055053
epoch 19/500   error=0.058854
epoch 20/500   error=0.052411
epoch 21/500   error=0.052086
epoch 22/500   error=0.060764
epoch 23/500   error=0.054228
epoch 24/500   error=0.054904
epoch 25/500   error=0.050176
epoch 26/500   error=0.049857
epoch 27/500   error=0.051313
epoch 28/500   error=0.051627
epoch 29/500   error=0.049224
epoch 30/500   error=0.044727
epoch 31/500   error=0.045933
epoch 32/500   error=0.045123
epoch 33/500   error=0.043299
epoch 34/500   erro

In [19]:
# Accuracy: fraction of test samples whose arg-max prediction matches the label
out = net.predict(x_test)
tp = sum(
    1
    for pred, true in zip(out, y_test)
    if np.argmax(pred) == np.argmax(true)
)
print(tp / len(x_test))

0.9411


We can see the accuracy increased to 94.11%.

### Modification 3
Increased the learning rate from 0.1 to 0.25.


In [20]:
# Modification 3 model: baseline architecture, trained with learning_rate=0.25
net = Network()
for layer in (
    FCLayer(28*28, 100),                    # input (1, 28*28), output (1, 100)
    ActivationLayer(tanh, tanh_prime),
    FCLayer(100, 50),                       # input (1, 100), output (1, 50)
    ActivationLayer(tanh, tanh_prime),
    FCLayer(50, 10),                        # input (1, 50), output (1, 10)
    ActivationLayer(tanh, tanh_prime),
):
    net.add(layer)

net.compute_loss(loss=mse, loss_prime=mse_prime)
net.fit(x_train, y_train, epochs=500, learning_rate=0.25, batch_size=256)


epoch 1/500   error=0.514421
epoch 2/500   error=0.333396
epoch 3/500   error=0.273577
epoch 4/500   error=0.271350
epoch 5/500   error=0.244830
epoch 6/500   error=0.251143
epoch 7/500   error=0.243138
epoch 8/500   error=0.225861
epoch 9/500   error=0.208194
epoch 10/500   error=0.197409
epoch 11/500   error=0.192042
epoch 12/500   error=0.193170
epoch 13/500   error=0.197939
epoch 14/500   error=0.188521
epoch 15/500   error=0.161210
epoch 16/500   error=0.174641
epoch 17/500   error=0.176487
epoch 18/500   error=0.166915
epoch 19/500   error=0.156196
epoch 20/500   error=0.167490
epoch 21/500   error=0.170247
epoch 22/500   error=0.150971
epoch 23/500   error=0.152150
epoch 24/500   error=0.161198
epoch 25/500   error=0.146892
epoch 26/500   error=0.144408
epoch 27/500   error=0.150950
epoch 28/500   error=0.147867
epoch 29/500   error=0.140241
epoch 30/500   error=0.153105
epoch 31/500   error=0.154405
epoch 32/500   error=0.150164
epoch 33/500   error=0.141264
epoch 34/500   erro

In [21]:
# Accuracy: fraction of test samples whose arg-max prediction matches the label
out = net.predict(x_test)
tp = sum(
    1
    for pred, true in zip(out, y_test)
    if np.argmax(pred) == np.argmax(true)
)
print(tp / len(x_test))

0.9267


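The accuracy is 92.67%: above the 92.1% five-run baseline average quoted above, but slightly below the 92.99% of the single baseline run in this notebook.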