In [1]:
print("What can go wrong if we have a wide range of numbers in our input/output data and we don't do any pre-processing on them and feed the neural network with unprocessed data\n")

print("If one of the variables is in the range of 1000’s and another is in the range of 0.1’s then the coefficient found by neural network for the first variable would be (most likely) much larger than the the other. This does not really show if the first variable is more important or not, but this coefficient needs to be large just to compensate for the scale of that variable")

What can go wrong if we have a wide range of numbers in our input/output data and we don't do any pre-processing on them and feed the neural network with unprocessed data

If one of the variables is in the range of 1000’s and another is in the range of 0.1’s then the coefficient found by neural network for the first variable would be (most likely) much larger than the the other. This does not really show if the first variable is more important or not, but this coefficient needs to be large just to compensate for the scale of that variable


In [2]:
print("How do we tackle this problem? (Hint: Normalization, standardization)\n")

print("To handle this issue we rescale the input variable in range (0,1) by using normalization or we can standardize the values to have a mean of 0 and a standard deviation of 1")

How do we tackle this problem? (Hint: Normalization, standardization)

To handle this issue we rescale the input variable in range (0,1) by using normalization or we can standardize the values to have a mean of 0 and a standard deviation of 1


In [3]:
import numpy as np
import pandas as pd
data = pd.read_csv("mushrooms.csv")

In [4]:
# Base class
class Layer:
    def __init__(self):
        self.input = None
        self.output = None

    # computes the output Y of a layer for a given input X
    
    def forward_propagation(self, input):
        raise NotImplementedError

    # computes dE/dX for a given dE/dY (and update parameters if any)
    def backward_propagation(self, output_error, learning_rate):
        raise NotImplementedError

In [5]:
#from layer import Layer
import numpy as np

# inherit from base class Layer
class FCLayer(Layer):
    # input_size = number of input neurons
    # output_size = number of edges that connects to neurons in next layer
    def __init__(self, input_size, output_size):
        self.weights = np.random.rand(input_size, output_size) - 0.5
        self.bias = np.random.rand(1, output_size) - 0.5

    # returns output for a given input
    def forward_propagation(self, input_data):
        self.input = input_data
        self.output = np.dot(self.input, self.weights) + self.bias
        return self.output

    # computes dE/dW, dE/dB for a given output_error=dE/dY. Returns input_error=dE/dX.
    def backward_propagation(self, output_error, learning_rate):
        input_error = np.dot(output_error, self.weights.T)
        weights_error = np.dot(self.input.T, output_error)
        # dBias = output_error

        # update parameters
        self.weights -= learning_rate * weights_error
        self.bias -= learning_rate * output_error
        return input_error

In [6]:
#from layer import Layer

# inherit from base class Layer
class ActivationLayer(Layer):
    def __init__(self, activation, activation_prime):
        self.activation = activation
        self.activation_prime = activation_prime

    # returns the activated input
    def forward_propagation(self, input_data):
        self.input = input_data
        self.output = self.activation(self.input)
        return self.output

    # Returns input_error=dE/dX for a given output_error=dE/dY.
    # learning_rate is not used because there is no "learnable" parameters.
    def backward_propagation(self, output_error, learning_rate):
        return self.activation_prime(self.input) * output_error

In [7]:
import numpy as np

# activation function and its derivative
def tanh(x):
    return np.tanh(x);

def tanh_prime(x):
    return 1-np.tanh(x)**2;

In [8]:

import numpy as np

# loss function and its derivative
def mse(y_true, y_pred):
    return np.mean(np.power(y_true-y_pred, 2));

def mse_prime(y_true, y_pred):
    return 2*(y_pred-y_true)/y_true.size;

In [9]:
# example of a function for calculating softmax for a list of numbers
from numpy import exp
 
# calculate the softmax of a vector
def softmax(vector):
    e = exp(vector)
    return e / e.sum()

In [10]:
class Network:
    def __init__(self):
        self.layers = []
        self.loss = None
        self.loss_prime = None

    # add layer to network
    def add(self, layer):
        self.layers.append(layer)

    # set loss to use
    def use(self, loss, loss_prime):
        self.loss = loss
        self.loss_prime = loss_prime

        
    # predict output for given input
    def predict(self, input_data):
        # sample dimension first
        samples = len(input_data)
        result = []

        # run network over all samples
        for i in range(samples):
            # forward propagation
            output = input_data[i]
            for layer in self.layers:
                output = layer.forward_propagation(output)
            result.append(output)

        return result

    # train the network 
    
    def fit(self, x_train, y_train, epochs, learning_rate):
        '''
        Fit function does the training. 
        Training data is passed 1-by-1 through the network layers during forward propagation.
        Loss (error) is calculated for each input and back propagation is performed via partial 
        derivatives on each layer.
        '''
        # sample dimension first
        samples = len(x_train)

        # training loop
        for i in range(epochs):
            err = 0
            for j in range(samples):
                # forward propagation
                output = x_train[j]
                for layer in self.layers:
                    output = layer.forward_propagation(output)

                # compute loss (for display purpose only)
                err += self.loss(y_train[j], output)

                # backward propagation
                error = self.loss_prime(y_train[j], output)
                for layer in reversed(self.layers):
                    error = layer.backward_propagation(error, learning_rate)

            # calculate average error on all samples
            err /= samples
            print('epoch %d/%d   error=%f' % (i+1, epochs, err))

### Solve MNIST wihout normalization and standardization

In [11]:
report = {}

In [12]:
import numpy as np

#from network import Network
#from fc_layer import FCLayer
#from activation_layer import ActivationLayer
#from activations import tanh, tanh_prime
#from losses import mse, mse_prime

from keras.datasets import mnist
from keras.utils import np_utils

# load MNIST from server
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# training data : 60000 samples
# reshape and normalize input data
x_train = x_train.reshape(x_train.shape[0], 1, 28*28)
x_train = x_train.astype('float32')
#x_train /= 255
# encode output which is a number in range [0,9] into a vector of size 10
# e.g. number 3 will become [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
y_train = np_utils.to_categorical(y_train)

# same for test data : 10000 samples
x_test = x_test.reshape(x_test.shape[0], 1, 28*28)
x_test = x_test.astype('float32')
#x_test /= 255
y_test = np_utils.to_categorical(y_test)

# Network
net = Network()
net.add(FCLayer(28*28, 100))                # input_shape=(1, 28*28)    ;   output_shape=(1, 100)
net.add(ActivationLayer(tanh, tanh_prime))
net.add(FCLayer(100, 50))                   # input_shape=(1, 100)      ;   output_shape=(1, 50)
net.add(ActivationLayer(tanh, tanh_prime))
net.add(FCLayer(50, 10))                    # input_shape=(1, 50)       ;   output_shape=(1, 10)
net.add(ActivationLayer(tanh, tanh_prime))

# train on 1000 samples
# as we didn't implemented mini-batch GD, training will be pretty slow if we update at each iteration on 60000 samples...
net.use(mse, mse_prime)
net.fit(x_train[0:1000], y_train[0:1000], epochs=35, learning_rate=0.1)

# test on 3 samples
y_pred = net.predict(x_test)

epoch 1/35   error=0.282392
epoch 2/35   error=0.132190
epoch 3/35   error=0.124999
epoch 4/35   error=0.120291
epoch 5/35   error=0.116706
epoch 6/35   error=0.114691
epoch 7/35   error=0.115094
epoch 8/35   error=0.115340
epoch 9/35   error=0.113926
epoch 10/35   error=0.112681
epoch 11/35   error=0.111852
epoch 12/35   error=0.111107
epoch 13/35   error=0.110214
epoch 14/35   error=0.109581
epoch 15/35   error=0.109005
epoch 16/35   error=0.108457
epoch 17/35   error=0.108993
epoch 18/35   error=0.110814
epoch 19/35   error=0.110420
epoch 20/35   error=0.110062
epoch 21/35   error=0.109862
epoch 22/35   error=0.109317
epoch 23/35   error=0.108888
epoch 24/35   error=0.108505
epoch 25/35   error=0.108150
epoch 26/35   error=0.107813
epoch 27/35   error=0.107490
epoch 28/35   error=0.107178
epoch 29/35   error=0.106877
epoch 30/35   error=0.106583
epoch 31/35   error=0.106296
epoch 32/35   error=0.106015
epoch 33/35   error=0.105739
epoch 34/35   error=0.105453
epoch 35/35   error=0.1

In [13]:
y_pred_argmax = []
for i in range(len(y_pred)):
    y_pred_argmax.append(np.argmax(y_pred[i]))

In [14]:
y_test_argmax = []
for i in range(len(y_test)):
    y_test_argmax.append(np.argmax(y_test[i]))

In [15]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_pred_argmax,y_test_argmax)
report['w/o_any'] = accuracy
accuracy

0.2148

### Solve MNIST with normalization

In [16]:
def normalize(data):
    for sample in data:
        sample_max = sample[0].max()
        sample[0] = sample[0]/sample_max

In [17]:
import numpy as np

#from network import Network
#from fc_layer import FCLayer
#from activation_layer import ActivationLayer
#from activations import tanh, tanh_prime
#from losses import mse, mse_prime

from keras.datasets import mnist
from keras.utils import np_utils

# load MNIST from server
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# training data : 60000 samples
# reshape and normalize input data
x_train = x_train.reshape(x_train.shape[0], 1, 28*28)
x_train = x_train.astype('float32')
print(x_train.shape)
normalize(x_train)
# encode output which is a number in range [0,9] into a vector of size 10
# e.g. number 3 will become [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
y_train = np_utils.to_categorical(y_train)

# same for test data : 10000 samples
x_test = x_test.reshape(x_test.shape[0], 1, 28*28)
x_test = x_test.astype('float32')
normalize(x_test)
y_test = np_utils.to_categorical(y_test)

# Network
net = Network()
net.add(FCLayer(28*28, 100))                # input_shape=(1, 28*28)    ;   output_shape=(1, 100)
net.add(ActivationLayer(tanh, tanh_prime))
net.add(FCLayer(100, 50))                   # input_shape=(1, 100)      ;   output_shape=(1, 50)
net.add(ActivationLayer(tanh, tanh_prime))
net.add(FCLayer(50, 10))                    # input_shape=(1, 50)       ;   output_shape=(1, 10)
net.add(ActivationLayer(tanh, tanh_prime))

# train on 1000 samples
# as we didn't implemented mini-batch GD, training will be pretty slow if we update at each iteration on 60000 samples...
net.use(mse, mse_prime)
net.fit(x_train[0:1000], y_train[0:1000], epochs=35, learning_rate=0.1)

# test on 3 samples
y_pred = net.predict(x_test)

(60000, 1, 784)
epoch 1/35   error=0.235308
epoch 2/35   error=0.094617
epoch 3/35   error=0.072925
epoch 4/35   error=0.060449
epoch 5/35   error=0.051586
epoch 6/35   error=0.044783
epoch 7/35   error=0.039248
epoch 8/35   error=0.034798
epoch 9/35   error=0.031083
epoch 10/35   error=0.028240
epoch 11/35   error=0.026067
epoch 12/35   error=0.024224
epoch 13/35   error=0.022646
epoch 14/35   error=0.021264
epoch 15/35   error=0.020058
epoch 16/35   error=0.018976
epoch 17/35   error=0.017990
epoch 18/35   error=0.017132
epoch 19/35   error=0.016290
epoch 20/35   error=0.015520
epoch 21/35   error=0.014716
epoch 22/35   error=0.013975
epoch 23/35   error=0.013295
epoch 24/35   error=0.012632
epoch 25/35   error=0.012001
epoch 26/35   error=0.011380
epoch 27/35   error=0.010866
epoch 28/35   error=0.010388
epoch 29/35   error=0.010005
epoch 30/35   error=0.009624
epoch 31/35   error=0.009311
epoch 32/35   error=0.008977
epoch 33/35   error=0.008712
epoch 34/35   error=0.008446
epoch 3

In [18]:
y_pred_argmax = []
for i in range(len(y_pred)):
    y_pred_argmax.append(np.argmax(y_pred[i]))

In [19]:
y_test_argmax = []
for i in range(len(y_test)):
    y_test_argmax.append(np.argmax(y_test[i]))

In [20]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_pred_argmax,y_test_argmax)
report['normalization'] = accuracy
accuracy

0.794

### Solve MNIST with standardization

In [21]:
def standardize(data):
    for sample in data:
        sample_mean = sample[0].mean()
        sample_std = sample[0].std()
        sample[0] = (sample[0]-sample_mean)/sample_std

In [22]:
import numpy as np

#from network import Network
#from fc_layer import FCLayer
#from activation_layer import ActivationLayer
#from activations import tanh, tanh_prime
#from losses import mse, mse_prime

from keras.datasets import mnist
from keras.utils import np_utils

# load MNIST from server
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# training data : 60000 samples
# reshape and normalize input data
x_train = x_train.reshape(x_train.shape[0], 1, 28*28)
x_train = x_train.astype('float32')
standardize(x_train)
# encode output which is a number in range [0,9] into a vector of size 10
# e.g. number 3 will become [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
y_train = np_utils.to_categorical(y_train)

# same for test data : 10000 samples
x_test = x_test.reshape(x_test.shape[0], 1, 28*28)
x_test = x_test.astype('float32')
normalize(x_test)
y_test = np_utils.to_categorical(y_test)

# Network
net = Network()
net.add(FCLayer(28*28, 100))                # input_shape=(1, 28*28)    ;   output_shape=(1, 100)
net.add(ActivationLayer(tanh, tanh_prime))
net.add(FCLayer(100, 50))                   # input_shape=(1, 100)      ;   output_shape=(1, 50)
net.add(ActivationLayer(tanh, tanh_prime))
net.add(FCLayer(50, 10))                    # input_shape=(1, 50)       ;   output_shape=(1, 10)
net.add(ActivationLayer(tanh, tanh_prime))

# train on 1000 samples
# as we didn't implemented mini-batch GD, training will be pretty slow if we update at each iteration on 60000 samples...
net.use(mse, mse_prime)
net.fit(x_train[0:1000], y_train[0:1000], epochs=35, learning_rate=0.1)

# test on 3 samples
y_pred = net.predict(x_test)

epoch 1/35   error=0.277231
epoch 2/35   error=0.098760
epoch 3/35   error=0.075744
epoch 4/35   error=0.063000
epoch 5/35   error=0.053754
epoch 6/35   error=0.046423
epoch 7/35   error=0.041075
epoch 8/35   error=0.036868
epoch 9/35   error=0.033681
epoch 10/35   error=0.031078
epoch 11/35   error=0.028855
epoch 12/35   error=0.026857
epoch 13/35   error=0.025311
epoch 14/35   error=0.023879
epoch 15/35   error=0.022621
epoch 16/35   error=0.021457
epoch 17/35   error=0.020282
epoch 18/35   error=0.019103
epoch 19/35   error=0.018216
epoch 20/35   error=0.017382
epoch 21/35   error=0.016731
epoch 22/35   error=0.016009
epoch 23/35   error=0.015379
epoch 24/35   error=0.014705
epoch 25/35   error=0.014239
epoch 26/35   error=0.013771
epoch 27/35   error=0.013331
epoch 28/35   error=0.012926
epoch 29/35   error=0.012480
epoch 30/35   error=0.012342
epoch 31/35   error=0.011868
epoch 32/35   error=0.011377
epoch 33/35   error=0.010971
epoch 34/35   error=0.010543
epoch 35/35   error=0.0

In [23]:
y_pred_argmax = []
for i in range(len(y_pred)):
    y_pred_argmax.append(np.argmax(y_pred[i]))

In [24]:
y_test_argmax = []
for i in range(len(y_test)):
    y_test_argmax.append(np.argmax(y_test[i]))

In [25]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_pred_argmax,y_test_argmax)
report['standardization'] = accuracy
accuracy

0.6249

In [26]:
print(report)

{'w/o_any': 0.2148, 'normalization': 0.794, 'standardization': 0.6249}
