# Programming Exercise 4: Neural Networks Learning

In [1]:
# Importing the needed libraries
# import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from neuralNetwork import *

# Optimization module in scipy
from scipy import optimize

# We'll use loadmap to load the matlab dataset
from scipy.io import loadmat

# tells matplotlib to embed plots within the notebook
%matplotlib inline

In [2]:
input_layer_size  = 400  # 20x20 Input Images of Digits
hidden_layer_size = 25   # 25 hidden units
num_labels = 10          # 10 labels, from 0 to 9 - output layer

# trying to represent the NN using an array.
# nnDef.shape[0] = number of layers
# nnDef[i] = number of neurons on layer i
nnDef = np.array([input_layer_size, hidden_layer_size, num_labels])

In [3]:
# Loading all data on a dictonary
data = loadmat('ex4data1.mat')

# Convert the data into a numpy array
X = data['X']
y = data['y'].flatten()

# m = number of training examples
# n = number of features
(m,n) = X.shape

# note that X has mapped "0" to label 10 because Matlab arrays start on 1
# We'll normalize the 10 value back to 0, so it matches the 0 digit
y[y == 10] = 0

In [4]:
# We create a Theta array of arrays
Theta = np.zeros(nnDef.size, dtype=np.ndarray)

# Load the weights data to initialize Theta
thetaMat = loadmat('ex4weights.mat')
Theta[1] = thetaMat['Theta1']
Theta[2] = thetaMat['Theta2']

# swap first and last columns of Theta2, due to legacy from MATLAB indexing, 
# since the weight file ex3weights.mat was saved based on MATLAB indexing
# Explanation: 0 in MATLAB is represented by 10. 
# Therefore the theta for 10 in Matlab corresponds with 0 in Python
Theta[2] = np.roll(Theta[2], 1, axis=0)

In [5]:
# NNCOSTFUNCTION Implements the neural network cost function for the
# neural network which performs classification
#   J, grad = nnCostFunction(Theta, nnDef, X, y, lmbd) 
# computes the cost and gradient of the neural network
# We are passing the Theta values as a single vector
# because the minimize function only works with 1-D vectors.
# That means that the function must reconstruct Theta as a first step
def nnCostFunction(Theta1D, nnDef, X, y, lmbd):
    # Useful variables
    (m, n) = X.shape                   # m = number of training examples, n = number of features
    num_labels = nnDef[-1]             # Output Layer units
    num_layers = nnDef.size            # Number of layers including the Input Layer
    
    Theta = ThetaFrom1D(Theta1D, nnDef)
    
    # a array containing the activation arrays
    # (using numbering from 1 to be coerent with notation)
    a = np.zeros((nnDef.size+1,), dtype=np.ndarray)

    a[1] = X                # The activation for the Input layer is X
    
    # Extending the y vector into an array where 1 representents the label
    y10 = np.zeros((m,num_labels))
    y= y[:, np.newaxis]
    for i in range(num_labels):
        y10[:,i][:,np.newaxis] = np.where(y==i,1,0) 
    
    # Forward Propagation
    for i in range(1, num_layers):
        # Add the bias unit to the a layer
        mLayer = a[i].shape[0]
        a[i] = np.append(np.ones((mLayer, 1)), a[i], axis=1)
        a[i+1] = sigmoid(np.dot(a[i], Theta[i].T))
        
    # Cost Function
    J = (-1/m)*np.sum((np.multiply(np.log(a[num_layers]), y10) + np.multiply((1-y10), np.log(1-a[num_layers]))))
    # Cost adding regularization
    for i in range(1, num_layers):
        J = J + (lmbd/(2*m))*(np.sum((np.power(Theta[i][:, 1:], 2))))
        
    # Getting the gradient
    grad = np.zeros((Theta.shape), dtype=np.ndarray)
    delta = np.zeros((nnDef.size+1,), dtype=np.ndarray)
    
    delta[num_layers] = (a[num_layers] - y10)
    for i in reversed(range(2, num_layers)):
        delta[i] = (np.dot(delta[i+1], Theta[i]))*(a[i]*(1-a[i]))
        delta[i] = delta[i][:, 1:]
    
    # Regularization part of the gradient
    for i in reversed(range(1, num_layers)):
        grad[i] = ((1/m)*np.dot(delta[i+1].T, a[i])) + ((lmbd/m)*np.hstack((np.zeros((Theta[i].shape[0],1)),Theta[i][:,1:])))
    
    grad1D = ThetaTo1D(grad, nnDef)
    
    return J, grad1D

In [6]:
# Function that converts the Theta array of arrays
# in a 1D vector. This is required to be sent to 
# the minimize sCiPy function
def ThetaTo1D(Theta, nnDef):
    # Useful variables
    num_labels = nnDef[-1]             # Output Layer units
    num_layers = nnDef.size            # Number of layers including the Input Layer
    
    Theta1D = Theta[1].reshape(-1)
    for i in range(2, num_layers):
        Theta1D = np.concatenate((Theta1D, Theta[i].reshape(-1)), axis=None)
    
    return Theta1D

# Function that converts back Theta from a 1D vector
# into an array of arrays
def ThetaFrom1D(Theta1D, nnDef):
    # Useful variables
    num_labels = nnDef[-1]             # Output Layer units
    num_layers = nnDef.size            # Number of layers including the Input Layer
    
    Theta = np.zeros(nnDef.size, dtype=np.ndarray)
    start=0
    for i in range(1, num_layers):
        Theta[i]= Theta1D[start:(start + (nnDef[i] * (nnDef[i-1]+1)))].reshape(nnDef[i], nnDef[i-1]+1)
        start = start + (nnDef[i] * (nnDef[i-1]+1)+1) - 1 
    
    return Theta

## Compute Cost Function (Feedforward)

In [7]:
Theta1D = ThetaTo1D(Theta, nnDef)

In [8]:
J, grad = nnCostFunction(Theta1D, nnDef, X, y, 0);

print('Cost at parameters (loaded from ex4weights): {:.6f} \n(this value should be about 0.287629)\n'.format(J))

Cost at parameters (loaded from ex4weights): 0.287629 
(this value should be about 0.287629)



## Implement Regularization in the Cost Function

In [9]:
lmbd = 1

J, grad = nnCostFunction(Theta1D, nnDef, X, y, lmbd)

print('Cost at parameters (loaded from ex4weights): {:.6f} \n(this value should be about 0.383770)\n'.format(J))

Cost at parameters (loaded from ex4weights): 0.383770 
(this value should be about 0.383770)



## Checking NN gradients

In [10]:
lmbd = 3
J, grad = nnCostFunction(Theta1D, nnDef, X, y, lmbd)

print('Cost at (fixed) debugging parameters (w/ lambda = 3) {:.6f}\n(for lambda = 3, this value should be about 0.576051)\n'. format(J))

Cost at (fixed) debugging parameters (w/ lambda = 3) 0.576051
(for lambda = 3, this value should be about 0.576051)



## Train the NN

In [37]:
Theta= initializeTheta(nnDef)
Theta1D = ThetaTo1D(Theta, nnDef)

In [38]:
lmdb = 1

# set options for optimize.minimize
options= {'maxiter': 500}

# The function returns an object `OptimizeResult`
# We use truncated Newton algorithm for optimization which is 
# equivalent to MATLAB's fminunc
# See https://stackoverflow.com/questions/18801002/fminunc-alternate-in-numpy
res = optimize.minimize(nnCostFunction,
                        Theta1D,
                        (nnDef, X, y.flatten(), lmbd),
                        jac=True,
                        method='TNC',
                        options=options)

# the fun property of `OptimizeResult` object returns
# the value of costFunction at optimized theta
cost = res.fun

# the optimized theta is in the x property
theta = res.x

# Print theta to screen
print('Cost at theta found by optimize.minimize: {:.3f}'.format(cost))

Cost at theta found by optimize.minimize: 0.563


In [39]:
Theta = ThetaFrom1D(theta, nnDef)

In [40]:
def feedForwardPropagation(Theta, X, nnDef):
    # Useful variables
    (m, n) = X.shape                   # m = number of training examples, n = number of features
    num_labels = nnDef[-1]             # Output Layer units
    num_layers = nnDef.size            # Number of layers including the Input Layer
    p = np.zeros((1, num_labels))       # Classification vector
    Prob = np.zeros((m, num_labels))    # Matrix contaning all the probabilities per label
    # layerActiv= Array containing the activation arrays
    layerActiv = np.zeros((nnDef.size,), dtype=np.ndarray)

    layerActiv[0] = X             # The activation for the Input layer is X

    for i in range(num_layers-1):
        # Add the bias unit to the activation
        mLayer = layerActiv[i].shape[0]
        layerActiv[i] = np.append(np.ones((mLayer, 1)), layerActiv[i], axis=1)
        layerActiv[i+1] = sigmoid(np.dot(layerActiv[i], Theta[i+1].T))

    Prob = layerActiv[-1]
    p = np.argmax(layerActiv[-1], axis=1)

    return Prob, p

In [41]:
Prob, pred = feedForwardPropagation(Theta, X, nnDef)

In [42]:
print('\nTraining Set Accuracy: {:.2f} %'.format(np.mean(pred == y) * 100))


Training Set Accuracy: 97.56 %
