In [1]:
#show the figures buildin the notebook
%matplotlib inline 
import numpy as np
import matplotlib.pyplot as plt
import scipy.io #Used to load the OCTAVE *.mat files
import scipy.misc #Used to show matrix as an image
from scipy import optimize
import matplotlib.cm as cm #Used to display images in a specific colormap
import random
import sys
sys.path.append ('../../../../src/nn') 
from costs import NN_CostFunction

In [2]:
fileName = 'data/ex4data1.mat'
mat = scipy.io.loadmat(fileName)

X = mat['X']
Y = mat['y']
X = np.insert(X,0,1,axis=1)
Y = Y - 1#Y[Y==10]=0   # convert 10 to 0
print ('"X" shape:%s.X[0] shape:%s' %(X.shape,X[0].shape))
print ('"y" shape:%s. Unique elements in y:%s' %(Y.shape,np.unique(Y)))

"X" shape:(5000, 401).X[0] shape:(401,)
"y" shape:(5000, 1). Unique elements in y:[0 1 2 3 4 5 6 7 8 9]


In [3]:
 def sigmoid(arr, theta):
    """
    function that sigmoid both the input samples and the parameters
    """
    z = np.dot(arr, theta)
    return 1.0 / (1 + np.exp(-z))

def randInitializeWeights(input_layer_size, hidden_layer_size):
    """
    episilon = np.sqrt(6)/np.sqrt(Lin + Lout)
    Lin = the number of input layer unit
    Lout = the number of the adjacent layer unit
    """
    episilon = 0.12
    return np.random.rand(input_layer_size,hidden_layer_size+1) * 2.0 * episilon - episilon


def sigmoidGradient(arr, theta):
    sig = sigmoid(arr, theta)
    return sig * ( 1 - sig)

def reshapeParams(nn_params, input_layer_size=400, hidden_layer_size=25, num_labels=10):
    """
    function is used to reshape the input parameter:theta with type:list as 2 arrays, return it
    """
    print ("the type of nn_params in reshapeParams is:%s" % type(nn_params))
    theta1 = np.array(nn_params[:(input_layer_size+1) * hidden_layer_size]).reshape((hidden_layer_size,input_layer_size + 1))
    theta2 = np.array(nn_params[-num_labels * (hidden_layer_size+1):]).reshape((num_labels, hidden_layer_size+1))
    return (theta1, theta2)

def formatY(Y,num_labels=10):
    result = np.zeros((Y.shape[0],num_labels))
    for idx in range(Y.shape[0]):
        result[idx,Y[idx,0]] = 1
    return result

def nnForward (nn_params,  X):
    theta1, theta2 = reshapeParams(nn_params, input_layer_size, hidden_layer_size, num_labels)

    a1 = sigmoid(X, theta1.T) # m * hidden_layer_size
    a1 = np.insert(a1,0, 1, axis=1) # m * (hidden_layer_size + 1)
    #print a1[:10]
    #print "a1's shape:(%d,%d)" % a1.shape

    a2 = sigmoid(a1, theta2.T) # m * num_labels
    return a2

def nnCostFunction(nn_params,  X, Y, lamda=0.0,input_layer_size=400, hidden_layer_size=25,
                   num_labels=10):
    """
    function to calculate the loss error of the samples
    """
    print ("the type of nn_params in nnCostFunction is:%s" % type(nn_params))

    theta1, theta2 = reshapeParams(nn_params, input_layer_size, hidden_layer_size, num_labels)

    a1 = sigmoid(X, theta1.T) # m * hidden_layer_size
    a1 = np.insert(a1,0, 1, axis=1) # m * (hidden_layer_size + 1)
    #print a1[:10]
    #print "a1's shape:(%d,%d)" % a1.shape

    a2 = sigmoid(a1, theta2.T) # m * num_labels
    #print (a2[:10])
    #print "a2's shape:(%d,%d)" % a2.shape

    # format Y from m * 1 to a m*num_labels array
    fY = formatY(Y,num_labels)
    #print "Y's shape:(%d,%d)" % fY.shape
    #print (fY)
    J = -(np.sum(np.log(a2[fY==1])) + np.sum(np.log(1.0 - a2[fY==0])))
    m = len(X)

    J = J/m + lamda * (np.sum(theta1**2) + np.sum(theta2**2)) /(2*m)
    print ("cost value:%f" % J)
    return J

paramFile = 'data/ex4weights.mat'
params = scipy.io.loadmat(paramFile)
Theta1 = params['Theta1']
Theta2 = params['Theta2']

input_layer_size=400 # NO of features of samples
hidden_layer_size=25 # NO of Hidden Units 
num_labels = 10 # NO of Output Units

theta = np.append(Theta1.flatten(),Theta2.flatten())

test_prediction = nnForward (theta,X)
print ( nnCostFunction(theta,X,Y,0.0))

print (nnCostFunction(theta,X,Y,1.0))

the type of nn_params in reshapeParams is:<class 'numpy.ndarray'>
the type of nn_params in nnCostFunction is:<class 'numpy.ndarray'>
the type of nn_params in reshapeParams is:<class 'numpy.ndarray'>
cost value:0.287629
0.2876291651613189
the type of nn_params in nnCostFunction is:<class 'numpy.ndarray'>
the type of nn_params in reshapeParams is:<class 'numpy.ndarray'>
cost value:0.384488
0.384487796242894


In [4]:
from functools import reduce
Thetas = [Theta1,Theta2]
def sigmoidACV (Z):
    return 1 / (1 + np.exp (-Z))

def forward_prop (X,Theta):
    ai = X 
    cache = {'a1':X}
    n_layers = len (Theta) + 1
    for i in range(2,n_layers + 1):
        zi = np.dot (ai , Theta [i-2].T )
        ai = sigmoidACV (zi)
        cache ['z'+str(i)] = zi
        if i != n_layers :
            ai = np.insert (ai,0,1, axis=1)
        cache ['a'+str(i)] = ai

    return ai , cache

def NNcostFunction ( A_last, Y , Thetas, l2_lambda=0.0 ):
    cost = NN_CostFunction (A_last, Y )
    if l2_lambda != 0.0:
        # Calculamos regularization
        m = Y.shape [0]  # Número de muestras
        l2_cost = (l2_lambda / ( 2 * m ) ) * reduce (lambda ws, w: ws + np.sum(np.square(w)),Thetas,0)
        cost += l2_cost
        print ('costFunction, regularization value {} , with lambda {}'.format(l2_cost,l2_lambda))
    return cost

#ai_t = nnForward (theta,X)
ai,cache = forward_prop (X,Thetas)
fY = formatY(Y,num_labels)
cost = NNcostFunction ( ai , fY, Thetas,l2_lambda = 1.0)
print (cost)
print (cache['a1'].shape)
print (cache['z2'].shape)
print (cache['a2'].shape)
print (cache['a3'].shape)
print (cache['z3'].shape)

costFunction, regularization value 0.09685863108157507 , with lambda 1.0
0.384487796242894
(5000, 401)
(5000, 25)
(5000, 26)
(5000, 10)
(5000, 10)


In [5]:
def backward_prop (X,Y,Thetas,l2_lambda=0.0):
    n_layers = len (Thetas) + 1
    m = X.shape [0]
    prediction,cache = forward_prop (X,Thetas)
    prediction = np.array (prediction)
    print ('Prediction shape',prediction.shape)
    print ('Y shape' , Y.shape)
    delta = {}
    deltai = prediction - Y
    delta ['delta'+str(n_layers) ] = deltai 
    for i in reversed (range(2,n_layers)):
        print ('Calculating delta layer ', i)
        ai = cache['a'+str(i)]
        ai = np.delete (ai,0,axis=1)  # Elimino los 1's que no se tienen en cuenta en el cálculo de deltas
        gprima_z = ai * ( 1 - ai )
        print ('gprima_z shape',gprima_z.shape)
        print ('deltai shape',deltai.shape)

        step1 = np.dot(deltai,Thetas[i-1][:,1:])
        deltaiprev = step1 * gprima_z
        delta ['delta'+str(i) ] = deltaiprev 
        print ('deltaiPREV shape',deltaiprev.shape)
        deltai = deltaiprev
        
    grads = {}
    for i in reversed(range (1, n_layers ) ):
        print ('Calculating grad2 ' , i)
        grad = np.dot (delta['delta'+str(i+1)].T , cache['a'+str(i)] ) / m
        grad [:,1:] = grad [:,1:] + (l2_lambda * Thetas[i-1][:,1:] / m )
        grads ['grad'+str(i)] = grad
        print ('Grad {} shapes delta{} {} a{} {} , grad shape {} ' .format( i,(i+1),delta['delta'+str(i+1)].shape,i, cache['a'+str(i)].shape,grad.shape))
        
    return delta , grads


In [6]:
def backpropagation(nn_params,  X, Y, lamda=0.0,input_layer_size=400, hidden_layer_size=25,
                       num_labels=10):
    theta1, theta2 = reshapeParams(nn_params, input_layer_size, hidden_layer_size, num_labels)
    a2 = sigmoid(X, theta1.T) # m * hidden_layer_size
    a2 = np.insert(a2,0, 1, axis=1) # m * (hidden_layer_size + 1)
    a3 = sigmoid(a2, theta2.T) # m * num_labels

    # format Y from m * 1 to a m*num_labels array
    fY = formatY(Y,num_labels)

    delta3 = a3 - fY   # m * num_labels
    delta2 = np.dot(delta3, theta2[:,1:]) * sigmoidGradient(X, theta1.T)   # m * (hidden_layer_size)
    siggrad = sigmoidGradient(X, theta1.T) 
    print ('Sigmoid gradient - shapes X {} theta {} sigrad {}'.format(X.shape,theta1.shape,siggrad.shape))
    
    grad2 = np.dot(delta3.T, a2) / X.shape[0] # num_labels * (hidden_layer_size+1)
    print ('Grad2 shapes delta3 {} a2 {}, grad2 {} ' .format( delta3.shape,a2.shape,grad2.shape))
    grad2[:,1:] = grad2[:,1:] + (lamda * theta2[:,1:]/X.shape[0]) 
    
    grad1 = np.dot(delta2.T, X) / X.shape[0] # (hidden_layer_size) * (input_layer_size+1)
    grad1[:,1:] = grad1[:,1:] + (lamda * theta1[:,1:]/X.shape[0])

    return np.append(grad1.flatten(),grad2.flatten()) , {'delta3':delta3, 'delta2':delta2 ,'grad2':grad2,'grad1':grad1,'gradienteinicial':siggrad}

def computeNumericalGradient(mytheta, X, Y, mylambda=0.0,input_layer_size=400, hidden_layer_size=25,
                   num_labels=10):
    """
    mytheta is a flatten array
    """
    print (input_layer_size,hidden_layer_size,num_labels)
    print (mytheta.shape)
    ngrad = np.zeros((len(mytheta),1))
    episode = 0.0001
    for i in range(len(mytheta)):
        theta_plus = mytheta.copy()
        theta_plus[i]=theta_plus[i] + episode
        theta_minus = mytheta.copy()
        theta_minus[i] = theta_minus[i] - episode
        ngrad[i]=(nnCostFunction(theta_plus,  X, Y,mylambda,input_layer_size,hidden_layer_size,num_labels) - nnCostFunction(theta_minus, X, Y,mylambda,input_layer_size,hidden_layer_size,num_labels))/ (2 * episode)

    return ngrad

def checkNNGradient(mylambda=0.0):
    input_layer_size = 3;
    hidden_layer_size = 5;
    num_labels = 3;
    m = 5;
    theta1 = randInitializeWeights(hidden_layer_size,input_layer_size);
    theta2 = randInitializeWeights(num_labels,hidden_layer_size);
    X = randInitializeWeights(m, input_layer_size - 1)
    X = np.insert(X,0,1,axis=1)
    Y = (np.arange(m) % 3).reshape(m,1)

    ngrad = computeNumericalGradient(np.append(theta1.flatten(),theta2.flatten()),X,Y,mylambda,input_layer_size,hidden_layer_size,num_labels)

    print (ngrad.shape)

    grad = backpropagation(np.append(theta1.flatten(),theta2.flatten()),  X, Y, mylambda,input_layer_size, hidden_layer_size,num_labels)
    print (grad.shape)
    #print (ngrad.flatten(),grad.flatten())
    print ("%.15f" % (norm(ngrad.flatten() - grad) / norm(ngrad.flatten() + grad)))

def norm(arr):
    return np.sqrt(np.dot(arr,arr.T))

mylambda = 0.0
grad ,deltas= backpropagation(np.append(Theta1.flatten(),Theta2.flatten()),  X, Y, mylambda,input_layer_size, hidden_layer_size,num_labels)
mideltas , grads = backward_prop (X,fY,Thetas,l2_lambda=mylambda)
assert np.array_equal (deltas['delta3'],mideltas['delta3'])
assert np.array_equal (deltas['delta2'],mideltas['delta2'])
assert np.array_equal (deltas['grad2'],grads['grad2'])
assert np.array_equal (deltas['grad1'],grads['grad1'])

the type of nn_params in reshapeParams is:<class 'numpy.ndarray'>
Sigmoid gradient - shapes X (5000, 401) theta (25, 401) sigrad (5000, 25)
Grad2 shapes delta3 (5000, 10) a2 (5000, 26), grad2 (10, 26) 
Prediction shape (5000, 10)
Y shape (5000, 10)
Calculating delta layer  2
gprima_z shape (5000, 25)
deltai shape (5000, 10)
deltaiPREV shape (5000, 25)
Calculating grad2  2
Grad 2 shapes delta3 (5000, 10) a2 (5000, 26) , grad shape (10, 26) 
Calculating grad2  1
Grad 1 shapes delta2 (5000, 25) a1 (5000, 401) , grad shape (25, 401) 


In [7]:
import sys
sys.path.append ('../../../../src/nn') 
from layers import FullyConnected
from ActivationFunctions import Sigmoid
from NeuralNetwork import NeuralNetwork
from costs import NN_CostFunction , NN_Gradient

def formatY(Y,num_labels=10):
    result = np.zeros((Y.shape[0],num_labels))
    for idx in range(Y.shape[0]):
        result[idx,Y[idx,0]] = 1
    return result

paramFile = 'data/ex4weights.mat'
params = scipy.io.loadmat(paramFile)
Theta1 = params['Theta1']
Theta2 = params['Theta2']
print (Theta1.shape)
print (Theta2.shape)

(25, 401)
(10, 26)


In [8]:
inp_dimension = 400
num_labels=10
fY = formatY(Y,num_labels)
mX = np.delete (X,0,axis=1)

layers = [FullyConnected (25,Sigmoid(),'HiddenLayer1',debug=True,kargs={'W':Theta1[:,1:] , 'b':Theta1[:,0:1]}),
          FullyConnected (10,Sigmoid(),'OutputLayer',output_layer=True,debug=True,kargs={'W':Theta2[:,1:] , 'b':Theta2[:,0:1]})
         ]
nn = NeuralNetwork ( inp_dimension,
                    layers, 
                    NN_CostFunction ,
                    NN_Gradient ,
                    debug = True,
                    l2_lambda = mylambda
                    
                   )

prediction = nn.forward_prop (mX,training=True)
#print (prediction.shape)
#print (fY.shape)
Theta1Layer = np.insert (nn.layers[0].W,0,nn.layers[0].b,axis=1) 
Theta2Layer = np.insert (nn.layers[1].W,0,nn.layers[1].b,axis=1)
assert np.array_equal (Theta1,Theta1Layer)
assert np.array_equal (Theta2,Theta2Layer)

cost = nn.costFunction (prediction,fY)
print (cost)
gradsnn = nn.backward_prop(prediction,fY)

FC - HiddenLayer1 - init , initialization with input dimension 400 
FC - {} - init , initialization forzada 
FC - OutputLayer - init , initialization with input dimension 25 
FC - {} - init , initialization forzada 
FC - HiddenLayer1 - forward , starting forward prop
FC - HiddenLayer1 - forward , Shapes: A_prev (5000, 400), Z (5000, 25) , A (5000, 25) 
FC - OutputLayer - forward , starting forward prop
FC - OutputLayer - forward , Shapes: A_prev (5000, 25), Z (5000, 10) , A (5000, 10) 
0.2876291651613189
FC - OutputLayer - backward , starting backward prop
FC - HiddenLayer1 - backward , starting backward prop


In [24]:
#### grad2 = np.insert (gradsnn['grad2']['W'],0,gradsnn['grad2']['b'],axis=1)
grad1 = np.insert (gradsnn['grad1']['W'],0,gradsnn['grad1']['b'],axis=1)
r = deltas['grad2'] - grad2
np.testing.assert_allclose (deltas['grad2'],grad2,rtol=1e-10, atol=0)
np.testing.assert_allclose (deltas['grad1'],grad1,rtol=1e-10, atol=0)

In [28]:
epsilon = 1e-20
def SoftmaxCrossEntropyGrad (a_last,y):
    return -np.divide(y,np.clip(a_last,epsilon,1.0))
def SigmoidCrossEntropyGrad (a_last,y):
    return -np.divide(y,a_last) - np.divide(1-y,1-a_last)

ai,cache = forward_prop (X,Thetas)
fY = formatY(Y,num_labels)

dA = SoftmaxCrossEntropyGrad (ai,fY)
a = cache['a3']
y = dA * (-a)
print (y.shape)
dz = a - y

(5000, 10)


In [29]:
print (y)

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [5]:
import sys
sys.path.append ('../../../../src/nn') 
from layers import FullyConnected
from ActivationFunctions import Sigmoid
from NeuralNetwork import NeuralNetwork
from costs import NN_CostFunction , NN_Gradient


In [26]:
n_layers=3
for i in range(2,n_layers+1):
    print (i)

2
3
