In [1]:
import numpy as np
from h5py import File
import scipy.io #Used to load the OCTAVE *.mat files
import numpy as np
import sys
sys.path.append ('../src')
from NeuralNetwork import NNClassifier
from ML_utils import softmax,sigmoid,UTIL_formatY,backward_prop,backpropagation
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix,classification_report




In [2]:
# Load the MNIST arrays from the HDF5 file. The context manager closes the
# handle even if one of the dataset reads raises.
with File("data/MNISTdata.hdf5", 'r') as MNIST_data:
    x_train = np.float32(MNIST_data['x_train'][:])
    # labels are read from column 0 and kept as (n, 1) column vectors
    y_train = np.int32(np.array(MNIST_data['y_train'][:, 0])).reshape(-1, 1)
    x_test  = np.float32(MNIST_data['x_test'][:])
    y_test  = np.int32(np.array(MNIST_data['y_test'][:, 0])).reshape(-1, 1)

# stack together for next step
X = np.vstack((x_train, x_test))
print (X.shape)
y = np.vstack((y_train, y_test))
print (y.shape)

# one-hot encoding
digits = 10
examples = y.shape[0]
y = y.reshape(1, examples)
# indexing the identity matrix with the labels yields (1, examples, digits);
# the transpose + reshape turns that into one column per example
Y_new = np.eye(digits)[y.astype('int32')]
Y_new = Y_new.T.reshape(digits, examples)


# number of training set
# taken from the loaded array so the split always falls exactly between the
# stacked train and test rows instead of relying on a hard-coded 60000
m = x_train.shape[0]
m_test = X.shape[0] - m
# transpose so every column is one example
X_train, X_test = X[:m].T, X[m:].T
Y_train, Y_test = Y_new[:, :m], Y_new[:, m:]


# shuffle training set
# NOTE(review): no random seed is set, so the order differs between runs
shuffle_index = np.random.permutation(m)
X_train, Y_train = X_train[:, shuffle_index], Y_train[:, shuffle_index]

(70000, 784)
(70000, 1)


In [3]:
# Hyper-parameters of the hand-written reference network:
# hidden units, input size, epochs, mini-batch size, momentum factor, learning rate.
opt = {'n_h':64,'n_x':784,'epochs':10, 'batch_size':64 , 'beta':0.9,'lr':0.5}

# initialization
# Weights are standard-normal draws scaled by sqrt(1 / fan_in).
# Biases start at zero; scaling a zero array is a no-op, so none is applied.
params = {"W1": np.random.randn(opt['n_h'], opt['n_x']) * np.sqrt(1. / opt['n_x']),
          "b1": np.zeros((opt['n_h'], 1)),
          "W2": np.random.randn(digits, opt['n_h']) * np.sqrt(1. / opt['n_h']),
          "b2": np.zeros((digits, 1))}
print (params['W1'].shape)
print (params['b1'].shape)
print (params['W2'].shape)
print (params['b2'].shape)

(64, 784)
(64, 1)
(10, 64)
(10, 1)


In [5]:
# Pack every layer as one matrix with the bias in column 0 followed by the
# weight columns.
layer1_parts = (params['b1'], params['W1'])
T1 = np.concatenate(layer1_parts, axis=1).copy()
print (T1.shape)
layer2_parts = (params['b2'], params['W2'])
T2 = np.concatenate(layer2_parts, axis=1).copy()
print (T2.shape)


(64, 785)
(10, 65)


In [6]:

# NN layout: input layer, hidden layers, output layer
nn_config = {
    'n_a1': 784,
    'n_a2': 64,
    'n_a3': 10,
}
sgd_dict = {
    'steps': 10,
    'learning_rate': 0.5,
    'mini_batch_size': 2**8,
}
opt_dict = {
    'maxiter': 100,
    'algorithm': 'TNC',
}
activ = {
    'activation_a2': sigmoid,
    'activation_a3': softmax,
}
method = 'miniBatchGD'
nn = NNClassifier(
    optimization=method,
    bias=True,
    nn_config=nn_config,
    activ=activ,
    debug=False,
    kargs=sgd_dict,
)
for theta_name in ('Theta1', 'Theta2'):
    print (nn.thetas[theta_name].shape)
# Replace the classifier's own thetas with the packed reference parameters
# (presumably so both implementations start from identical weights — confirm).
nn.thetas['Theta1'] = T1
nn.thetas['Theta2'] = T2

(64, 785)
(10, 65)


In [7]:
def sigmoid(z):
    """
    Numerically stable sigmoid activation function, applied element-wise.

    inputs: z -- scalar or array-like of pre-activations
    outputs: sigmoid(z) = 1 / (1 + exp(-z)), with the same shape as z

    Only exp(-|z|) is ever evaluated, so inputs of large magnitude saturate
    to 0 or 1 without raising an overflow warning.

    NOTE(review): this definition replaces the `sigmoid` imported from
    ML_utils for every cell executed after this one.
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in [0, 1], whatever the sign or size of z
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function
    s = np.where(z >= 0, 1. / (1. + decay), decay / (1. + decay))
    # [()] unwraps a 0-d result into a NumPy scalar; n-d arrays pass through
    return s[()]
def compute_loss(Y, Y_hat):
    """
    compute loss function: mean cross-entropy over the columns of Y

    inputs:
        Y: array of target weights, one column per example
        Y_hat: array of predicted probabilities, same shape as Y

    return:
        L: -(1/m) * sum(Y * log(Y_hat)), where m = Y.shape[1]

    Entries where Y is exactly 0 contribute 0 to the sum even when the
    matching Y_hat is 0, so a saturated prediction on a non-target class no
    longer turns the loss into NaN (0 * log(0)).
    """
    # log(1) = 0, so positions with no target weight drop out of the sum
    safe_probs = np.where(Y != 0, Y_hat, 1.)
    L_sum = np.sum(np.multiply(Y, np.log(safe_probs)))
    m = Y.shape[1]
    L = -(1./m) * L_sum

    return L
def feed_forward(X, params):
    """
    feed forward network: 2 - layer neural net (sigmoid hidden layer, softmax output)

    inputs:
        X: input matrix, multiplied on the left by params["W1"]
        params: dictionary containing the weights and biases
                ("W1", "b1", "W2", "b2")

    return:
        cache: dictionary containing the fully connected units and activations
               ("Z1", "A1", "Z2", "A2")
    """
    cache = {}

    # Z1 = W1.dot(x) + b1
    cache["Z1"] = np.matmul(params["W1"], X) + params["b1"]

    # A1 = sigmoid(Z1)
    cache["A1"] = sigmoid(cache["Z1"])

    # Z2 = W2.dot(A1) + b2
    cache["Z2"] = np.matmul(params["W2"], cache["A1"]) + params["b2"]

    # A2 = softmax(Z2), normalised down each column (axis 0).
    # Subtracting the column maximum leaves the softmax unchanged but keeps
    # every exponent <= 0, so large logits cannot overflow into inf / inf = NaN.
    shifted = cache["Z2"] - np.max(cache["Z2"], axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    cache["A2"] = exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

    return cache
def back_propagate(X, Y, params, cache, m_batch):
    """
    back propagation

    inputs:
        X: input matrix that was fed to the forward pass
        Y: target matrix, same shape as cache["A2"]
        params: dictionary containing the weights and biases (only "W2" is read)
        cache: dictionary containing the fully connected units and activations
               (only "A1" and "A2" are read)
        m_batch: number the summed gradients are divided by (the mini-batch size)

    return:
        grads: dictionary containing the gradients of the corresponding weights
               and biases ("dW1", "db1", "dW2", "db2")
    """
    # error at last layer
    dZ2 = cache["A2"] - Y

    # gradients at last layer (Py2 need 1. to transform to float)
    dW2 = (1. / m_batch) * np.matmul(dZ2, cache["A1"].T)
    db2 = (1. / m_batch) * np.sum(dZ2, axis=1, keepdims=True)

    # back propagate through first layer
    dA1 = np.matmul(params["W2"].T, dZ2)
    # sigmoid'(Z1) = A1 * (1 - A1); the forward pass already stored
    # A1 = sigmoid(Z1), so reuse it instead of evaluating the sigmoid twice more
    A1 = cache["A1"]
    dZ1 = dA1 * A1 * (1 - A1)

    # gradients at first layer (Py2 need 1. to transform to float)
    dW1 = (1. / m_batch) * np.matmul(dZ1, X.T)
    db1 = (1. / m_batch) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return grads

In [8]:


# NOTE(review): only the first `batches` mini-batches of every shuffled pass are
# used, so one "epoch" below trains on batches * opt['batch_size'] samples rather
# than on the whole training set -- confirm this is intentional.
batches=10
n_train = X_train.shape[1]
for i in range(opt['epochs']):

    # shuffle training set
    permutation = np.random.permutation(n_train)
    X_train_shuffled = X_train[:, permutation]
    Y_train_shuffled = Y_train[:, permutation]

    for j in range(batches):
        #np.testing.assert_allclose (params['W1'],nn.thetas['Theta1'][:,1:])
        #np.testing.assert_allclose (params['W2'],nn.thetas['Theta2'][:,1:])
        #np.testing.assert_allclose (params['b1'],nn.thetas['Theta1'][:,0:1])
        #np.testing.assert_allclose (params['b2'],nn.thetas['Theta2'][:,0:1])

        # get mini-batch
        begin = j * opt['batch_size']
        if begin >= n_train:
            # no samples left in this pass; avoids an empty batch and a
            # division by a non-positive m_batch
            break
        # slice ends are exclusive, so the upper bound is n_train itself;
        # clamping to n_train - 1 would always drop the last shuffled sample
        end = min(begin + opt['batch_size'], n_train)
        X = X_train_shuffled[:, begin:end]
        Y = Y_train_shuffled[:, begin:end]
        m_batch = end - begin

        # one-row-per-sample copies of the batch; this needs a transpose --
        # reshape(cols, rows) keeps the memory order and would scramble the
        # features across samples
        nnX_train = np.copy(X.T)
        nnY_train = np.copy(Y.T)

        # forward and backward
        cache = feed_forward(X, params)
        #prediction,nncache = nn.forward_prop (X.T)
        #np.testing.assert_allclose (prediction.T,cache['A2'])
        grads = back_propagate(X, Y, params, cache, m_batch)
        #delta,nngrads = nn.backward_prop (X.T,Y.T)
        #np.testing.assert_allclose (grads['dW1'],nngrads['grad1'][:,1:])
        #np.testing.assert_allclose (grads['dW2'],nngrads['grad2'][:,1:])
        #np.testing.assert_allclose (grads['db1'],nngrads['grad1'][:,0:1])
        #np.testing.assert_allclose (grads['db2'],nngrads['grad2'][:,0:1])

        # Plain gradient step. The former "momentum" blend reset its running
        # average to the current gradient on every iteration, so
        # beta * g + (1 - beta) * g was just g and opt['beta'] had no effect.
        # NOTE(review): real momentum needs dW1/db1/dW2/db2 initialised once,
        # before the epoch loop, and carried across iterations.
        dW1 = grads["dW1"]
        db1 = grads["db1"]
        dW2 = grads["dW2"]
        db2 = grads["db2"]

        # gradient descent
        params["W1"] = params["W1"] - opt['lr'] * dW1
        params["b1"] = params["b1"] - opt['lr'] * db1
        params["W2"] = params["W2"] - opt['lr'] * dW2
        params["b2"] = params["b2"] - opt['lr'] * db2
        #nn._updateThetas (nngrads,opt['lr'])
        #np.testing.assert_allclose (params['W1'],nn.thetas['Theta1'][:,1:])

    # forward pass on training set
    cache = feed_forward(X_train, params)
    #prediction,_ = nn.forward_prop (X_train.T)
    train_loss = compute_loss(Y_train, cache["A2"])
    #cost = nn.costFunction (X_train.T,Y_train.T)
    # forward pass on test set
    cache = feed_forward(X_test, params)
    test_loss = compute_loss(Y_test, cache["A2"])
    print("Epoch {}: training loss = {}, test loss = {}".format(
        i + 1, train_loss, test_loss))

Epoch 1: training loss = 1.970743261533071, test loss = 1.9629573743205424
Epoch 2: training loss = 1.5994230516340866, test loss = 1.5875651002471802
Epoch 3: training loss = 1.2855953514193994, test loss = 1.2678200435568923
Epoch 4: training loss = 1.0735109749297258, test loss = 1.0567940953373172
Epoch 5: training loss = 0.9041955996535975, test loss = 0.8853720079015203
Epoch 6: training loss = 0.8177456112023864, test loss = 0.7986463499783694
Epoch 7: training loss = 0.7207538807386803, test loss = 0.7001803659369845
Epoch 8: training loss = 0.6841058784790693, test loss = 0.6695612783391528
Epoch 9: training loss = 0.6166655116708453, test loss = 0.6008955548506377
Epoch 10: training loss = 0.5808964832872017, test loss = 0.5640612607930451


In [3]:
from sklearn.metrics import (
    multilabel_confusion_matrix,
    classification_report,
)

# Run the hand-written network on the test images and take, for every column
# of the softmax output, the row index with the highest probability.
result = feed_forward(X_test, params)
prediction = result['A2'].argmax(axis=0)
print (prediction[:5])

# Per-class report of the predictions against the integer test labels.
report_text = "Classification report for classifier :\n{}\n".format(
    classification_report(y_test, prediction)
)
print(report_text)

NameError: name 'feed_forward' is not defined

In [3]:
# NN layout: input layer, hidden layers, output layer
nn_config = {
    'n_a1': 784,
    'n_a2': 64,
    'n_a3': 10,
}
sgd_dict = {
    'steps': 10,
    'learning_rate': 0.5,
    'mini_batch_size': 2**8,
}
opt_dict = {
    'maxiter': 100,
    'algorithm': 'TNC',
}
activ = {
    'activation_a2': sigmoid,
    'activation_a3': softmax,
}
method = 'miniBatchGD'

nn = NNClassifier(
    optimization=method,
    bias=True,
    nn_config=nn_config,
    activ=activ,
    debug=False,
    kargs=sgd_dict,
)
# The classifier is fed one sample per row, hence the transposes.
costs = nn.optimize(X_train.T, Y_train.T, l2_lambda=0.0)

# Training accuracy: compare the arg-max class of each prediction row with
# the arg-max of the matching one-hot label row.
prediction, _ = nn.forward_prop(X_train.T)
result = np.argmax(prediction, axis=1).reshape(-1, 1)
y = Y_train.T.argmax(axis=1).reshape(-1, 1)

accuracy = (y == result).mean() * 100
print('Trainig set accuracy :', accuracy)

# Same comparison on the held-out test split.
test_predicted, _ = nn.forward_prop(X_test.T)
result = np.argmax(test_predicted, axis=1).reshape(-1, 1)
y = Y_test.T.argmax(axis=1).reshape(-1, 1)
accuracy = (y == result).mean() * 100
print('Test set accuracy :', accuracy)

# The report covers the test split: y and result hold the test values here.
print(f"Classification report for classifier {nn}:\n"
      f"{classification_report(y,result)}\n")

LR = 0.5:  30%|█████████████████████▉                                                   | 3/10 [00:09<00:21,  3.14s/it]


KeyboardInterrupt: 

In [None]:
# NN layout: input layer, hidden layers, output layer
nn_config = {
    'n_a1': 784,
    'n_a2': 128,
    'n_a3': 64,
    'n_a4': 10,
}
sgd_dict = {
    'steps': 10,
    'learning_rate': 0.5,
    'mini_batch_size': 2**8,
}
opt_dict = {
    'maxiter': 100,
    'algorithm': 'TNC',
}
activ = {
    'activation_a2': sigmoid,
    'activation_a3': sigmoid,
    'activation_a4': softmax,
}
method = 'miniBatchGD'

nn = NNClassifier(
    optimization=method,
    bias=True,
    nn_config=nn_config,
    activ=activ,
    debug=False,
    kargs=sgd_dict,
)
# The classifier is fed one sample per row, hence the transposes; this run
# passes l2_lambda=1.0 to the optimiser.
costs = nn.optimize(X_train.T, Y_train.T, l2_lambda=1.0)

# Training accuracy: compare the arg-max class of each prediction row with
# the arg-max of the matching one-hot label row.
prediction, _ = nn.forward_prop(X_train.T)
result = np.argmax(prediction, axis=1).reshape(-1, 1)
y = Y_train.T.argmax(axis=1).reshape(-1, 1)

accuracy = (y == result).mean() * 100
print('Trainig set accuracy :', accuracy)

# Same comparison on the held-out test split.
test_predicted, _ = nn.forward_prop(X_test.T)
result = np.argmax(test_predicted, axis=1).reshape(-1, 1)
y = Y_test.T.argmax(axis=1).reshape(-1, 1)
accuracy = (y == result).mean() * 100
print('Test set accuracy :', accuracy)

# The report covers the test split: y and result hold the test values here.
print(f"Classification report for classifier {nn}:\n"
      f"{classification_report(y,result)}\n")

LR = 0.5:   0%|                                                                                 | 0/10 [00:00<?, ?it/s]

0.009658463671320509
costFunction, regularization value 1.0267061398988189 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.009596299146942835
costFunction, regularization value 1.0226846921182635 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.009560872827665639
costFunction, regularization value 1.0186613030022602 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 6

0.008880885931296564
costFunction, regularization value 0.9429117902991915 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.008852552622163723
costFunction, regularization value 0.9393381898179508 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.008835722362357155
costFunction, regularization value 0.9358086572030425 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 6

Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.00841135015335978
costFunction, regularization value 0.874818845845756 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.008393828841336595
costFunction, regularization value 0.8715837765404749 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.008377409211295952
costFunction, regularization value 0.8684051198196806 , with lam

Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.00819011098826656
costFunction, regularization value 0.8116992430380652 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.008195671163465446
costFunction, regularization value 0.8089761768932882 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), 

Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.008607130308915783
costFunction, regularization value 0.7576401483759218 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.008633985913580455
costFunction, regularization value 0.7555340381458648 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.008686558622229147
costFunction, regularization value 0.7536257419856615 , with l

Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.010142676649456984
costFunction, regularization value 0.7229813470940043 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.01019729987146396
costFunction, regularization value 0.7216624851996185 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,th

0.012656658637743269
costFunction, regularization value 0.7067812894036308 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.012794833710341484
costFunction, regularization value 0.7060917584199166 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.01290572506461524
costFunction, regularization value 0.7056162277752993 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64

0.015530810978373825
costFunction, regularization value 0.6963433410764601 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.015606246312022481
costFunction, regularization value 0.6957717942113331 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.01576767990301385
costFunction, regularization value 0.6952370004043138 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64

0.01853242504308865
costFunction, regularization value 0.683856060665431 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.018563104410320583
costFunction, regularization value 0.6832426214926083 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.018655712437730863
costFunction, regularization value 0.6828027744836686 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64)

0.020685585020795524
costFunction, regularization value 0.6690857160295237 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.020737570361852132
costFunction, regularization value 0.6683283922414963 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02076624709314013
costFunction, regularization value 0.6679389351817908 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64

Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.022023232719093594
costFunction, regularization value 0.6543214310551355 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.022126533504286817
costFunction, regularization value 0.6536096004787698 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02214919724267268
costFunction, regularization value 0.65277

LR = 0.5:  10%|███████▎                                                                 | 1/10 [00:06<00:59,  6.59s/it]

 3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.022223553008707216
costFunction, regularization value 0.6491351090845321 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.022330423999344342
costFunction, regularization value 0.6484053432518079 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), g

0.023263583545946584
costFunction, regularization value 0.6283448858815182 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02334499913840126
costFunction, regularization value 0.627436200679857 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.023367725911167737
costFunction, regularization value 0.6265249909275268 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64)

Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02401868213246618
costFunction, regularization value 0.6132799670700911 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.023979809013401163
costFunction, regularization value 0.612425030845225 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,the

costFunction, regularization value 0.5963940841713954 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.024881789279142384
costFunction, regularization value 0.595737121964871 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02496807571782776
costFunction, regularization value 0.5952607365944622 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65)

Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.025340124187165997
costFunction, regularization value 0.5848651399655218 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02528130077396295
costFunction, regularization value 0.5840435881534469 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02542134348243271
costFunction, regularization value 0.5837771570682515 , with lam

Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.025759077598113227
costFunction, regularization value 0.5738137739348479 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.025867937306048097
costFunction, regularization value 0.573417237732377 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02566609878042826
costFunction, regularization value 0.573078

Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026159775630505352
costFunction, regularization value 0.5640855821053528 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02627774869768719
costFunction, regularization value 0.5637545651369902 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026134798528409723
costFunction, regularization value 0.5629940644970046 , with la

Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026127485941312194
costFunction, regularization value 0.5540355626209283 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026080950939644798
costFunction, regularization value 0.5535830984155622 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026061162177665166
costFunction, regularization value 0.5531

Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02631574454017792
costFunction, regularization value 0.546568339936989 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026211551775073883
costFunction, regularization value 0.5461625330553661 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad s

Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02643846916781565
costFunction, regularization value 0.536998183647917 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02641837879530454
costFunction, regularization value 0.5369575201633277 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad sh

0.026488588352881526
costFunction, regularization value 0.5308651448769752 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02641421418594034
costFunction, regularization value 0.5305611503523779 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026463621190071337
costFunction, regularization value 0.5302474842295073 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64

LR = 0.5:  20%|██████████████▌                                                          | 2/10 [00:13<00:53,  6.67s/it]


costFunction, regularization value 0.5285312233703086 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026504298353067304
costFunction, regularization value 0.5283426018224631 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026374262438695263
costFunction, regularization value 0.5277009655059528 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 

Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026527482725485118
costFunction, regularization value 0.5189736039269042 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026484070548922607
costFunction, regularization value 0.518834647463221 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,th

Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026369210965851146
costFunction, regularization value 0.5143559142886776 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02629025167701297
costFunction, regularization value 0.5142520338745818 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad 

Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02628754840976822
costFunction, regularization value 0.5084827775693799 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026293137998175738
costFunction, regularization value 0.5082881490181513 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02631384059701665
costFunction, regularization value 0.5081202291159292 , with lam

0.026333471135230192
costFunction, regularization value 0.5033909801738288 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026313821785898854
costFunction, regularization value 0.5030949000037619 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026377538962850867
costFunction, regularization value 0.5031463440933782 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 6

Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026330707096774517
costFunction, regularization value 0.4990786314463466 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026342148647567093
costFunction, regularization value 0.4987934945726785 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,t

0.02628447564925535
costFunction, regularization value 0.4947899864875517 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026311919327298632
costFunction, regularization value 0.4948678859911383 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.026236626671295213
costFunction, regularization value 0.494300595902926 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64)

0.025811842418157913
costFunction, regularization value 0.48951024518010466 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.025759080982324246
costFunction, regularization value 0.48926643250642976 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02574287275399715
costFunction, regularization value 0.4891217009410235 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 

Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.025501918173972807
costFunction, regularization value 0.4862517965764631 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.025508043348997894
costFunction, regularization value 0.4863360501347893 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.025479275834153558
costFunction, regularization value 0.4862304086754109 , with l

0.02539318711006988
costFunction, regularization value 0.48212992631983886 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.025379711498347685
costFunction, regularization value 0.482083996861997 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.025354811579752414
costFunction, regularization value 0.4817521311884745 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64

Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02517590958004833
costFunction, regularization value 0.47841022271599426 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.025199512255175086
costFunction, regularization value 0.47838491639334074 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,

LR = 0.5:  30%|█████████████████████▉                                                   | 3/10 [00:19<00:44,  6.41s/it]


Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02499240873049306
costFunction, regularization value 0.47649593684902847 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (256, 10) a3 (256, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (256, 64) a2 (256, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (256, 128) a1 (256, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.06636911648157535
costFunction, regularization value 1.2708165533586198 , with lambda 1.0
Calculating grad2  3
Grad 3 shapes delta4 (96, 10) a3 (96, 65) ,theta 3(10, 64), grad shape (10, 65) 
Calculating grad2  2
Grad 2 shapes delta3 (96, 64) a2 (96, 129) ,theta 2(64, 128), grad shape (64, 129) 
Calculating grad2  1
Grad 1 shapes delta2 (96, 128) a1 (96, 785) ,theta 1(128, 784), grad shape (128, 785) 
0.02471431382262368
costFunction, regularization value 0.47351273277