# Optimization

This section includes code for optimizing the neural network, including various optimizers. Functions include:

- gradient: Estimates the gradient of a function at a certain point using the central difference method, which is the most accurate one that doesn't get into complications of imaginary numbers
- hessian: Estimates the Hessian using the same technique, populating the Hessian column by column using the gradient
- derivative, initial_bracker, bisection, minimize, line_search: All used to build up the implentation of line search, which is a technique for finding the optimal step size along a certain direction (usually the gradient). While this is probably a bad idea for DL, I wanted to see just how bad of an idea it is going to be.
- gradient_descent, conjugate_descent, adam, newton's method: An assortment of optimization methods to try with deep learning

Here are some hypotheses for how optimization algorithms will behave. I am writing those before seeing how the network will perform, and I want to see if they turn out to be correct:
- adam will perform the best, although gradient descent and conjugate gradient might not be much worse.
- Newton's method will fail because it relies on having a good quadratic estimation of the function. The problem is that when you use mini-batches, the loss function evalutions become noisy, and both the quadratic estimation and the hessian can be quite wrong, so it might even fail to converge.
- Line search will fail for a similar reason, the noisy loss function evalutation. However, I don't think it will be much worse because going slighly too far along the gradient should not be very bad.

Note: I used the optimization code that I wrote for CS164, but I had to heavily modify the interface, so I did some significant work for this section.

In [205]:
import numpy as np
import matplotlib.pyplot as plt


# We'll start by defining a bunch of functions that will form building blocks for our algorithms

def gradient(f, x, h=1e-5):
    """
    Estimates the value of the gradient of a function at a point x using the central difference method
    """
    grad = []
    forward = np.copy(x)
    forward = forward + h/2
    backward = np.copy(x)
    backward = backward + h/2
    grad = f(forward) - f(backward)/h
    return grad

def derivative(f, x, h=1e-5):
    """
    Simplified version of the gradient function for univariate functions
    """
    return np.array((f(x + h/2) - f(x - h/2))/h)

def hessian(f, x, h=1e-5):
    """
    Estimates the value of the hessian of a function at a point x using the central difference method
    """
    n = len(x)
    hess = np.zeros((n,n))
    for i in range(n):
        forward = np.copy(x)
        forward[i] += h/2
        backward = np.copy(x)
        backward[i] -= h/2
        hess[:,i] = (gradient(f,forward) - gradient(f,backward))/h
    return hess

def initial_bracket(f,x=0,s=1e-2,k=2):
    """
    Finds an initial bracket for function f, using x as the starting point, 
    s as the initial step size and k as the step size expansion factor
    """
    a, ya = x, f(x)
    b, yb = a + s, f(a + s)
    if yb > ya:
        a,b = b,a
        ya, yb = yb, ya
        s = -s
    while True:
        c, yc = b + s, f(b + s)
        if yc > yb:
            return (a,c) if a<c else (c,a)
        a, ya, b, yb = b, yb, c, yc
        s = s*k
        
def bisection(f,a,b,tol=1e-4):
    """ Decreases a bracket [a,b] of the function f by bisection until the width is smaller than the tolerance"""
    if a > b:
        a,b = b,a
    ya, yb = derivative(f,a), derivative(f,b)
    if ya == 0:
        b = a
    if yb == 0:
        a = b
    
    while b - a > tol:
        x = (a + b)/2
        y = derivative(f,x)
        if x == 0:
            a,b = x,x
        elif y*ya > 0: # We can replicate matching the sign by multipling the numbers and checking the resulting sign
            a = x
        else:
            b = x
    return (a,b)

def minimize(f,a,b, bracketing_method=bisection):
    """Minimizes a function within an [a,b] interval, by using a bracketing method  to decrease the bracket, 
    then taking the middle of the interval. Basically just a thin wrapper over the bracketing method."""
    a,b = bracketing_method(f,a,b)
    return (a + b)/2

def line_search(f,x,d):
    """"""
    def obj(step):
        return f(x + d*step)
    a,b = initial_bracket(obj)
    step = minimize(obj, a,b)
    return x + step*d

class GradientDescent():
    def __init__(self, input_dim, output_dim):
        pass
    def update(self, x, g, lr):
        """
        x - vector of weights and bias
        f - weights and bias function
        g - gradient of that function"""
        return x - lr*g
        

def conjugate_descent(f, x, lr=0.01):
    """
    f - Function
    x - initial location for search
    lr - learning rate, a.k.a. step size
    """
    if verbose:
        print(new_x)
    x = new_x
    i += 1
    return points

class nesterov():
    def __init(self, decay=0.4):
        self.decay = decay

def nesterov_momentum(f, x, lr=1e-4, decay=0.4, verbose=False):
    """
    f - Function
    x - initial location for search
    lr - learning rate, a.k.a. step size
    tol - tolerance of stopping condition, minimum difference required to proceed.
    max_iter - Maximum iterations before stopping
    decay - The rate at which momentum decays
    """
    g = 0
    v = 0
    points = np.array([x]) # Keeps track of progress
    for i in range(max_iter):
        g = gradient(f,x + (1 - decay)*v)
        v = (1 - decay)*v - lr*g
        new_x = x + v
        if verbose:
            print(new_x)
        points = np.append(points, np.array([new_x]), axis=0)
        if f(new_x) > f(x) - tol:
        #if np.linalg.norm(new_x - x) < tol:
            break
        x = new_x
    return points

# Default parameters are taken from Keras
class adam():
    """
    Note: """
    def __init__(self, input_dim, output_dim, beta1=0.9,beta2=0.999, epsilon=1e-8):
        self.dim = (input_dim, output_dim)
        self.i = 0
        self.v = np.zeros(self.dim)
        self.s = np.zeros(self.dim)
        self.epsilon = epsilon
        self.beta1 = beta1
        self.beta2 = beta2
    def update(self, x, g, lr):
        """
        x - vector of weights and bias
        f - weights and bias function
        df - gradient of that function"""
        self.v = self.beta1*self.v + (1 - self.beta1)*g
        self.s = self.beta2*self.s + (1 - self.beta2)*np.dot(g,g.T)
        # Correct those quantities
        v_hat = self.v/(1 - beta1**i)
        s_hat = self.s/(1 - beta2**i)
        new_x = x - ((lr*v_hat)/(epsilon + np.sqrt(s_hat)))
        self.i += 1
        return new_x
        

def newtons_method(f, x, tol=1e-4):
    i=0
    points = np.array([x])
    while True:
        grad = gradient(f,x)
        hess = hessian(f,x)
        new_x = x - np.matmul(np.linalg.inv(hess),grad)
        points = np.append(points, np.array([new_x]), axis=0)
        if np.linalg.norm(new_x - x) < tol:
            break
        x = new_x
        i += 1
    print(points)
    return points

# The Dataset

I will be testing my code on a regression problem, namely the [California Housing dataset](https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset) from sklearn. In this dataset, we are trying to predict the median house cost in districts in California. This toy dataset will serve as a test of my implementation. I will also train benchmark linear and random forest models for comparison. I don't expect my network to do better than those (deep learning is surprisingly ineffective on tabular data, even advanced and tuned architectures usually perform no better than a GBM or a random forest.)

In [206]:
from tensorflow import keras
# Model / data parameters
num_classes = 10
input_shape = (28, 28, 1)

# Load the data and split it between train and test sets
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Scale images to the [0, 1] range
x_train = x_train.astype("float32") / 255
x_test = x_test.astype("float32") / 255
# Make sure images have shape (28, 28, 1)
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")


# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# Flatten
x_train = x_train.reshape(60000,-1)
x_test = x_test.reshape(10000,-1)

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


# The Neural Network
The neural network is a dense connected network with 2 hidden layers and a variable number of neurons in each of them. The weights are initialized with He initialization.

In [305]:
class Layer():
    def __init__(self, inputs, outputs, activation, activation_prime, Optimizer=GradientDescent):
        self.weights = np.random.normal(0,np.sqrt(2/inputs),size=(inputs,outputs)) 
        self.bias = np.random.normal(0,np.sqrt(2/inputs),size=(1,outputs))
        self.activation = activation
        self.activation_prime = activation_prime
        self.optimizer = Optimizer(inputs, outputs)
    def feedforward(self, x):
        self.input = x
        self.values = np.dot(x, self.weights) + self.bias
        self.out = self.activation(self.values)
        return self.out
    def backprop(self, grad, lr):
        # Activation gradient
        print("Gradient:", grad)
        activation_grad = self.activation_prime(self.values)*grad
        print("Activation Gradient:", activation_grad)
        input_grad = np.dot(activation_grad,self.weights.T)
        print("Input Gradient:", input_grad)
        weights_grad = np.dot(self.input.T, activation_grad)
        print("Weights Gradient:", weights_grad)
        self.weights = self.optimizer.update(self.weights, weights_grad, lr)
        self.bias = self.optimizer.update(self.bias, np.mean(activation_grad, axis=0), lr)
        print("Weights:", self.weights)
        print("Biases:", self.bias)
        return input_grad

In [306]:
import numpy as np
def sigmoid(x):
    return 1/(1 + np.power(np.e,-x))

def relu(x):
    return np.maximum(np.zeros(x.shape),x)

def grad_relu(x):
    return x > 0

def mse(true, predicted):
    return np.mean((true - predicted)**2)

def grad_mse(true, predicted):
    return 2*(true - predicted)

def categorical_crossentropy_logit(true, predicted):
    """Both true and predicted are a matrix"""
    return np.mean(-true + np.log(np.sum(np.exp(predicted), axis=0)))

def categorical_crossentropy_logit_grad(true, predicted):
    """A wrapper that allows us to use the cross-entropy function, which requires two inputs
    but differentiate it only with respect to the predicted values"""
    return -true + softmax(predicted)

def categorical_crossentropy(true, predicted):
    """Both true and predicted are a matrix"""
    return np.mean(-np.sum(true*np.log(predicted),axis=0))

def categorical_crossentropy_grad(true, predicted):
    """A wrapper that allows us to use the cross-entropy function, which requires two inputs
    but differentiate it only with respect to the predicted values"""
    return -(true*np.log(predicted) + (1 - true)*np.log(1 - predicted))

def softmax(x):
    e = np.exp(x)
    return e / np.sum(e, axis=1).reshape(-1,1)

def unity(x):
    return x

def ones(x):
    return np.ones(x.shape)
class NeuralNetwork:
    def __init__(self, layers, batch_size=64, lr=0.01, loss = categorical_crossentropy_logit, 
                 loss_grad = categorical_crossentropy_logit_grad):
        # He initialization for hidden layer
        self.layers = layers
        self.batch_size = batch_size
        self.i = 0 # Index for batching
        self.lr = lr # Learning rate
        self.loss = loss
        self.epoch = 0
        self.loss_grad = loss_grad

    def feedforward(self, x):
        out = x
        for layer in self.layers:
            out = layer.feedforward(out)
        return out

    def backprop(self, y, output):
        print("Loss:", self.loss(y, output))
        grad = self.loss_grad(y, output)
        for layer in reversed(self.layers):
            grad = layer.backprop(grad, self.lr)
        
    def batch_descent(self, x, y):
        # Key assumption for this
        datapoints = x.shape[0]
        batch_x = x[self.i:min(self.i + self.batch_size, x.shape[0])]
        batch_y = y[self.i:min(self.i + self.batch_size, y.shape[0])]
        self.i = (self.i + self.batch_size)
        wrap = self.i // datapoints # Flag for whether the batching index exceeded the dataset bounds
        self.epoch += wrap
        self.i =  self.i % datapoints
        # Deal with what happens when we need to "wrap around" the dataset.
        if wrap:
            batch_x = np.concatenate([batch_x, x[0:self.i]], axis=0)
            batch_y = np.concatenate([batch_y, y[0:self.i]], axis=0)
        output = self.feedforward(batch_x)
        self.backprop(batch_y, output)
        
    def fit(self, x, y, epochs=10):
        while self.epoch < epochs:
            self.batch_descent(x,y)



In [307]:
layers = [Layer(x_train.shape[1], 20, relu, grad_relu), Layer(20, 20, relu, grad_relu), Layer(20, 10,unity, ones)]
nn = NeuralNetwork(layers, lr=1e-3)
nn.fit(x_train, y_train)

Loss: 4.434203953890561
Gradient: [[ 0.13082131  0.17824939  0.15912209  0.0755743   0.10504293 -0.95894346
   0.07026001  0.04164849  0.15194149  0.04628343]
 [-0.96457531  0.25593196  0.15649853  0.09364394  0.08786029  0.04218664
   0.08165293  0.09671096  0.08823961  0.06185044]
 [ 0.0708435   0.17788932  0.17652843  0.07306349 -0.88650388  0.0401008
   0.09223382  0.07942878  0.09252272  0.08389302]
 [ 0.08148371 -0.77300792  0.24066016  0.07539951  0.07015827  0.01920692
   0.06341473  0.08872122  0.06359702  0.07036639]
 [ 0.05734298  0.14641742  0.16925947  0.11189421  0.06878204  0.03684459
   0.05481824  0.12362292  0.10279385 -0.87177571]
 [ 0.09095421  0.16050247 -0.7973783   0.07743319  0.08421987  0.03620469
   0.06718291  0.09934773  0.09675086  0.08478236]
 [ 0.06032871 -0.78812896  0.19670057  0.08407545  0.0781269   0.03615707
   0.06901039  0.06179892  0.12839604  0.07353489]
 [ 0.13145169  0.11911874  0.14048412 -0.88447896  0.1216661   0.06830993
   0.04097613  0.0

Weights Gradient: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Weights: [[ 0.01591669  0.02695847  0.00769833 ...  0.04136653 -0.0021478
   0.10384388]
 [ 0.03033902 -0.04929204 -0.0026987  ...  0.01285809 -0.0673914
   0.01265783]
 [ 0.00077553  0.03799287  0.01360043 ...  0.04315526  0.02003682
   0.04036365]
 ...
 [-0.03674979 -0.11362903 -0.08500255 ... -0.0052761  -0.09798742
   0.0689229 ]
 [ 0.0335163   0.06596634  0.02689076 ... -0.03691825  0.02327649
  -0.05254951]
 [ 0.07605543 -0.05334682  0.05188054 ... -0.03599054  0.00375125
   0.00629528]]
Biases: [[ 0.02593209 -0.06079585  0.03372283  0.00304863 -0.0202456  -0.01194417
   0.0395542   0.09117606 -0.0448977  -0.08643314 -0.01797434 -0.04152607
   0.03109499  0.10424669 -0.11200654 -0.06286643 -0.00866286 -0.00905976
  -0.00520452  0.01354634]]
Loss: 4.5434551606981985
Gradient: [[ 0.09737469 -0.79007398  0.16372

Activation Gradient: [[ 0.12146306  0.1705439   0.12150804  0.06455816  0.10825491  0.03655999
  -0.84987227  0.06451946  0.10648288  0.05598187]
 [ 0.06863787  0.21602812  0.10005229  0.10580692 -0.89250845  0.02896499
   0.08662975  0.12569504  0.08610813  0.07458535]
 [ 0.09306132  0.19939491  0.13201802  0.09479039  0.07545251  0.03531593
   0.07436029 -0.88935497  0.09295476  0.09200686]
 [ 0.16623289  0.10860643  0.13293955  0.06155819  0.08323683 -0.9409646
   0.06929541  0.1350948   0.10723703  0.07676348]
 [-0.9141131   0.21929501  0.14501733  0.06556119  0.09802433  0.03302222
   0.12821866  0.08969229  0.0737978   0.06148428]
 [ 0.14495452  0.15697986  0.13237917  0.06559422  0.09168742  0.04270308
  -0.86571193  0.08909534  0.08787512  0.05444319]
 [ 0.11512944  0.13521631 -0.80600133  0.07888104  0.09336358  0.04628288
   0.07587535  0.09514519  0.09800759  0.06809996]
 [ 0.09246863  0.12025769  0.11315709  0.07999209  0.05628041  0.04291488
   0.05030862 -0.73500944  0.08

Activation Gradient: [[ 0.09632623  0.11482721  0.11632641  0.03654711  0.07423287  0.04759646
   0.0384888  -0.66829614  0.06172428  0.08222676]
 [ 0.07547903  0.05327151 -0.76649163  0.18496343  0.0717212   0.05667246
   0.03721069  0.07098149  0.14971544  0.06647637]
 [ 0.10628572  0.09226762  0.15512138  0.10701073  0.07241672  0.05463409
   0.05060993  0.14666445 -0.88637956  0.10136893]
 [ 0.06521579  0.10184052  0.15737899  0.07960337  0.04609916  0.03066733
   0.03967371 -0.68180814  0.07434101  0.08698827]
 [ 0.04612853  0.25639904  0.14528482  0.09159139  0.08165025  0.02830318
  -0.90882653  0.08799613  0.08539949  0.0860737 ]
 [ 0.04587381  0.13351834  0.08598943  0.12010262  0.13578086  0.03933219
   0.06122628  0.23854326  0.0617773  -0.92214409]
 [ 0.12544182  0.05731591 -0.6610694   0.0768998   0.03817697  0.03850258
   0.03305272  0.11770686  0.11491789  0.05905485]
 [ 0.17767593  0.05517733 -0.80632858  0.09740758  0.06968308  0.06200745
   0.05272209  0.08879946  0.1

Activation Gradient: [[ 0.08627139  0.12157473  0.08610029  0.15905274 -0.91671398  0.07269063
   0.04910254  0.11135747  0.10527872  0.12528546]
 [-0.61252929  0.02148572  0.10720889  0.10554091  0.04842056  0.13551407
   0.02913832  0.07307736  0.06969763  0.02244584]
 [ 0.14552835  0.11117166  0.10610365  0.08160239  0.11162925  0.04865785
  -0.86006316  0.06966253  0.12229005  0.06341742]
 [ 0.07442291  0.12064591  0.16598291  0.04213786  0.08718777 -0.92043833
   0.04116562  0.25699988  0.08047047  0.051425  ]
 [ 0.08023383  0.08328816  0.11687731  0.14432825  0.06007651  0.07821372
   0.05788211  0.09103398  0.13153513 -0.84346901]
 [ 0.07373152  0.08523303  0.11290975 -0.73886413  0.06724528  0.09847035
   0.01580875  0.06238912  0.16421454  0.05886179]
 [-0.68294528  0.0311648   0.09295462  0.07919357  0.05592398  0.08562273
   0.07691892  0.0924196   0.12914721  0.03959984]
 [ 0.06135994  0.18352641  0.12163741  0.1423005   0.07377312  0.05788876
   0.04713126  0.08036769  0.1

Gradient: [[ 0.3391616   0.13426875  0.35163834 ...  0.01628912  0.32508294
  -0.10160739]
 [-0.16843429 -0.3524     -0.27249155 ... -0.07343077 -0.1335127
  -0.25486   ]
 [-0.21074111 -0.03072157 -0.22635752 ...  0.22334837 -0.08168604
  -0.29020488]
 ...
 [ 0.13025814  0.34316848 -0.01834452 ... -0.07358321  0.20286133
   0.13668635]
 [-0.00640576 -0.40669041 -0.81222363 ...  0.14018921 -0.23402261
  -0.00238125]
 [ 0.25886862  0.19995526  0.50367421 ... -0.23690756 -0.19088053
  -0.1422378 ]]
Activation Gradient: [[ 0.3391616   0.          0.35163834 ...  0.          0.
  -0.10160739]
 [-0.16843429 -0.         -0.27249155 ... -0.07343077 -0.1335127
  -0.        ]
 [-0.21074111 -0.         -0.         ...  0.         -0.
  -0.        ]
 ...
 [ 0.13025814  0.         -0.01834452 ... -0.07358321  0.20286133
   0.        ]
 [-0.00640576 -0.         -0.         ...  0.14018921 -0.23402261
  -0.        ]
 [ 0.25886862  0.          0.50367421 ... -0.23690756 -0.19088053
  -0.        ]]
Inp

   0.03409443  0.06928663  0.1594924   0.03975138]]
Input Gradient: [[ 0.14173473 -0.83436996 -0.08465012 ... -0.04114796 -0.32019172
   0.49616762]
 [ 0.14287336 -0.77232097 -0.06375459 ... -0.0282968  -0.22388531
   0.46164177]
 [-0.17890077 -0.17084713  0.0528484  ...  0.244061    0.12427552
  -0.12091885]
 ...
 [-0.14557244  0.00745361  0.20496021 ...  0.1208673  -0.39186916
  -0.14985765]
 [ 0.17552627  0.17848112  0.08700178 ...  0.09953275  0.24277177
  -0.53346016]
 [ 0.53513629  0.13998083  0.26218361 ...  0.36489307  0.2097249
   0.22191153]]
Weights Gradient: [[ 5.68086414e-01  3.98296226e-01  2.97545828e-01  1.94293628e+00
   7.43461544e-01 -1.82442636e+00 -7.94699669e-01 -1.25347208e+00
   2.58488824e-01 -3.36216996e-01]
 [ 1.85402396e-01  3.91188318e-01  4.35087906e-01  2.73240015e-01
  -3.52897254e-02  1.38544055e-01  1.51054164e-01 -2.19361807e+00
   3.34925411e-01  3.19465531e-01]
 [-1.94472021e-01  1.66403596e+00 -6.24102614e-01  4.59731313e+00
   1.60299913e-01 -2.47

  -1.24081254e-01  1.10798540e-01  0.00000000e+00  6.27722271e-02]]
Weights: [[ 1.38890152e-02 -1.96995346e-01  1.05409667e-01  1.81170748e-01
   7.08160172e-02  1.65844900e-01 -1.08030205e-01 -3.17770272e-01
  -4.56345796e-05  2.10097834e-01 -3.95902809e-01  6.07125564e-01
  -1.56378401e-01 -1.01543932e-01  3.13051116e-02 -2.14535019e-01
  -1.05730820e-01  2.25095194e-01  6.81572244e-01  5.76723582e-01]
 [-1.76475782e-01  1.78942900e-01  2.44874801e-01 -1.60157005e-02
   3.75583172e-01 -1.30285796e-01 -2.15063490e-01  3.55498905e-01
   5.90516869e-03  5.13621952e-02 -1.59303762e-01  4.28377999e-02
  -2.58644415e-01  4.92936374e-01 -3.21391459e-01  1.54997458e-01
   5.20262259e-02  2.73782289e-01  2.02789623e-02  2.12912346e-01]
 [ 3.72459116e-01 -8.47994642e-01 -7.23622670e-01 -1.23714491e-01
   5.69966647e-02 -3.95143308e-01  7.50524322e-01  1.56917275e-01
   4.24278704e-03 -6.45218290e-01 -6.79843707e-01  1.71653215e-01
  -9.96072894e-02 -1.77505992e-01 -9.91643908e-01  5.65880463e-

Activation Gradient: [[ 0.04837486  0.03189622  0.05865801  0.12831753  0.07387046  0.10264313
   0.03870196 -0.72822857  0.10791445  0.13785196]
 [ 0.01124213  0.01055515  0.0068622   0.00960113  0.02544969  0.01745901
   0.00949727 -0.12549184  0.0129509   0.02187437]
 [ 0.05934621  0.0250128   0.04048934 -0.60308995  0.11118389  0.1379359
   0.02503882  0.0229333   0.15342474  0.02772495]
 [ 0.14554942  0.073618    0.05342057  0.03386981  0.10706023  0.0763847
   0.3307311   0.08457892 -0.93893839  0.03372563]
 [ 0.13753682  0.08226598  0.09959609  0.0399235  -0.83665708  0.0514162
   0.23982436  0.05702517  0.06715081  0.06191813]
 [ 0.03294696  0.07536922  0.00910619  0.02153435  0.15834069  0.05287113
   0.08983087  0.45115944  0.04158213 -0.93274098]
 [ 0.03766605  0.08224572  0.33694608  0.24189921  0.03703126 -0.95424646
   0.02228193  0.07747651  0.08575357  0.03294613]
 [ 0.07082103  0.04741114  0.01478356  0.04836421  0.12464405  0.07475948
   0.11854454  0.36642036 -0.9158

Weights Gradient: [[-1.49526160e+00 -5.49620754e-01 -5.45035856e-01  4.58137567e-01
   1.08849951e+00  1.75593616e+00 -3.15265901e-01  2.89559726e+00
   1.08655657e-01 -3.40164205e+00]
 [ 1.96202127e-01 -1.97359155e-01  1.98596563e-01  2.06274159e-01
  -3.52328549e-01  3.07383058e-02  2.78785954e-01  9.28508379e-01
   3.85162586e-01 -1.67458037e+00]
 [-9.52978838e+00  2.47926390e-02 -1.22727319e+00  5.83483563e-02
   2.26842015e+00  6.38945124e+00  1.51438764e+00  7.28252728e+00
   2.36048381e+00 -9.14134955e+00]
 [-4.61557256e+00  2.19958275e-01  8.64622201e-01 -3.23133356e-01
   5.00042505e-01  1.93093155e+00  7.55143700e-01  1.60189890e+00
   1.43941157e+00 -2.37330279e+00]
 [-9.79857393e-01 -1.93695403e+00 -2.86961435e+00 -1.29844165e-01
   6.55794961e-01  3.16303010e+00 -1.14859709e+00  3.96975243e+00
   1.40463677e+00 -2.12834724e+00]
 [-7.29212278e+00 -2.70715855e+00  6.88613106e-01 -8.38858432e-01
   1.61104211e+00  6.78989249e+00  4.24328698e-01  9.87256581e+00
   2.81829021e+

Weights Gradient: [[-1.36084996e+00 -6.12371381e-01  1.36317508e+00  2.02419349e+00
   6.45171599e-01 -2.86319346e+00 -1.24184395e+00  1.61606027e-02
   7.38038450e-01  1.29151953e+00]
 [ 1.61645054e-01  3.10011334e-01 -5.21751202e-02  1.82660951e-01
   3.80122548e-01  1.77950307e-01  1.26370655e-01 -1.41809654e+00
   1.92921220e-01 -6.14104133e-02]
 [-7.07547585e+00 -4.80108122e-01  2.89305211e+00  3.61614787e+00
   1.02071205e+00 -5.13598773e+00  8.10980133e-01 -2.56889967e-01
   2.06811983e+00  2.53944967e+00]
 [-4.58383899e+00  7.17294722e-01  9.12953063e-01  1.58616349e+00
   2.15612621e-01 -1.81296746e+00  1.64379674e-01  1.16634731e-01
   9.38139085e-01  1.74562907e+00]
 [-2.56461913e-01 -2.21744594e+00  1.82844356e-01  2.53574912e+00
  -2.92461388e-01 -1.61398903e+00 -3.09252357e+00  1.68110839e+00
   1.63418696e+00  1.43899302e+00]
 [-5.51686200e+00 -1.76846159e+00  3.64922307e+00  4.03268408e+00
   4.54516671e-01 -7.53972262e+00 -1.05682135e-01 -5.20971470e-01
   3.44892063e+

Weights Gradient: [[-1.53520680e+00 -8.98427877e-01  2.56825623e-04 -3.57471259e+00
  -1.17145389e+00  2.50445968e+00  1.37508145e+00 -3.53146510e+00
  -1.75730415e+00 -3.47473780e-01 -4.20925348e+00 -1.59349103e+00
  -1.29457117e-01  1.74749734e+00 -2.03090169e+00  4.32555804e-02
  -6.59158004e-01  1.64779737e+00 -7.16974873e-01 -7.13518390e-01]
 [-7.92249509e-02 -1.21870923e-01  8.86143544e-02 -4.27495654e-02
  -5.33090213e-01  2.36577797e-01 -8.18563160e-02  5.67279705e-03
  -2.21133652e-02 -1.50151798e-02 -5.38351536e-01  1.32812006e-01
   0.00000000e+00 -7.80388597e-02 -1.40617363e-03  0.00000000e+00
   1.54817822e-01  1.40951363e-01  6.30947954e-03  1.03367232e-01]
 [ 4.89915263e-01 -2.08969948e-01  1.38007176e+00 -3.56200741e-01
  -1.19669052e+00  1.44866626e-01 -1.23189971e+00 -4.77476698e-01
  -1.18982868e+00  1.30955115e-01 -1.15084735e+00 -2.27722691e-01
   6.49427970e-02  9.30634904e-01 -1.26046787e-01  5.65945262e-02
  -1.64227934e+00  1.33864318e+00  6.10869207e-01 -5.743

   1.09244712e-02 -4.85749188e-02  2.66245742e-03 -1.35638951e-03]]
Weights: [[ 1.32564377e-02 -1.96720066e-01  1.26688421e-01  1.67257696e-01
   7.54244567e-02  1.49458464e-01 -9.53621631e-02 -3.15316352e-01
   8.89670528e-03  2.07273416e-01 -3.66368583e-01  6.49966133e-01
  -1.48463645e-01 -1.38723745e-01  4.22505603e-02 -2.15386692e-01
  -9.99987459e-02  2.54077927e-01  6.76589072e-01  6.20427644e-01]
 [-1.76008286e-01  1.79435198e-01  2.44679280e-01 -1.52897449e-02
   3.75714966e-01 -1.31129036e-01 -2.15083130e-01  3.55507715e-01
   6.57850602e-03  5.09392640e-02 -1.58173072e-01  4.32470651e-02
  -2.58789806e-01  4.92671723e-01 -3.21102989e-01  1.54997458e-01
   5.21308768e-02  2.73168166e-01  2.02208665e-02  2.13821294e-01]
 [ 3.62251687e-01 -8.47768841e-01 -7.28481449e-01 -1.21920861e-01
   5.74610413e-02 -3.98355509e-01  7.69962210e-01  1.52329063e-01
   2.17627786e-02 -6.50727710e-01 -6.75663808e-01  1.88854178e-01
  -1.01321755e-01 -1.95487072e-01 -9.90700397e-01  5.65358932e-

Weights Gradient: [[-1.03750855e+00  7.51217026e-01  7.94017455e-01  1.55548390e+00
  -3.90437272e+00  4.12238870e+00 -2.96851958e+00  4.22912806e+00
   2.40882148e+00  2.68957067e+00 -1.04282637e+00 -3.62620739e+00
  -2.90190421e+00  3.17697486e+00  2.04011311e-02  9.93231725e-04
  -2.70833756e+00 -2.33602969e+00  1.82613280e+00 -9.02037700e+00]
 [-3.28472012e-01 -2.42405457e-01  4.36548101e-01 -2.58423618e-02
  -3.32205713e-01  5.18925020e-01  5.12417783e-02 -5.25642655e-02
   5.70502052e-01  8.63211023e-02 -6.59369886e-01  1.35787953e-01
   1.73664800e-01 -3.83971961e-01 -3.48679501e-02  0.00000000e+00
   9.29891751e-02  3.69164480e-01 -1.36534553e-01 -2.33684721e-01]
 [ 7.69922674e-01  1.02375216e-01  1.11818779e+00 -3.95639741e-02
  -1.57950224e-01  1.62780277e+00 -2.36430315e+00  1.00244722e+00
  -1.15565103e+00  6.15178123e-01 -8.59717022e-01 -1.55974055e+00
  -2.79009060e-01  2.19883167e+00  6.83729992e-03  1.29847177e-03
  -2.11367110e+00  9.02279497e-01  1.09512069e+00 -2.475

   1.15700608e-01  2.45430558e-02]]
Input Gradient: [[-0.13072422 -0.11960079  0.09156551 ...  0.21578846  0.12119351
  -0.09014386]
 [ 0.20261737  0.32507653 -0.43474713 ... -0.15059808 -0.14013514
   0.02255148]
 [ 0.08718169 -0.85355135  0.09172349 ... -0.01787077 -0.34745364
   0.3550186 ]
 ...
 [ 0.10204002 -0.80105436  0.09176842 ...  0.04495661 -0.24642265
   0.43032794]
 [ 0.27599228  0.11188555  0.1700565  ...  0.21653187  0.14357739
   0.15917993]
 [ 0.15128045  0.12877373 -0.28443706 ... -0.03179335  0.02660454
   0.0698033 ]]
Weights Gradient: [[-8.84640611e-01 -1.22487081e-01  6.83842758e-01 -1.70679223e+00
  -6.19856047e-01  2.39529858e+00  6.68860150e-01 -4.16203892e-01
  -1.71084718e+00  1.71282556e+00]
 [ 1.13928808e-01  4.62219485e-01  4.24155747e-01 -9.73183971e-01
  -2.60317080e-01  1.76882002e-01  2.80168491e-01 -1.03003159e+00
  -1.73309425e-01  9.79487538e-01]
 [-6.48517425e+00  1.67533714e+00  9.33933823e-01 -5.08780030e+00
  -2.76033533e+00  5.93817232e+00  3.8

Weights Gradient: [[ 1.03459611e+00 -6.24132600e+00 -2.18887704e+00 -1.66805891e+00
  -1.95020871e+00  1.70979411e-01 -1.02684458e+00 -7.71994523e-01
  -1.77060680e+00  5.46446794e-01 -2.37158758e+00 -1.45385211e+00
   2.03770580e+00 -3.04534026e+00 -7.56049531e+00  6.37162315e-03
   1.01389171e+01 -6.15017388e-02 -6.95498037e+00  3.14911172e+00]
 [-1.11924943e-01 -8.95394607e-02 -1.05325012e-03 -5.94754405e-03
  -3.57387190e-01  1.57102575e-01  6.51539798e-02 -9.83444140e-02
   1.65817842e-01  9.62580533e-03 -3.38622735e-01  5.54526366e-02
  -2.32392632e-02 -2.56029000e-01 -7.53146199e-02  0.00000000e+00
   1.50497542e-01  2.80053553e-01  3.23224458e-02  3.86380691e-02]
 [ 1.37578569e+00 -4.81064021e-01 -4.59645455e-03  4.51398476e-01
  -1.39001898e+00 -2.65865403e-01 -1.64764607e+00  1.98004760e-02
  -1.12459104e+00 -2.84683929e-01 -9.81386088e-01 -5.34351838e-01
  -1.36148775e-01  5.61552082e-01 -9.14494576e-02  1.05610427e-02
   9.87085632e-02  1.31376498e+00 -1.84929077e-01  7.338

Activation Gradient: [[ 4.33812714e-02  1.39872602e-03  1.67683658e-02  3.32230479e-01
   3.21202788e-02 -5.66862841e-01  1.10076364e-02  2.06971626e-02
   8.05962450e-02  2.86626759e-02]
 [ 2.30787497e-02  3.82656105e-02  3.63858814e-01  4.99353516e-02
   6.67852526e-02  8.84796385e-02  3.74195306e-02  3.44281226e-02
  -7.68704379e-01  6.64533083e-02]
 [ 1.91078675e-02  1.47976082e-02  1.24492201e-02  4.62657879e-02
   3.96157614e-01  1.12944904e-01  3.39025494e-02  1.50982914e-01
   4.83390793e-02 -8.34947544e-01]
 [ 1.83013573e-02  6.31203435e-02  6.24305586e-02  7.18070649e-02
   6.68578858e-02  1.19360854e-01  3.13887643e-02  1.07613574e-01
   2.10773729e-01 -7.51654131e-01]
 [ 4.46517078e-02  7.32030039e-02  2.94161772e-02  7.39922601e-03
   9.46685396e-02  1.89621583e-02 -3.17645933e-01  3.59155223e-03
   3.26628790e-02  1.30906894e-02]
 [-2.22052252e-01  3.05963124e-05  3.87992055e-03  4.58126093e-03
   1.33173616e-02  1.78835341e-01  1.41436136e-02  8.98806565e-04
   4.8615211

Weights: [[ 0.0110456  -0.18412024  0.14561571  0.14137144  0.07802628  0.14616992
  -0.09846897 -0.31232278  0.00873477  0.20858095 -0.36773489  0.68325266
  -0.14869479 -0.13383895  0.06034246 -0.21548434 -0.1037558   0.27018019
   0.6848757   0.64840917]
 [-0.17486228  0.18080078  0.24339058 -0.01430891  0.37639258 -0.13248041
  -0.21580849  0.35644745  0.00570404  0.05031865 -0.15589031  0.04245336
  -0.25928146  0.49411606 -0.32077655  0.1549977   0.05187474  0.27124466
   0.02057     0.21421801]
 [ 0.35381835 -0.84516313 -0.73606901 -0.12119236  0.06139765 -0.40301657
   0.78798544  0.14765789  0.0352484  -0.65703755 -0.66723639  0.20112721
  -0.10187365 -0.2096542  -0.98933578  0.56536034 -0.25013094 -0.06935777
  -0.53094563 -0.05809662]
 [-0.12330993 -0.13239136 -0.16934011 -0.29423006  0.07483266 -0.17833011
  -0.11030856  0.05526262  0.52757541  0.0681116  -0.026002    0.16411054
  -0.14161688  0.16952377 -0.13073643  0.38576     0.10030718  0.12977483
  -0.43782379  0.13659

  -0.07737102]]
Weights Gradient: [[-2.07172680e+00 -7.53192215e-01 -2.14993180e+00 -5.59445760e-02
  -1.89946331e-01 -2.72809628e+00  1.29553575e+00  1.09547129e+00
   5.41884402e+00  1.38986937e-01]
 [ 6.69902284e-02  3.37069133e-01  1.23930140e-01 -3.00505087e-01
  -1.37530090e-01 -1.76990467e-01  2.15109037e-01  2.39044178e-01
   6.19054816e-01 -9.86171887e-01]
 [-4.05006883e+00  4.69773035e-01 -2.73814866e+00 -2.09366983e+00
   7.61368367e-01 -7.55471481e+00  2.47842856e+00  1.72486351e+00
   1.03676435e+01  6.34525166e-01]
 [-3.20395512e+00  5.73774731e-01 -3.66122492e+00  7.43800123e-01
  -3.05000342e-01 -3.72908612e+00  1.65175844e+00  7.68085229e-01
   6.26796223e+00  8.93885753e-01]
 [-1.31725344e+00 -2.39967698e+00 -6.90252324e+00 -6.54802458e-01
  -1.30799198e+00 -2.95864613e+00  2.28928286e+00  2.63817121e+00
   1.00535699e+01  5.59870275e-01]
 [-3.42565932e+00 -2.33853074e+00 -4.71286063e+00 -1.74286345e+00
   5.34163354e-01 -8.90379718e+00  3.87955613e+00  2.49588164e+00

   1.72379522e+00  6.61192961e+00]]
Weights: [[ 2.21920720e-01 -5.13277312e-01  2.36634226e-01  4.45466591e-02
  -1.49713810e-01 -1.60926186e-01 -1.00127756e-01  2.89427602e-01
   3.50994705e-01 -8.49631804e-02]
 [-5.21123891e-02 -1.24488393e-01  4.01532637e-01 -4.77471010e-01
   8.40847278e-01 -3.09804634e-01 -2.65383609e-01  3.79655433e-01
  -1.16018857e-01 -1.71389167e-01]
 [-1.43232553e-01 -3.48034233e-01 -1.68674784e-01  3.21401325e-01
   6.07170779e-02  5.79715063e-01 -5.61729952e-02 -1.07245376e-01
  -1.19628985e-01  4.62017884e-01]
 [ 2.02800662e-02  2.04475706e-01 -2.49867530e-01  4.20551587e-01
   1.68800983e-01 -5.27867869e-01  9.74718665e-02  9.37765664e-02
  -3.56800091e-01 -5.49959492e-01]
 [-8.40436301e-01  3.03137247e-01  7.99256002e-01  2.99017133e-01
   7.79720139e-02 -7.58736976e-02 -1.97662804e-01 -1.94980897e-01
   4.24757075e-01 -1.85531855e-01]
 [-3.90909321e-01  8.22764054e-02 -5.55622415e-01  1.63323374e-01
  -2.98782664e-01  2.04620820e-01 -5.40875906e-01 -1.9

Input Gradient: [[-4.09552118e-04  1.18961623e-02 -6.78321865e-04 ...  3.86839465e-03
  -3.83057954e-04  4.27069239e-03]
 [-1.61540573e-02  3.57731189e-02 -1.26613479e-02 ... -4.67625760e-02
  -4.75373462e-02  2.65976593e-02]
 [-3.79141554e-02  2.45265044e-02 -1.12390957e-01 ... -2.31779008e-02
  -8.30378305e-03  5.54015099e-02]
 ...
 [-3.96378288e-04  8.09920004e-03 -2.07393885e-02 ...  1.13805572e-03
   2.73605714e-02  4.36590647e-05]
 [-1.49438703e-02 -2.91583597e-02 -8.60551456e-03 ... -3.94923472e-03
   1.51899494e-02  1.13384561e-02]
 [-1.46065689e-02  4.21484331e-02 -3.54308352e-02 ...  1.73292640e-03
  -6.94483285e-02  1.54283491e-03]]
Weights Gradient: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Weights: [[ 0.01591669  0.02695847  0.00769833 ...  0.04136653 -0.0021478
   0.10384388]
 [ 0.03033902 -0.04929204 -0.0026987  ...  0.01285809 -0.0673914
   0.01265783]
 [ 0.

Activation Gradient: [[ 6.73997908e-02  1.21815671e-01  1.00923898e-01  1.96649420e-02
   1.02218822e-01  2.90034498e-02 -5.92956186e-01  1.52664266e-02
   1.05877509e-01  3.07856764e-02]
 [ 8.55072353e-02  1.13003823e-03  3.14521417e-03  1.04752760e-02
   1.38945657e-01 -4.91991527e-01  1.59775357e-02  5.90447364e-02
   3.67554273e-02  1.41010407e-01]
 [ 8.03523166e-03  4.26352034e-02  3.96034706e-03  6.05418485e-02
  -4.75732090e-01  2.66926978e-02  1.15408233e-01  5.49823830e-02
   2.38943695e-02  1.39581776e-01]
 [ 1.10075622e-02  4.94347130e-02  3.47613679e-03  1.24452750e-03
   7.96798458e-02  3.35216104e-03 -1.89223363e-01  8.33923945e-03
   9.21332050e-03  2.34758570e-02]
 [ 1.06309439e-03  3.88276518e-01  6.36675206e-02  3.84718089e-02
   3.34968722e-02  2.56178460e-02  1.83518037e-02 -8.87439571e-01
   1.85915651e-01  1.32578456e-01]
 [ 1.23438588e-01  4.66554255e-04  7.10199854e-03  9.52012937e-03
   4.35064925e-02 -4.82506222e-01  2.20952321e-02  7.32808884e-02
   1.1023571

Input Gradient: [[ 0.02225357  0.03659494  0.03822483 ... -0.07203475  0.03470578
   0.03671816]
 [ 0.03996658  0.22226751 -0.07032649 ... -0.16867434  0.33686927
   0.51367234]
 [-0.28813405 -0.27407774  0.38806986 ... -0.15529888 -0.07063615
  -0.32066616]
 ...
 [-0.06609873  0.06666615 -0.15892689 ...  0.03860836  0.17013823
  -0.02197686]
 [-0.51831892 -0.28072244  0.10955777 ... -0.29219813  0.03847209
  -0.12979159]
 [-0.52787172 -0.02620767 -0.05383848 ...  0.38524874  0.16369464
  -0.09994942]]
Weights Gradient: [[ 2.28377955e+00  5.07417846e+00 -7.56247580e+00  3.14790878e+00
   1.39307993e+00  3.43937646e-01  4.46063049e-01  2.63291816e+00
   5.17290888e-01 -3.32236828e-01 -4.54443220e+00 -3.40431082e+00
  -4.27813802e+00  5.11940381e+00 -2.24235292e-01 -8.98704202e-03
  -2.45142250e-02 -1.76634742e+00  7.64397812e-01 -5.60234130e+00]
 [-3.49092059e-02 -1.57876154e-02 -1.74125444e-02 -8.99724515e-02
   5.12710138e-02  3.84605987e-02  5.31292087e-02  4.85488369e-02
  -1.882988

Weights: [[ 1.52036805e-02 -1.83323186e-01  1.58386144e-01  1.29974866e-01
   7.66578390e-02  1.38594565e-01 -9.62575071e-02 -3.18474841e-01
   8.04931125e-05  2.07960206e-01 -3.52343179e-01  7.07409228e-01
  -1.41963658e-01 -1.42970670e-01  6.00408883e-02 -2.15420636e-01
  -1.13727235e-01  2.97903109e-01  6.98900654e-01  6.74086067e-01]
 [-1.73900667e-01  1.81219308e-01  2.42601871e-01 -1.36357061e-02
   3.76084300e-01 -1.34201832e-01 -2.16877617e-01  3.57582230e-01
   6.24605404e-03  4.97485556e-02 -1.54302163e-01  4.27913550e-02
  -2.59979682e-01  4.94475805e-01 -3.20746226e-01  1.54997702e-01
   5.35234538e-02  2.69277059e-01  2.01789932e-02  2.15684378e-01]
 [ 3.50376404e-01 -8.42156746e-01 -7.43856116e-01 -1.19662330e-01
   6.21808896e-02 -4.08341493e-01  8.00709701e-01  1.43602077e-01
   4.34695701e-02 -6.62941327e-01 -6.58091553e-01  2.10925932e-01
  -1.04323858e-01 -2.19775420e-01 -9.88142228e-01  5.65336311e-01
  -2.38725871e-01 -7.42870424e-02 -5.31121141e-01 -4.18502241e-02

Activation Gradient: [[ 3.13072743e-02  8.93675840e-04  2.84515625e-02  3.59241941e-02
   2.39581536e-02  1.19131775e-01  2.00997387e-02  1.01191979e-02
  -3.50708441e-01  8.08228700e-02]
 [-5.17198087e-01  2.71823121e-06  1.00342444e-02  1.50239618e-01
   6.05740429e-04  3.34079645e-01  2.73988881e-04  1.58489086e-03
   1.98838701e-02  4.93370435e-04]
 [ 1.43867086e-04  5.03703287e-01  1.65589881e-02  2.70459095e-03
   2.35822718e-02  1.24158978e-03 -6.17906887e-01  2.53070474e-03
   5.48131235e-02  1.26284639e-02]
 [ 5.68914936e-04 -1.48212393e-01  2.80046526e-02  1.49398724e-02
   2.47789147e-02  2.64654968e-03  3.43734579e-02  4.97397874e-03
   2.99035026e-02  8.02254907e-03]
 [ 6.04476547e-03  8.99044394e-02  9.07910007e-03  3.79771175e-02
   1.46989796e-01  2.77847820e-02  7.65536083e-02  4.73388025e-02
   8.65604435e-02 -5.28232855e-01]
 [ 4.62999012e-04  1.63618202e-03 -1.87630576e-02  2.46836381e-03
   2.70892265e-04  2.73435474e-04  2.25253481e-03  2.57271429e-04
   1.1097843

Weights Gradient: [[-8.03935822e-01  5.05601638e-01  2.85033862e+00 -1.21143695e+00
   1.15444984e+00  3.19100833e+00 -8.60079862e-01 -8.35814570e-01
  -5.53303985e+00 -2.61899464e+00 -5.93607119e+00 -1.11838799e+00
  -1.16633866e+00  6.97629478e+00  6.28301271e-01  4.04401091e-03
  -9.85367230e+00  2.63479206e+00  4.03482702e+00 -8.04603753e-01]
 [-2.83856323e-01 -1.82667946e-01  3.78280709e-01 -6.06368878e-02
  -1.20359031e-01  3.76123190e-01  1.98160022e-02 -1.08568222e-01
   1.46960119e-01 -4.89953406e-02 -1.98551946e-01  1.26783850e-02
   0.00000000e+00 -2.26450743e-01 -9.42310161e-02  0.00000000e+00
  -2.23934476e-01  1.55147250e-01  6.18253550e-02 -8.10330389e-02]
 [ 1.73944145e+00 -1.38414807e-01  8.14697034e-01 -1.55706436e-01
   1.29605277e+00  1.66211366e-01 -2.81686945e+00  1.11269755e+00
  -3.24211304e+00  6.98478261e-01  4.97948173e-01 -2.38558033e+00
   4.77032275e-02  3.13074095e+00 -4.75772626e-01  7.37790053e-03
  -2.70927490e+00  1.22155242e+00  7.50008627e-01 -2.048

Gradient: [[ 8.28648111e-03  8.67410757e-02  1.06476130e-01  4.48850303e-02
   2.47627667e-02  1.84469427e-02  2.89242415e-02  3.68027537e-01
   8.17290508e-02 -7.68279256e-01]
 [ 3.26114932e-03  7.13810195e-03  5.70203343e-04  6.98784708e-04
  -2.29192304e-01  1.02123791e-02  3.35571195e-02  1.73455195e-02
   5.11970160e-03  1.51289345e-01]
 [-5.98220115e-01  1.02986805e-04  5.13922673e-01  1.51699073e-02
   2.73978664e-03  2.40141295e-02  3.89009109e-03  1.84356628e-02
   1.79030319e-02  2.04184593e-03]
 [ 3.22726142e-01  5.33885616e-05  2.87827508e-03  1.74309315e-03
   3.23717583e-02 -5.06424119e-01  6.56835018e-02  1.42010559e-03
   5.25634946e-02  2.69843605e-02]
 [ 2.41209964e-02  6.55234206e-03  7.16805276e-03  6.22634109e-04
   8.88111943e-02  4.47266225e-03 -1.62760061e-01  2.62576449e-03
   1.62509685e-02  1.21354457e-02]
 [ 1.70977211e-04  5.10717192e-04  3.87326170e-03  2.59104033e-04
   8.01311515e-04  9.91479116e-04  2.70643144e-05 -4.25411906e-02
   1.61820203e-03  3.42

Weights Gradient: [[ 2.54267572e-01 -8.58941858e-01  3.99455507e+00 -2.03368981e+00
   1.05944309e+00  1.78589166e+00 -4.15306672e-01 -7.79289035e-01
  -2.75841102e+00 -2.48519011e-01]
 [ 9.13636129e-02  9.53877358e-03  1.41817568e-01  1.55257969e-02
   3.84892390e-01  8.33030940e-01  2.94110906e-02 -8.60458836e-01
   7.28922136e-02 -7.18013548e-01]
 [ 1.15413721e+00 -2.64016978e+00  1.03757552e+01 -8.13245804e+00
   1.70447421e+00  6.96132409e+00  1.77847732e+00 -6.10205785e+00
  -4.34497359e+00 -7.54508742e-01]
 [ 1.79172849e-01  2.13166875e-01  4.83919813e+00 -4.95615756e+00
   3.87289016e-01  3.07494663e+00  2.94535061e-01 -2.39630298e+00
  -2.10969651e+00  4.73848492e-01]
 [ 6.84788524e-01 -9.07255088e-01  1.17536146e+01 -7.70010093e+00
   2.11338007e-01  2.72004519e+00 -1.34166066e+00 -3.10426546e-03
  -5.76483502e+00  3.47169647e-01]
 [ 1.51648642e+00 -3.46844632e+00  1.11702834e+01 -8.44287055e+00
   2.24880213e+00  6.64037792e+00  1.13538604e+00 -3.60410697e+00
  -5.39590026e+

  -0.33216205  0.13270147]]
Biases: [[ 0.03053692 -0.1312836   0.0125871  -0.46430836  0.24929269 -0.06121859
   0.46801868 -0.19944621  0.51205226  0.51296677  0.53997647  0.05347081
   0.20291657 -0.34666841 -0.29298746 -0.26417426  0.4613842   0.33421188
  -0.11478408 -0.30533898]]
Gradient: [[ 1.54245191e-01  2.12465646e-01 -3.38719240e-01 ... -1.06774553e-01
   1.74949966e-01  4.81514836e-02]
 [ 1.23727589e-01  1.43733924e-01 -7.81009153e-02 ... -7.65191880e-02
   2.07464562e-01  3.35670821e-01]
 [ 4.73359664e-02  3.02720090e-02 -1.53610931e-02 ... -3.87758400e-02
   1.24351009e-02  1.92713766e-02]
 ...
 [-8.22327367e-02  3.53776219e-02 -1.32243090e-01 ...  4.47430977e-02
   1.84325960e-01 -1.09312482e-02]
 [-4.30224199e-03  9.30314282e-02  6.26727892e-03 ... -3.39904589e-02
   1.26291349e-01  1.55305268e-01]
 [-5.08973557e-04  4.09939109e-03  8.94727538e-03 ... -6.05208910e-03
   4.78213354e-03 -2.01416158e-04]]
Activation Gradient: [[ 0.15424519  0.         -0.33871924 ... -0.10

   0.00629528]]
Biases: [[ 0.02658943 -0.06070646  0.0336196   0.00349472 -0.02002764 -0.01070151
   0.03966828  0.09221765 -0.0449879  -0.08647284 -0.01789651 -0.04158627
   0.03187851  0.10611848 -0.1117666  -0.06209679 -0.00844701 -0.00857556
  -0.00540705  0.01353841]]
Loss: 7.173837743961627
Gradient: [[ 1.23026431e-03  1.99999675e-02  3.63570761e-02 -2.06686787e-01
   9.93652721e-03  4.39843951e-02  5.92339229e-04  7.02946853e-03
   6.98609202e-02  1.76958284e-02]
 [ 3.22593737e-04  6.72571980e-04  3.16367651e-04  8.69717817e-04
   1.92052981e-02  6.39890811e-03  1.24901208e-04 -2.03725180e-01
   1.28912061e-03  1.74525701e-01]
 [ 1.38651047e-02  7.62619396e-04  9.73298760e-03  1.21201223e-02
   1.65552841e-02  2.80358666e-02  1.26472776e-03 -2.22782619e-01
   1.10245929e-02  1.29421314e-01]
 [ 1.74601716e-02  7.81911686e-03 -1.95698916e-01  1.75619333e-03
   2.94195484e-03  4.87200643e-03  1.09514567e-01  3.02069196e-02
   1.95912515e-02  1.53673477e-03]
 [-9.65929433e-02  1.103

  -1.52223634e+00  3.07245804e+00]]
Weights: [[ 0.22086778 -0.51246321  0.23303232  0.04242486 -0.16698439 -0.15557571
  -0.09902744  0.28784307  0.36652448 -0.0821261 ]
 [-0.05529237 -0.12869498  0.39890626 -0.48003856  0.83715934 -0.30913165
  -0.26952908  0.39626082 -0.12012787 -0.16414462]
 [-0.13198852 -0.35427311 -0.18606191  0.35082824  0.04022723  0.60767254
  -0.0868211  -0.09660924 -0.12750627  0.46539455]
 [ 0.03470806  0.19895246 -0.25294755  0.44403774  0.16212689 -0.53019189
   0.09068306  0.09699032 -0.36009094 -0.56340634]
 [-0.85208312  0.3179646   0.82515532  0.30908983  0.06405769 -0.0918752
  -0.19642228 -0.19680071  0.44178172 -0.21121394]
 [-0.39792559  0.09702964 -0.59471173  0.18715627 -0.31973462  0.23566618
  -0.57585051 -0.18600188  0.08202455  0.06438975]
 [ 0.01387298  0.99688012  0.32344683  0.02861497  0.38508431 -0.0942073
   0.57741027 -0.47247394  0.11282504 -0.06498411]
 [ 0.08048601 -0.14943746 -0.0701629  -0.42938597  0.09842862  0.13217032
  -0.411

Loss: 7.993595741244974
Gradient: [[ 9.91745158e-05  4.08725255e-04  9.09039453e-03  1.99291377e-03
   1.04285310e-03  3.92583045e-03  9.70019563e-06 -1.06675402e-01
   4.70068337e-03  8.54051264e-02]
 [ 2.51639354e-02  1.49702055e-04  2.89551375e-02  1.24890345e-04
   2.47079485e-02  2.75084336e-02 -1.32144188e-01  4.22135031e-05
   2.47907790e-02  7.01147710e-04]
 [ 1.34423875e-05  3.43571607e-05  1.07067567e-03 -3.95267734e-02
   1.16373255e-04  2.64650766e-02  1.08025225e-06  4.37336582e-05
   1.16387315e-02  1.43302895e-04]
 [ 3.44858923e-04  3.39299119e-06  2.30960878e-03  1.80710878e-03
   9.22187323e-04  2.13245717e-01  1.32918971e-05  6.62881673e-06
  -2.20340945e-01  1.68815097e-03]
 [ 6.58123011e-05 -9.25912396e-01  2.67207655e-01  4.95871093e-02
   1.92999757e-02  1.60971992e-02  8.31953938e-04  6.38239157e-04
   5.61756105e-01  1.04283468e-02]
 [ 5.59963214e-04  4.38361220e-04  1.10900456e-02  8.30644446e-02
   7.15793861e-04  2.75881457e-01  1.48994058e-04  1.88984333e-03

KeyboardInterrupt: 

In [290]:
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)