# Trying to build a simple Neural Network from scratch

In [95]:
# Basic functions to build NN
import numpy as np

def softmax(x):
    x = x - np.max(x, axis=-1, keepdims=True)
    return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)


def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

In [96]:
# Always works OK
def numerical_gradient_2d(f, X):
    h = 1e-4
    grad = np.zeros_like(X)
    for i, row in enumerate(X):
        for j, val in enumerate(row):
            X[i,j] -= h
            y1 = f(X)
            X[i,j] += 2*h
            y2 = f(X)
            grad[i,j] = (y2-y1)/(2*h)
            X[i,j] = val
    return grad


# Doesn't work in some of the following cases
def numerical_gradient_nd(f, X):
    '''Simply nemerically calcurate gradient of parameter matrix X'''
    h = 1e-4
    grad = []
    Xf = X.flatten()
    for i, val in enumerate(Xf):
        Xf[i] -= h
        y1 = f(Xf.reshape(X.shape))

        Xf[i] += 2*h
        y2 = f(Xf.reshape(X.shape))
        
        Xf[i] = val
        grad.append((y2-y1) / (2*h))

    return np.array(grad).reshape(X.shape)

numerical_gradient = numerical_gradient_nd # Set N-dimensional by default

Here, I implement 2 NN classes, Net0 and Net1. Net0 doesn't work but Net1 seems to work. Only difference between them is self.gradient(). In self.gradient(), loss function is passed to numerical_gradient. Net1 updates a parameter matrix(net.W) in addition to Net0.gradient()

In [97]:
# This doesn't work :(
class Net0:
    '''Simple NN class which holds parameter W'''
    def __init__(self):
        self.W = np.random.randn(2,3)
        
    def predict(self, x):
        a = np.dot(x, self.W)
        z = softmax(a)
        y = z
        return y

    def loss(self, x, t):
        y = self.predict(x)
        _loss = cross_entropy_error(y, t)
        return _loss
    
    def gradient(self, x, t):
        def f(W):
            return self.loss(x, t)
        return numerical_gradient(f, net.W)


# This works with only modification of self.gradient() against the Net0.gradient().
class Net1(Net0):
    '''Simple NN class which holds parameter W'''
    def gradient(self, x, t):
        def f(W):
            net.W = W # why do we need this with numerical_gradient_nd() used?      
            return self.loss(x, t)
        return numerical_gradient(f, net.W)

The following test with **Net0** doesn't work. Loss seems always constant.

In [98]:
# data
np.random.seed(0)
x0 = np.array([0.6, 0.9])
t0 = np.array([0, 0, 1])

# Doesn't work
net = Net0()
for i in np.arange(9):
    dW = net.gradient(x0, t0)
    net.W -=  0.5 * dW
    print(f"loss={net.loss(x0, t0)}")

loss=3.6674507891066104
loss=3.6674507891066104
loss=3.6674507891066104
loss=3.6674507891066104
loss=3.6674507891066104
loss=3.6674507891066104
loss=3.6674507891066104
loss=3.6674507891066104
loss=3.6674507891066104


If numerical_gradient is replaced with the following simpler numerical_gradient_2d, this works OK

In [99]:
numerical_gradient = numerical_gradient_2d # replace nd with 2d here, and this works OK

net = Net0()
for i in np.arange(9):
    dW = net.gradient(x0, t0)
    net.W -=  0.5 * dW
    print(f"loss={net.loss(x0, t0)}")

loss=0.49818359250842825
loss=0.3721732894742513
loss=0.2926569470459612
loss=0.2392641046837959
loss=0.2014525634086171
loss=0.17349119992150133
loss=0.1520794863966645
loss=0.13521166003877994
loss=0.12161009078258005


In [100]:
numerical_gradient = numerical_gradient_nd # restore with the original 'nd' version

# Alternatively use Net1() instead of Net0()
# Works OK
net = Net1()
for i in np.arange(9):
    dW = net.gradient(x0, t0)
    net.W -=  0.5 * dW
    print(f"loss={net.loss(x0, t0)}")    

loss=1.3001854908965156
loss=0.8729299051185798
loss=0.6049938177985322
loss=0.4422993266571196
loss=0.34053264446630105
loss=0.273466707069256
loss=0.22690856615419813
loss=0.1931017152108553
loss=0.1676234167519967


Another testing without Class, more simpified

In [101]:
np.random.seed(0)
W = np.random.rand(2,3)

def predict(x):
    a = np.dot(x, W)
    z = softmax(a)
    y = z
    return y

def loss(x, t):
    y = predict(x)
    return cross_entropy_error(y, t)

def loss_W(_W):    
    #W = _W      # this won't work
    #W[...] = _W # this works OK
    return loss(x0, t0)

In [102]:
numerical_gradient = numerical_gradient_nd # NG

for i in np.arange(9):
    dW = numerical_gradient(loss_W, W)
    W -= 0.5 * dW
    print(f"loss_W={loss_W(W)}") 

loss_W=1.0151821715228941
loss_W=1.0151821715228941
loss_W=1.0151821715228941
loss_W=1.0151821715228941
loss_W=1.0151821715228941
loss_W=1.0151821715228941
loss_W=1.0151821715228941
loss_W=1.0151821715228941
loss_W=1.0151821715228941


In [103]:
numerical_gradient = numerical_gradient_2d # OK

for i in np.arange(9):
    dW = numerical_gradient(loss_W, W)
    W -= 0.5 * dW
    print(f"loss_W={loss_W(W)}") 

loss_W=0.6959934206606462
loss_W=0.49938552069857356
loss_W=0.37757736153655047
loss_W=0.29870927133781544
loss_W=0.24493252040955152
loss_W=0.20649812932314293
loss_W=0.17791777406640177
loss_W=0.15595756807225267
loss_W=0.13862179022094626


To summarize,

| numerical_gradient_      | Net0 | Net1
| ----------- | ----------- |----------- 
| 2d      | OK      | OK 
| nd   | NG        | NG

The difference betwenn Net0 and Net1 is the existance of explicit net.W update in loss_W() function. So probably numerical_gradient_nd() seems to miss something?