In [57]:
#!pip install numpy datasets Pillow

In [58]:
import numpy as np
from datasets import load_dataset
import math
import PIL.Image as Image

In [59]:
def relu(Z):
    return np.maximum(0,Z)

In [60]:
Z = np.random.rand(10,2)
print(relu(Z))

[[0.49100846 0.20234597]
 [0.52323675 0.53469981]
 [0.05449806 0.67790417]
 [0.22211327 0.32422883]
 [0.3604039  0.17657048]
 [0.87616125 0.14433414]
 [0.49603478 0.21707525]
 [0.7067448  0.71885359]
 [0.24163933 0.17996736]
 [0.8994299  0.8141308 ]]


In [61]:
def softmax(Z):
    exps = np.exp(Z - np.max(Z))
    sum = np.sum(exps, axis=0, keepdims=True)
    return exps / sum

In [62]:
A = softmax(np.random.randn(10,2))
print(np.sum(A))
# using math.isclose because of floating point errors
assert math.isclose(2, np.sum(A))

2.0


In [63]:
def linear_forward(A_prev, W, b):
    """Linear Transformation
    Takes activation output of previous, W and b of current 
    returns tuple with Z and cache of the inputs to this function
    """
    
    Z = np.dot(W, A_prev) + b
    cache = (A_prev, W, b)

    return Z, cache

In [64]:
fake_train = np.random.randn(784, 5)
fake_W = np.random.randn(10, 784)
b = 0
fake_Z, fake_cache = linear_forward(fake_train, fake_W, 0)

fake_W2 = np.random.randn(20,10)
fake_b2 = 0

fake_Z2, fake_cache = linear_forward(fake_Z, fake_W2, fake_b2)

In [65]:
def forward_activation(Z, activation):
    """
    @param Z output of linear layer
    @param relu or softmax
    returns output of activation and Z
    """
    if activation == "relu":
        A = relu(Z)
        return A, Z
    if activation == "softmax":
        A = softmax(Z)
        return A, Z

In [82]:
def compute_cost(Y_hat, Y):
    m = Y.shape[1]
    print(m)
    return -np.sum(Y * np.log(Y_hat)) / m

In [67]:
Y = np.zeros((10,2))
Y[0][0] = 1
Y[1][1] = 1
print(Y)
A = softmax(np.array(np.random.randn(10,2)))
print(A)
print(compute_cost(A, Y))

[[1. 0.]
 [0. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
[[0.04621014 0.11502364]
 [0.07369277 0.01184268]
 [0.02460916 0.02877173]
 [0.06570856 0.46181361]
 [0.08935576 0.02288577]
 [0.01483292 0.21319953]
 [0.09575363 0.03576535]
 [0.49592079 0.05943528]
 [0.05901588 0.00851679]
 [0.03490039 0.04274562]]
3.755300853309631


In [68]:
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis = 0, keepdims = True) / m
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [69]:
def relu_activation_backward(dA, Z):
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0

    return dZ

In [70]:
def cross_entropy_softmax_activation_backward(Y_hat, Y):
    return Y_hat - Y

In [71]:
def update_params(W, b, dW, db, learning_rate):
    updated_W = W - dW * learning_rate
    updated_b = b - db * learning_rate

    return updated_W, updated_b

In [72]:
update_W = np.array([[-2,2],[3,3]])
update_b = 1
print(update_params(update_W, update_b, 2, 1, 0.01))

(array([[-2.02,  1.98],
       [ 2.98,  2.98]]), 0.99)


In [73]:
def model_forward(X, params):
    """
    Passes input through layers. Params should account for at least 2 layers.
    returns Y_hat, params, and cache for each layer
    """
    if len(params) <= 1:
        raise Exception("model_forward params should contain at least 2 layers")
    
    caches = []

    # first layer uses the input X
    first_W, first_b = params[0]
    Z, forward_cache = linear_forward(X, first_W, first_b)
    A, activation_cache = forward_activation(Z, "relu")
    caches.append((forward_cache, activation_cache))

    # all of the middle layers use the previous layers output
    for W,b in params[1:-1]:
        Z, forward_cache = linear_forward(A, W, b)
        A, activation_cache = forward_activation(Z, "relu")
        caches.append((forward_cache, activation_cache))

    # last layer uses the softmax activation function
    last_W, last_b = params[-1]
    Z, forward_cache = linear_forward(A, last_W, last_b)
    A, activation_cache = forward_activation(Z, "softmax")
    caches.append((forward_cache, activation_cache))

    return A, params, caches

In [74]:
def model_backward(Y_hat, Y, params, caches):
    """
    @param Y_hat predicated outputs from forward pass
    @param Y true labels
    @param params to be updated
    @param caches tuple containing forward_cache and activation_cache from forward
    """
    learning_rate = 0.01
    
    last_linear_cache, last_activation_cache = caches[-1]
    dZ = cross_entropy_softmax_activation_backward(Y_hat, Y)
    dA_prev, dW, db = linear_backward(dZ, last_linear_cache)
    temp_W, temp_b = params[-1]
    print(dZ.shape)
    params[-1] = update_params(temp_W, temp_b, dW, db, learning_rate)

    # don't include the last
    params_pointer = len(params) - 2
    for linear_cache, activation_cache in reversed(caches[:-1]):
        dZ = relu_activation_backward(dA_prev, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        temp_W, temp_b = params[params_pointer]
        params[params_pointer] = update_params(temp_W, temp_b, dW, db, learning_rate)

    return params

In [99]:
dataset = load_dataset('mnist')
train_dataset = dataset['train']
train_images = []

Y = np.zeros((10, 60000))
for i, example in enumerate(train_dataset):
    train_images.append(np.array(example['image']).flatten())
    label = example['label']
    Y[label][i] = 1

train_images = np.array(train_images).T

In [101]:
print(Y[:,0])
print(train_dataset['label'][0])

[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
5


In [102]:
n_h = 784
W1 = np.random.randn(n_h,784) * 0.01
b1 = np.zeros((n_h,1))

W2 = np.random.randn(10, n_h) * 0.01
b2 = np.zeros((10,1))

params = [(W1, b1), (W2, b2)]

In [None]:
for i in range(5):
    Y_hat, params, caches = model_forward(train_images, params) 
    print(compute_cost(Y_hat, Y))
    model_backward(Y_hat, Y, params, caches)
    if i == 4:
        print(Y[:,0])
        print(Y_hat[:,0])

60000
7.630199988730435
(10, 60000)
60000
nan


  return -np.sum(Y * np.log(Y_hat)) / m
  return -np.sum(Y * np.log(Y_hat)) / m


(10, 60000)
60000
nan
(10, 60000)
