In [1]:
#!pip install numpy datasets Pillow

In [2]:
import numpy as np
from datasets import load_dataset
import math
import PIL.Image as Image

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def relu(Z):
    return np.maximum(0,Z)

In [4]:
Z = np.random.rand(10,2)
print(relu(Z))

[[0.87680351 0.27015617]
 [0.11788694 0.84693214]
 [0.26065975 0.71332669]
 [0.19572592 0.50501034]
 [0.87722157 0.62631142]
 [0.26091206 0.84459024]
 [0.14998426 0.97023634]
 [0.14403422 0.04037952]
 [0.29609888 0.41325615]
 [0.65380029 0.40945695]]


In [5]:
def softmax(Z):
    exps = np.exp(Z - np.max(Z))
    sum = np.sum(exps, axis=0, keepdims=True)
    return exps / sum

In [6]:
A = softmax(np.random.randn(10,2))
print(np.sum(A))
# using math.isclose because of floating point errors
assert math.isclose(2, np.sum(A))

2.0000000000000004


In [7]:
def linear_forward(A_prev, W, b):
    """Linear Transformation
    Takes activation output of previous, W and b of current 
    returns tuple with Z and cache of the inputs to this function
    """
    
    Z = np.dot(W, A_prev) + b
    cache = (A_prev, W, b)

    return Z, cache

In [8]:
fake_train = np.random.randn(784, 5)
fake_W = np.random.randn(10, 784)
b = 0
fake_Z, fake_cache = linear_forward(fake_train, fake_W, 0)

fake_W2 = np.random.randn(20,10)
fake_b2 = 0

fake_Z2, fake_cache = linear_forward(fake_Z, fake_W2, fake_b2)

In [9]:
def forward_activation(Z, activation):
    """
    @param Z output of linear layer
    @param relu or softmax
    returns output of activation and Z
    """
    if activation == "relu":
        A = relu(Z)
        return A, Z
    if activation == "softmax":
        A = softmax(Z)
        return A, Z

In [10]:
def compute_cost(Y_hat, Y):
    m = Y.shape[1]
    return -np.sum(Y * np.log(Y_hat)) / m

In [11]:
Y = np.zeros((10,2))
Y[0][0] = 1
Y[1][1] = 1
print(Y)
A = softmax(np.array(np.random.randn(10,2)))
print(A)
print(compute_cost(A, Y))

[[1. 0.]
 [0. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
[[0.22144355 0.02745741]
 [0.05756838 0.06711479]
 [0.06647727 0.11267901]
 [0.01098744 0.05124392]
 [0.06085267 0.08185695]
 [0.07218655 0.03524041]
 [0.10169815 0.19416332]
 [0.18797884 0.0883971 ]
 [0.02432658 0.08781962]
 [0.19648056 0.25402749]]
2.104469221160171


In [12]:
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis = 0, keepdims = True) / m
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [13]:
def relu_activation_backward(dA, Z):
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0

    return dZ

In [14]:
def cross_entropy_softmax_activation_backward(Y_hat, Y):
    return Y_hat - Y

In [15]:
def update_params(W, b, dW, db, learning_rate):
    updated_W = W - dW * learning_rate
    updated_b = b - db * learning_rate

    return updated_W, updated_b

In [16]:
update_W = np.array([[-2,2],[3,3]])
update_b = 1
print(update_params(update_W, update_b, 2, 1, 0.01))

(array([[-2.02,  1.98],
       [ 2.98,  2.98]]), 0.99)


In [17]:
def model_forward(X, params):
    """
    Passes input through layers. Params should account for at least 2 layers.
    returns Y_hat, params, and cache for each layer
    """
    if len(params) <= 1:
        raise Exception("model_forward params should contain at least 2 layers")
    
    caches = []

    # first layer uses the input X
    first_W, first_b = params[0]
    Z, forward_cache = linear_forward(X, first_W, first_b)
    A, activation_cache = forward_activation(Z, "relu")
    caches.append((forward_cache, activation_cache))

    # all of the middle layers use the previous layers output
    for W,b in params[1:-1]:
        Z, forward_cache = linear_forward(A, W, b)
        A, activation_cache = forward_activation(Z, "relu")
        caches.append((forward_cache, activation_cache))

    # last layer uses the softmax activation function
    last_W, last_b = params[-1]
    Z, forward_cache = linear_forward(A, last_W, last_b)
    A, activation_cache = forward_activation(Z, "softmax")
    caches.append((forward_cache, activation_cache))

    return A, params, caches

In [18]:
def model_backward(Y_hat, Y, params, caches):
    """
    @param Y_hat predicated outputs from forward pass
    @param Y true labels
    @param params to be updated
    @param caches tuple containing forward_cache and activation_cache from forward
    """
    learning_rate = 0.01
    
    last_linear_cache, last_activation_cache = caches[-1]
    dZ = cross_entropy_softmax_activation_backward(Y_hat, Y)
    dA_prev, dW, db = linear_backward(dZ, last_linear_cache)
    temp_W, temp_b = params[-1]
    params[-1] = update_params(temp_W, temp_b, dW, db, learning_rate)

    # don't include the last
    params_pointer = len(params) - 2
    for linear_cache, activation_cache in reversed(caches[:-1]):
        dZ = relu_activation_backward(dA_prev, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        temp_W, temp_b = params[params_pointer]
        params[params_pointer] = update_params(temp_W, temp_b, dW, db, learning_rate)

    return params

In [19]:
dataset = load_dataset('mnist')
train_dataset = dataset['train']
train_images = []

Y = np.zeros((10, 60000))
for i, example in enumerate(train_dataset):
    train_images.append(np.array(example['image']).flatten())
    label = example['label']
    Y[label][i] = 1

train_images = np.array(train_images).T

In [20]:
print(Y[:,0])
print(train_dataset['label'][0])

[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
5


In [28]:
n_h = 784
W1 = np.random.randn(n_h,784) * 0.01
b1 = np.zeros((n_h,1))

W2 = np.random.randn(10, n_h) * 0.01
b2 = np.zeros((10,1))

params = [(W1, b1), (W2, b2)]

In [45]:
for i in range(100):
    Y_hat, params, caches = model_forward(train_images, params) 
    print(compute_cost(Y_hat, Y))
    model_backward(Y_hat, Y, params, caches)

4.085413982531726
3.316406262126225
2.0700471398685947
1.262309302496258
1.0056241518436944
0.8228189837669786
1.069458322222579
1.5485774027289823
1.8952673544336
1.1655356851986098
0.829332941778737
0.6719739755461703
0.7704902209092545
0.9982031655577659
0.8984827379898178
0.7531528982523117
0.7111493812065169
0.9722820590322082
1.179620631749935
1.580294695160553
0.8536644026659025
0.712781487584349
0.5510943969999746
0.45007069873892314
0.5877110779134657
0.9074953068180116
0.902162213216696
0.8665547814713752
0.7529173834783813
0.624455136849103
0.5420401235559168
0.4982369259270695
0.5413434332778497
0.8752782727040261
0.8194199739268196
0.5891593480570656
0.45536656037213474
0.48737419860186953
0.5776022124399351
0.8110200042647763
0.79250397152216
0.6855824456225841
0.48403193614798135
0.47148096310234694
0.4130332917703356
0.44383064427707875
0.43128587002592617
0.38965783874124843
0.5870149842940161
0.4545809542956719
0.37227583098935924
0.3307695282100946
0.4114038095280192

In [46]:
Y_hat, params, caches = model_forward(train_images, params)
target = 20009
Y_pred = np.argmax(Y_hat, axis=0)
Y_answer = np.argmax(Y, axis=0)

diff = Y_pred - Y_answer
print(np.count_nonzero(diff))

3062
