In [1]:
#!pip install numpy datasets Pillow

In [2]:
import numpy as np
from datasets import load_dataset
import math
import PIL.Image as Image

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def relu(Z):
    return np.maximum(0,Z)

In [4]:
Z = np.random.rand(10,2)
print(relu(Z))

[[0.67435404 0.49117437]
 [0.03065551 0.52555793]
 [0.85209841 0.17836228]
 [0.63882956 0.01119575]
 [0.23267744 0.89489578]
 [0.6890755  0.48012301]
 [0.29770143 0.66773982]
 [0.97727825 0.86105917]
 [0.70028144 0.95928848]
 [0.16573499 0.54229028]]


In [5]:
def softmax(Z):
    exps = np.exp(Z - np.max(Z))
    sum = np.sum(exps, axis=0, keepdims=True)
    return exps / sum

In [6]:
A = softmax(np.random.randn(10,2))
print(np.sum(A))
# using math.isclose because of floating point errors
assert math.isclose(2, np.sum(A))

2.0


In [7]:
def linear_forward(A_prev, W, b):
    """Linear Transformation
    Takes activation output of previous, W and b of current 
    returns tuple with Z and cache of the inputs to this function
    """
    Z = np.dot(W, A_prev) + b
    cache = (A_prev, W, b)

    return Z, cache

In [8]:
fake_train = np.random.randn(784, 5)
fake_W = np.random.randn(10, 784)
fake_b = np.zeros((10,1))
fake_Z, fake_cache = linear_forward(fake_train, fake_W, fake_b)

fake_W2 = np.random.randn(20,10)
fake_b2 = np.zeros((20,1))

fake_Z2, fake_cache = linear_forward(fake_Z, fake_W2, fake_b2)

In [9]:
def forward_activation(Z, activation):
    """
    @param Z output of linear layer
    @param relu or softmax
    returns output of activation and Z
    """
    if activation == "relu":
        A = relu(Z)
        return A, Z
    if activation == "softmax":
        A = softmax(Z)
        return A, Z

In [10]:
def compute_cost(Y_hat, Y):
    m = Y.shape[1]
    return -np.sum(Y * np.log(Y_hat)) / m

In [11]:
Y = np.zeros((10,2))
Y[0][0] = 1
Y[1][1] = 1
print(Y)
A = softmax(np.array(np.random.randn(10,2)))
print(A)
print(compute_cost(A, Y))

[[1. 0.]
 [0. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
[[0.0713835  0.14123887]
 [0.28809995 0.07909781]
 [0.0576663  0.01979414]
 [0.0324106  0.06552825]
 [0.19993752 0.17377762]
 [0.03979986 0.29224956]
 [0.06096412 0.02920601]
 [0.06282414 0.03695044]
 [0.08812575 0.09513607]
 [0.09878825 0.06702124]]
2.588379323272076


In [12]:
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis = 1, keepdims = True) / m
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [13]:
def relu_activation_backward(dA, Z):
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0

    return dZ

In [14]:
def cross_entropy_softmax_activation_backward(Y_hat, Y):
    return Y_hat - Y

In [15]:
def update_params(W, b, dW, db, learning_rate):
    updated_W = W - learning_rate * dW
    updated_b = b - learning_rate * db

    return updated_W, updated_b

In [16]:
update_W = np.array([[-2,2],[3,3]])
update_b = 1
print(update_params(update_W, update_b, 2, 1, 0.01))

(array([[-2.02,  1.98],
       [ 2.98,  2.98]]), 0.99)


In [17]:
def model_forward(X, params):
    """
    Passes input through layers. Params should account for at least 2 layers.
    returns Y_hat, params, and cache for each layer
    """
    if len(params) <= 1:
        raise Exception("model_forward params should contain at least 2 layers")
    
    caches = []

    # first layer uses the input X
    first_W, first_b = params[0]
    Z, forward_cache = linear_forward(X, first_W, first_b)
    A, activation_cache = forward_activation(Z, "relu")
    caches.append((forward_cache, activation_cache))

    # all of the middle layers use the previous layers output
    for W,b in params[1:-1]:
        Z, forward_cache = linear_forward(A, W, b)
        A, activation_cache = forward_activation(Z, "relu")
        caches.append((forward_cache, activation_cache))

    # last layer uses the softmax activation function
    last_W, last_b = params[-1]
    Z, forward_cache = linear_forward(A, last_W, last_b)
    A, activation_cache = forward_activation(Z, "softmax")
    caches.append((forward_cache, activation_cache))

    return A, params, caches

In [18]:
def model_backward(Y_hat, Y, params, caches):
    """
    @param Y_hat predicated outputs from forward pass
    @param Y true labels
    @param params to be updated
    @param caches tuple containing forward_cache and activation_cache from forward
    """
    learning_rate = 0.01
    
    last_linear_cache, last_activation_cache = caches[-1]
    dZ = cross_entropy_softmax_activation_backward(Y_hat, Y)
    dA_prev, dW, db = linear_backward(dZ, last_linear_cache)
    temp_W, temp_b = params[-1]
    params[-1] = update_params(temp_W, temp_b, dW, db, learning_rate)

    # don't include the last
    params_pointer = len(params) - 2
    for linear_cache, activation_cache in reversed(caches[:-1]):
        dZ = relu_activation_backward(dA_prev, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        temp_W, temp_b = params[params_pointer]
        params[params_pointer] = update_params(temp_W, temp_b, dW, db, learning_rate)

    return params

In [19]:
def format_data(dataset):
    """
    Formats dataset to work with model
    returns binary images, labels
    """
    images = []
    Y = np.zeros((10, dataset.shape[0]))
    for i, example in enumerate(dataset):
        images.append(np.array(example['image']).flatten())
        label = example['label']
        Y[label][i] = 1
    
    images = np.array(images).T
    return images, Y

In [20]:
dataset = load_dataset('mnist')
train_dataset = dataset['train']

train_images, Y = format_data(train_dataset)

In [21]:
print(Y[:,0])
print(train_dataset['label'][0])

[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
5


In [22]:
n_h = 784
W1 = np.random.randn(n_h,784) * 0.01
b1 = np.zeros((n_h,1))

W2 = np.random.randn(n_h, n_h) * 0.01
b2 = np.zeros((n_h, 1))

W3 = np.random.randn(10, n_h) * 0.01
b3 = np.zeros((10,1))

params = [(W1, b1), (W2, b2), (W3, b3)]

In [23]:
for i in range(100):
    Y_hat, params, caches = model_forward(train_images, params) 
    print(compute_cost(Y_hat, Y))
    model_backward(Y_hat, Y, params, caches)

2.568793484883357
3.931390899852869
5.2751618162809555
3.8879711339708125
1.950962827392716
1.6607210537695596
1.4242308683637683
1.200956956178693
1.0155211074009862
0.8828934838926144
0.7918003836631741
0.7318835318508988
0.7206140411646177
0.8743962430989263
1.4809432532382187
1.2995428382597722
1.1098122533389796
0.7844938512065005
0.6290066048272102
0.5700698040193339
0.5462421903151008
0.5378120486819471
0.5701164199073904
0.6565056383959655
0.7783718476083309
0.783080707061853
0.5662206653103093
0.4795231580442267
0.45471472390347895
0.43813926681623266
0.4277329556067663
0.4199231168160232
0.4154682556742228
0.4160663645290645
0.42858308412672624
0.45572666892618485
0.5458694258176973
0.577498649661187
0.686012415899268
0.4965627413934627
0.43921575346427066
0.3927599818207807
0.3823524803092129
0.37175358492180666
0.36793623366208567
0.36333793490684774
0.3626561944933779
0.3615697753349816
0.3653801599789353
0.37026460746440165
0.3832433732284846
0.3997937629876694
0.42716976

In [24]:
test_dataset = dataset["test"]
test_images, Y_test = format_data(test_dataset)
print(test_images.shape)

(784, 10000)


In [25]:
print(params[1][1].shape)
Y_hat, params, caches = model_forward(test_images, params)
Y_pred = np.argmax(Y_hat, axis=0)
Y_answer = np.argmax(Y_test, axis=0)

diff = Y_pred - Y_answer
# print number of incorrect
print(np.count_nonzero(diff))

(784, 1)
752
