In [1]:
#!pip install numpy datasets Pillow

In [2]:
import numpy as np
from datasets import load_dataset
import math
import PIL.Image as Image

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def relu(Z):
    return np.maximum(0,Z)

In [4]:
Z = np.random.rand(10,2)
print(relu(Z))

[[0.28347142 0.65338154]
 [0.99725726 0.67745403]
 [0.07005719 0.85323463]
 [0.41283128 0.09592087]
 [0.8013789  0.86727702]
 [0.84431865 0.32316221]
 [0.12950783 0.87403506]
 [0.98190163 0.20377207]
 [0.65903617 0.94753198]
 [0.31700911 0.64517294]]


In [5]:
def softmax(Z):
    exps = np.exp(Z - np.max(Z))
    sum = np.sum(exps, axis=0, keepdims=True)
    return exps / sum

In [6]:
A = softmax(np.random.randn(10,2))
print(np.sum(A))
# using math.isclose because of floating point errors
assert math.isclose(2, np.sum(A))

2.0


In [7]:
def linear_forward(A_prev, W, b):
    Z = np.dot(W, A_prev) + b
    cache = (A_prev, W, b)

    return Z, cache

In [8]:
fake_train = np.random.randn(784, 5)
fake_W = np.random.randn(10, 784)
b = 0
fake_Z, fake_cache = linear_forward(fake_train, fake_W, 0)

fake_W2 = np.random.randn(20,10)
fake_b2 = 0

fake_Z2, fake_cache = linear_forward(fake_Z, fake_W2, fake_b2)

In [9]:
def forward_activation(Z, activation):
    if activation == "relu":
        A = relu(Z)
        return A, Z
    if activation == "softmax":
        A = softmax(Z)
        return A, Z

In [10]:
def compute_cost(Y_hat, Y):
    m = Y.shape[1]
    return -np.sum(Y * np.log(Y_hat)) / m

In [11]:
Y = np.zeros((10,2))
Y[0][0] = 1
Y[1][1] = 1
print(Y)
A = softmax(np.array(np.random.randn(10,2)))
print(A)
print(compute_cost(A, Y))

[[1. 0.]
 [0. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
[[0.04825164 0.00642334]
 [0.08064569 0.02182243]
 [0.02601519 0.02976439]
 [0.22047259 0.07170474]
 [0.04664958 0.06349295]
 [0.08848984 0.24364418]
 [0.07361539 0.04407425]
 [0.29674571 0.03936513]
 [0.07531153 0.06686991]
 [0.04380285 0.41283868]]
3.4280711168717053


In [12]:
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis = 0, keepdims = True) / m
    dA_prev = np.dot(W.T, dZ)

    return dW, db, dA_prev

In [13]:
def relu_activation_backward(Z):
    grad = np.zeros_like(Z)
    grad[Z > 0] = 1

    return grad

In [14]:
Z = np.random.randn(10,2)
print(Z)
print(relu_activation_backward(Z))

[[ 0.80061251 -1.22711744]
 [-1.55060884  0.92693936]
 [-0.51135563  0.20048524]
 [-1.02015128  0.35791063]
 [-0.1944842  -0.55313325]
 [ 0.11978371 -0.71084491]
 [ 0.25655035  1.33420512]
 [-2.01562255 -0.55228805]
 [ 0.91933995 -0.12020466]
 [ 1.16276473 -0.63124937]]
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 0.]
 [1. 0.]
 [1. 1.]
 [0. 0.]
 [1. 0.]
 [1. 0.]]


In [15]:
def softmax_activation_backward(Y_hat, Y):
    return Y_hat - Y

In [16]:
def update_params(W, b, grads, learning_rate):
    dW, db = grads

    updated_W = W - dW * learning_rate
    updated_b = b - db * learning_rate

    return updated_W, updated_b

In [17]:
update_W = np.array([[-2,2],[3,3]])
update_b = 1
print(update_params(update_W, update_b, (2, 1), 0.01))

(array([[-2.02,  1.98],
       [ 2.98,  2.98]]), 0.99)


In [18]:
dataset = load_dataset('mnist')
train_dataset = dataset['train']
train_images = []

Y = np.zeros((10, 60000))
for i, example in enumerate(train_dataset):
    train_images.append(np.array(example['image']).flatten())
    label = example['label']
    Y[label][i-1] = 1

train_images = np.array(train_images).T

In [19]:
n_h = 784
W1 = np.random.randn(n_h,784) * 0.01
b1 = np.zeros((n_h,1))

W2 = np.random.randn(10, n_h) * 0.01
b2 = np.zeros((10,1))

In [None]:
#print(W1[:,0])

In [20]:
for i in range(100):
    Z1, cache1 = linear_forward(train_images, W1, b1)
    A1, activation_cache1 = forward_activation(Z1, "relu")
    
    Z2, cache2 = linear_forward(A1, W2, b2)
    Y_hat, activation_cache2 = forward_activation(Z2, "softmax")

    if i % 10 == 0:
        print(compute_cost(Y_hat, Y))

    # back prop
    dZ2 = softmax_activation_backward(Y_hat, Y)
    dW2, db2, dA_prev = linear_backward(dZ2, cache2)
    
    dZ1 = relu_activation_backward(dA_prev)
    dW1, db1, dA_prev = linear_backward(dZ1, cache1)

    learning_rate = 0.1
    W2, b2 = update_params(W2, b2, (dW2, db2), learning_rate)
    W1, b1 = update_params(W1, b1, (dW1, db1), learning_rate)

7.332375290418528
2.3025850929940455
2.3025850929940455
2.3025850929940455
2.3025850929940455
2.3025850929940455
2.3025850929940455
2.3025850929940455
2.3025850929940455
2.3025850929940455


In [21]:
Z1, cache1 = linear_forward(train_images, W1, b1)
A1, activation_cache1 = forward_activation(Z1, "relu")

Z2, cache2 = linear_forward(A1, W2, b2)
Y_hat, activation_cache2 = forward_activation(Z2, "softmax")
print(Y_hat[:,20])
print(W1)

[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
[[ 1.36660868e-02  7.29232681e-03  1.91740950e-03 ...  2.36161189e-03
   4.87373041e-04 -3.69570747e-03]
 [-4.52904594e-03 -3.00460148e-03 -2.39135796e-03 ... -1.55388207e-02
   3.88808363e-03 -8.11918698e-03]
 [-9.17735191e-04 -1.35629760e-02  4.38444203e-03 ...  1.43089489e-02
  -9.96084746e-03 -1.54381320e-03]
 ...
 [-4.66638665e-03 -5.24440675e-03  7.17981893e-05 ...  5.35902131e-03
  -2.63796801e-03  3.29085753e-03]
 [-5.35252940e-03 -3.58190663e-03  7.48062909e-03 ...  1.47151582e-03
  -8.55522341e-03  6.55363024e-03]
 [ 8.05459437e-03 -5.87409968e-03  1.34903865e-02 ... -2.07140787e-03
  -2.56660109e-03 -7.94257614e-03]]
