In [2]:
#!pip install numpy datasets Pillow

In [3]:
import numpy as np
from datasets import load_dataset
import math
import PIL.Image as Image

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def relu(Z):
    return np.maximum(0,Z)

In [5]:
Z = np.random.rand(10,2)
print(relu(Z))

[[0.937116   0.47780433]
 [0.97468063 0.2416996 ]
 [0.13901059 0.36441213]
 [0.0834418  0.90643079]
 [0.68140029 0.91512293]
 [0.32823982 0.42770444]
 [0.72103609 0.26227339]
 [0.5652994  0.07028255]
 [0.54161478 0.459428  ]
 [0.4881677  0.33040054]]


In [6]:
def softmax(Z):
    exps = np.exp(Z - np.max(Z))
    sum = np.sum(exps, axis=0, keepdims=True)
    return exps / sum

In [7]:
A = softmax(np.random.randn(10,2))
print(np.sum(A))
# using math.isclose because of floating point errors
assert math.isclose(2, np.sum(A))

1.9999999999999996


In [8]:
def linear_forward(A_prev, W, b):
    Z = np.dot(W, A_prev) + b
    cache = (A_prev, W, b)

    return Z, cache

In [39]:
fake_train = np.random.randn(784, 5)
fake_W = np.random.randn(10, 784)
b = 0
fake_Z, fake_cache = linear_forward(fake_train, fake_W, 0)

fake_W2 = np.random.randn(20,10)
fake_b2 = 0

fake_Z2, fake_cache = linear_forward(fake_Z, fake_W2, fake_b2)

In [40]:
def forward_activation(Z, activation):
    if activation == "relu":
        A = relu(Z)
        return A, Z
    if activation == "softmax":
        A = softmax(Z)
        return A, Z

In [41]:
def compute_cost(A, Y):
    m = Y.shape[1]
    return -np.sum(Y * np.log(A)) / m

In [42]:
Y = np.zeros((10,2))
Y[0][0] = 1
Y[1][1] = 1
print(Y)
A = softmax(np.array(np.random.randn(10,2)))
print(A)
print(compute_cost(A, Y))

[[1. 0.]
 [0. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
[[0.32598639 0.09907379]
 [0.09126729 0.12823868]
 [0.10596463 0.14511197]
 [0.06816549 0.08458995]
 [0.01670704 0.29026963]
 [0.11008453 0.17050867]
 [0.07835719 0.0164674 ]
 [0.13058747 0.01262707]
 [0.03398516 0.01597294]
 [0.03889481 0.0371399 ]]
1.5873808350873695


In [13]:
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis = 0, keepdims = True) / m
    dA_prev = np.dot(W.T, dZ)

    return dW, db, dA_prev

In [14]:
def relu_activation_backward(Z):
    grad = np.zeros_like(Z)
    grad[Z > 0] = 1

    return grad

In [15]:
Z = np.random.randn(10,2)
print(Z)
print(relu_activation_backward(Z))

[[-1.70632903  0.33819933]
 [ 1.14691521 -1.48115663]
 [-0.99449234 -0.0722806 ]
 [ 1.38766455 -0.35198183]
 [ 0.98944235 -0.53754047]
 [-0.57653467  0.65965233]
 [-0.10858695  0.08211944]
 [-0.14262915 -2.11618676]
 [-0.3604113   1.02989566]
 [-0.29653028  0.40668129]]
[[0. 1.]
 [1. 0.]
 [0. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 0.]
 [0. 1.]
 [0. 1.]]


In [16]:
def softmax_activation_backward(Y_hat, Y):
    return Y_hat - Y

In [17]:
def update_params(W, b, grads, learning_rate):
    dW, db = grads

    updated_W = W - dW * learning_rate
    updated_b = b - db * learning_rate

    return updated_W, updated_b

In [29]:
update_W = np.array([[-2,2],[3,3]])
update_b = 1
print(update_params(update_W, update_b, (2, 1), 0.01))

(array([[-2.02,  1.98],
       [ 2.98,  2.98]]), 0.99)


In [18]:
dataset = load_dataset('mnist')
train_dataset = dataset['train']
train_images = []

Y = np.zeros((10, 60000))
for i, example in enumerate(train_dataset):
    train_images.append(np.array(example['image']).flatten())
    label = example['label']
    Y[label][i-1] = 1

train_images = np.array(train_images).T

In [35]:
n_h = 784
W1 = np.random.randn(n_h,784) * 0.01
b1 = np.zeros((n_h,1))

W2 = np.random.randn(10, n_h) * 0.01
b2 = np.zeros((10,1))

In [1]:
#print(W1[:,0])

In [43]:
for i in range(100):
    Z1, cache1 = linear_forward(train_images, W1, b1)
    A1, activation_cache1 = forward_activation(Z1, "relu")
    
    Z2, cache2 = linear_forward(A1, W2, b2)
    Y_hat, activation_cache2 = forward_activation(Z2, "softmax")

    if i % 10 == 0:
        print(compute_cost(Y_hat, Y))

    # back prop
    dZ2 = softmax_activation_backward(Y_hat, Y)
    dW2, db2, dA_prev = linear_backward(dZ2, cache2)
    
    dZ1 = relu_activation_backward(dA_prev)
    dW1, db1, dA_prev = linear_backward(dZ1, cache1)

    learning_rate = 0.1
    W2, b2 = update_params(W2, b2, (dW2, db2), learning_rate)
    W1, b1 = update_params(W1, b1, (dW1, db1), learning_rate)

[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]


ValueError: operands could not be broadcast together with shapes (10,2) (10,60000) 

In [38]:
Z1, cache1 = linear_forward(train_images, W1, b1)
A1, activation_cache1 = forward_activation(Z1, "relu")

Z2, cache2 = linear_forward(A1, W2, b2)
Y_hat, activation_cache2 = forward_activation(Z2, "softmax")
print(Y_hat[:,20])
print(W1)

[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
[[-0.01547446 -0.02148361 -0.01285288 ... -0.00489947 -0.0067364
  -0.00313343]
 [ 0.01113908 -0.00750254  0.00646999 ... -0.00476036 -0.00960414
  -0.00348149]
 [ 0.00192891  0.01259681  0.00373571 ... -0.00351112 -0.01162191
   0.01828954]
 ...
 [-0.02846834 -0.00248466  0.01407171 ...  0.00983741 -0.01320366
   0.00492162]
 [ 0.00381576  0.01674401  0.01547204 ...  0.00131335  0.00327611
   0.00182039]
 [-0.00538691 -0.0029411   0.01474322 ...  0.00855426 -0.00564906
   0.02036078]]
