In [1]:
#!pip install numpy datasets Pillow

In [2]:
import numpy as np
from datasets import load_dataset
import math
import PIL.Image as Image

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def relu(Z):
    return np.maximum(0,Z)

In [4]:
Z = np.random.rand(10,2)
print(relu(Z))

[[0.96607012 0.3168394 ]
 [0.64886777 0.67772625]
 [0.52885148 0.19134125]
 [0.55048054 0.63753456]
 [0.89960723 0.81837653]
 [0.04732853 0.61661569]
 [0.59127758 0.17794064]
 [0.02766858 0.81537959]
 [0.04399048 0.44378568]
 [0.07615009 0.5275206 ]]


In [5]:
def softmax(Z):
    exps = np.exp(Z - np.max(Z))
    sum = np.sum(exps, axis=0, keepdims=True)
    return exps / sum

In [6]:
A = softmax(np.random.randn(10,2))
print(np.sum(A))
# using math.isclose because of floating point errors
assert math.isclose(2, np.sum(A))

2.0


In [7]:
def linear_forward(A_prev, W, b):
    Z = np.dot(W, A_prev) + b
    cache = (A_prev, W, b)

    return Z, cache

In [8]:
fake_train = np.random.randn(784, 5)
fake_W = np.random.randn(10, 784)
b = 0
fake_Z, fake_cache = linear_forward(fake_train, fake_W, 0)
print(fake_Z.shape)

fake_W2 = np.random.randn(20,10)
fake_b2 = 0

fake_Z2, fake_cache = linear_forward(fake_Z, fake_W2, fake_b2)
print(fake_Z2.shape)

(10, 5)
(20, 5)


In [9]:
def forward_activation(Z, activation):
    if activation == "relu":
        A = relu(Z)
        return A, Z
    if activation == "softmax":
        A = softmax(Z)
        return A, Z

In [10]:
def compute_cost(A, Y, m):
    return -np.sum(Y * np.log(A))/m

In [11]:
Y = np.zeros((10,2))
Y[0][0] = 1
Y[1][1] = 1
print(Y)
A = softmax(np.array(np.random.randn(10,2)))
print(A)
print(compute_cost(A, Y, 2))

[[1. 0.]
 [0. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
[[0.25275634 0.12482923]
 [0.15255884 0.07994123]
 [0.03333412 0.0655302 ]
 [0.14386324 0.27351115]
 [0.05429998 0.02509985]
 [0.02723412 0.19535305]
 [0.15866386 0.0730627 ]
 [0.08853699 0.14411045]
 [0.05119893 0.00993916]
 [0.0375536  0.00862298]]
1.9508964268136677


In [34]:
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = np.dot(W, A_prev.T) / m
    db = np.sum(dZ, axis = 0, keepdims = True) / m
    dA_prev = np.dot(W.T, dZ)

    return dW, db, dA_prev

In [35]:
def relu_activation_backward(Z):
    grad = np.zeros_like(Z)
    grad[Z > 0] = 1

    return grad

In [36]:
Z = np.random.randn(10,2)
print(Z)
print(relu_activation_backward(Z))

[[-0.02122622  1.35463616]
 [-0.25994417 -1.50828852]
 [-1.46301262 -0.78484939]
 [-1.85765904 -0.06063431]
 [-0.57632942 -0.65555518]
 [ 1.43746918 -0.2336613 ]
 [-2.65643845  1.45927216]
 [-0.59053698 -1.47517829]
 [-1.30091946  0.53267311]
 [ 0.21624597 -0.81412323]]
[[0. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [1. 0.]
 [0. 1.]
 [0. 0.]
 [0. 1.]
 [1. 0.]]


In [15]:
def softmax_activation_backward(Y_hat, Y):
    return Y_hat - Y

In [27]:
def update_params(W, b, grads, learning_rate):
    dW, db = grads
    print(dW.shape)
    W = W - (dW * learning_rate)
    b = b - (db * learning_rate)

    return W, b

In [17]:
dataset = load_dataset('mnist')
train_dataset = dataset['train']
train_images = []

Y = np.zeros((10, 60000))
for i, example in enumerate(train_dataset):
    train_images.append(np.array(example['image']).flatten())
    label = example['label']
    Y[label][i] = 1

train_images = np.array(train_images).T

In [31]:
n_h = 10
W1 = np.random.randn(n_h,784) * 0.01
b1 = np.zeros((n_h,1))

W2 = np.random.randn(n_h, 10) * 0.01
b2 = np.zeros((n_h,1))

Z1, cache1 = linear_forward(train_images, W1, b1)
A1, activation_cache1 = forward_activation(Z1, "relu")

Z2, cache2 = linear_forward(A1, W2, b2)
Y_hat, activation_cache2 = forward_activation(Z2, "softmax")

In [40]:
print(W1.shape)
print(compute_cost(Y_hat, Y, train_images.shape[1]))

(10, 784)
2.445579636612893


In [42]:
dZ2 = softmax_activation_backward(Y_hat, Y)
dW2, db2, dA_prev = linear_backward(dZ2, cache2)

dZ1 = relu_activation_backward(dA_prev)
dW1, db1, dA_prev = linear_backward(dZ1, cache1)

ValueError: shapes (10,10) and (60000,10) not aligned: 10 (dim 1) != 60000 (dim 0)

In [41]:
print(dZ2.shape)

(10, 60000)


In [23]:
W2, b2 = update_params(W2, b2, (dW2, db2), 0.01)
W1, b1 = update_params(W1, b1, (dW1, db1), 0.01)

(10, 60000)


ValueError: operands could not be broadcast together with shapes (10,10) (10,60000) 