In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data=pd.read_pickle('dataset.pkl')['data']

In [3]:
data = data.reshape((60000, 784))/255
data_train=data[:50000].T
data_test=data[50000:].T
data_train.shape

(784, 50000)

In [4]:
labels=pd.read_pickle('dataset.pkl')['labels']
y_train=labels[:50000]

In [5]:
y_train.shape

(50000,)

In [6]:
y_train

array([9, 0, 0, ..., 5, 1, 7], dtype=uint8)

In [7]:
def random_initialization():
    W1 = np.random.randn(17, 784)*0.01
    b1 = np.zeros((17,1))
    W2 = np.random.randn(10, 17)*0.02
    b2 = np.zeros((10,1))

    return W1, b1, W2, b2


In [8]:
def ReLu(z):
    return np.maximum(z,0)

In [9]:
def softmax(z):
    a = np.exp(z) / sum(np.exp(z))
    return a

In [10]:
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLu(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

In [11]:
def d_relu(Z):
    return Z > 0

In [12]:
def onehot_encoding(data, num_labels):
    one_hot_encoded = np.zeros((data.shape[0], num_labels))
    
    for i, label in enumerate(data):
        one_hot_encoded[i, label] = 1

    return one_hot_encoded.T


In [13]:
def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    one_hot_Y = onehot_encoding(Y,10)
    m=Y.size
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * d_relu(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2

In [14]:
def updates(W1, b1, W2, b2, dW1, db1, dW2, db2, l):
    W1 = W1 - l*dW1
    b1 = b1 - l*db1    
    W2 = W2 - l*dW2  
    b2 = b2 - l*db2    
    return W1, b1, W2, b2

In [15]:
def gradient_descent(X, Y, alpha, epochs):
    W1, b1, W2, b2 = random_initialization()
    
    for i in range(epochs):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = updates(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
    return W1, b1, W2, b2

In [16]:
W1, b1, W2, b2 = gradient_descent(data_train, y_train, 0.1,1000)

In [17]:
#def testforward_prop(W1, b1, W2, b2, X):
Z1 = W1.dot(data_train) + b1
A1 = ReLu(Z1)
Z2 = W2.dot(A1) + b2
A2 = softmax(Z2)

In [18]:
sum(y_train == np.argmax(A2, axis=0))/y_train.shape[0]

0.83812

In [19]:
np.argmax(A2, axis=0)

array([9, 0, 3, ..., 5, 1, 7])

In [20]:
y_test=labels[50000:]

In [21]:
data_test.shape

(784, 10000)

In [22]:
Z1 = W1.dot(data_test) + b1
A1 = ReLu(Z1)
Z2 = W2.dot(A1) + b2
A2 = softmax(Z2)


In [23]:
np.argmax(A2[:,0])

9

In [24]:
y_test.shape

(10000,)

In [25]:
sum(y_test == np.argmax(A2, axis=0))/y_test.shape[0]

0.8287