In [None]:
# pandas reads the CSV training data into a DataFrame
import pandas as pd
# numpy provides the arrays and linear algebra used by the network below
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Load the training table into a DataFrame; the path is relative to the
# directory the notebook is run from.
data_1 = pd.read_csv(filepath_or_buffer="data_files/train.csv")


In [None]:
# Work on a tiny corner of the table: the first 3 rows and first 7 columns
# (presumably so the arrays printed further down stay readable).
data_small = data_1.iloc[:3, :7]
data_small.head()

In [None]:
# Copy the slice into a plain ndarray so the in-place shuffle in the next cell
# does not touch the DataFrame it came from.
data = np.array(data_small, copy=True)


In [None]:
m, n = data.shape

# Shuffle the rows in place so the dev/train split below is not tied to the
# order of the file.
# NOTE(review): no seed is set, so the row order differs between runs.
np.random.shuffle(data)

# Number of rows held out for the dev set.
dev_size = 10

# With more rows than dev_size, train starts where dev ends so the two sets are
# disjoint. With fewer rows (e.g. the 3-row debugging slice) nothing can be
# held out, so both sets fall back to the full array.
train_start = dev_size if m > dev_size else 0

# Transpose so that each column is one sample:
#   row 0      -> the label of every sample
#   rows 1..   -> the feature values of every sample (n - 1 rows)
data_dev = data[0:dev_size].T
Y_dev = data_dev[0]
X_dev = data_dev[1:]
# NOTE(review): dividing by 255 presumably rescales 0-255 pixel values to [0, 1] - confirm
X_dev = X_dev / 255

data_train = data[train_start:].T
Y_train = data_train[0]
X_train = data_train[1:]
X_train = X_train / 255

In [None]:
# Dump the shuffled source array, then each derived training array with its shape.
print("orig_data: ", data.shape)
print(data)
print()
for _label, _arr in (
    ("data_train: ", data_train),
    ('X_train: ', X_train),
    ('Y_train: ', Y_train),
):
    print(_label, _arr.shape)
    print(_arr)

In [None]:
def init_params(n_inputs=6, n_hidden=10, n_outputs=10):
    """Create randomly initialised weights and biases for a two-layer network.

    Values are drawn with ``np.random.randn``, i.e. from a standard normal
    distribution (mean 0, variance 1). They are NOT confined to (-0.5, 0.5).

    Parameters
    ----------
    n_inputs : int, default 6
        Number of input features (rows of the input matrix X).
    n_hidden : int, default 10
        Number of neurons in the hidden layer.
    n_outputs : int, default 10
        Number of neurons in the output layer.

    Returns
    -------
    tuple of ndarray
        ``(w1, b1, w2, b2)`` with shapes ``(n_hidden, n_inputs)``,
        ``(n_hidden, 1)``, ``(n_outputs, n_hidden)`` and ``(n_outputs, 1)``.
        Each weight matrix is laid out as (neurons in this layer, neurons in
        the previous layer); each bias is a column vector.

    Also prints the shape of every created array.
    """
    w1 = np.random.randn(n_hidden, n_inputs)
    b1 = np.random.randn(n_hidden, 1)
    w2 = np.random.randn(n_outputs, n_hidden)
    b2 = np.random.randn(n_outputs, 1)
    print(f"w1: {w1.shape}")
    print(f"b1: {b1.shape}")
    print(f"w2: {w2.shape}")
    print(f"b2: {b2.shape}")
    return w1, b1, w2, b2

In [None]:
# Notebook-level weights and biases; init_params() already prints each shape.
(w1, b1, w2, b2) = init_params()

In [None]:
def ReLU(Z):
    """Rectified linear unit: replace every negative entry of ``Z`` with 0.

    ``np.maximum`` compares element by element, so the result has the same
    shape as ``Z``.
    """
    rectified = np.maximum(0, Z)
    return rectified

def softmax(Z):
    """Turn each column of scores in ``Z`` into a probability distribution.

    The column maximum is subtracted before exponentiating. This does not
    change the result mathematically (the common factor cancels in the ratio),
    but it keeps ``np.exp`` from overflowing to ``inf`` and producing ``nan``
    for large scores.

    Parameters
    ----------
    Z : array-like
        Scores laid out as (classes, samples); a 1-D input is treated as a
        single distribution.

    Returns
    -------
    ndarray
        Same shape as ``Z``; non-negative entries that sum to 1 along axis 0.

    Also prints the normalising denominator for debugging.
    """
    Z = np.asarray(Z)
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    denom = np.sum(exp_shifted, axis=0, keepdims=True)
    print("sum(np.exp(Z - max(Z))) : ", denom)
    return exp_shifted / denom

In [None]:
# left off at 16:54
def forward_prop(w1, b1, w2, b2, X):
    """Run one forward pass through the two-layer network, printing each step.

    Computes, in order::

        z1 = w1 . X + b1
        A1 = ReLU(z1)
        z2 = w2 . A1 + b2
        A2 = softmax(z2)

    Parameters
    ----------
    w1, b1 : ndarray
        Weights and bias of the hidden layer.
    w2, b2 : ndarray
        Weights and bias of the output layer.
    X : array-like
        Input matrix; must be dot-compatible with ``w1`` (features in rows,
        samples in columns).

    Returns
    -------
    tuple
        ``(z1, A1, z2, A2)``: the pre-activation and activation of each layer.
        These are what ``back_prop`` consumes.
    """
    print("Forward Propogation")
    z1 = w1.dot(X) + b1
    print("z1 = w1.dot(X) + b1")
    print(f"{z1.shape} = {w1.shape} .dot {X.shape} + {b1.shape}")
    A1 = ReLU(z1)
    print("A1 = ReLU(z1)")
    print(f"{A1.shape} = Relu{z1.shape}")
    z2 = w2.dot(A1) + b2
    print("z2 = w2.dot(A1) + b2")
    # report the bias that is actually added in this layer (b2, not b1)
    print(f"{z2.shape} = {w2.shape} .dot {A1.shape} + {b2.shape}")
    A2 = softmax(z2)
    print("A2 = softmax(z2)")
    # describe the softmax step itself rather than repeating the z2 line
    print(f"{A2.shape} = softmax{z2.shape}")
    print("z1: ")
    print()
    print(z1)
    print("A1: ")
    print()
    print(A1)
    print("z2: ")
    print()
    print(z2)
    print("A2: ")
    print()
    print(A2)
    print()

    return z1, A1, z2, A2

In [None]:
def one_hot(Y, num_classes=10):
    """One-hot encode a vector of class labels.

    One-hot encoding represents a categorical label as a binary vector that is
    1 at the position of the class and 0 everywhere else.

    Parameters
    ----------
    Y : array-like of int
        The class label of each sample. It may be 1-D or a column vector
        (m x 1) and must expose ``.size`` (ndarray / pandas Series). Labels
        stored as floats are truncated to integers.
    num_classes : int, default 10
        Number of classes, i.e. the number of rows of the result. Labels must
        lie in ``0 .. num_classes - 1``.

    Returns
    -------
    ndarray of shape (num_classes, m)
        Column ``j`` is the one-hot vector of sample ``j``.

    Also prints the input and the encoded matrices for debugging.
    """
    print()
    print("One hot encoding")
    print("Y ", Y, Y.size)
    # Flatten so an (m, 1) column does not broadcast against the row indices,
    # and make sure the labels are integers so they can be used as an index.
    labels = np.asarray(Y).ravel()
    if not np.issubdtype(labels.dtype, np.integer):
        labels = labels.astype(int)
    # one row per sample, one column per class
    ohY = np.zeros((labels.size, num_classes))
    # For each row i (np.arange gives 0 .. m-1) set the column given by that
    # sample's label to 1; every other entry stays 0.
    ohY[np.arange(labels.size), labels] = 1
    print('\n',"ohy ", ohY)
    print("ohY.T ", ohY.T, ohY.T.size)
    # transpose because we want each column, not each row, to be a sample
    return ohY.T

def deriv_ReLU(Z):
    """Derivative of ReLU, evaluated element-wise on ``Z``.

    ReLU is the identity for positive inputs (slope 1) and constant 0
    otherwise (slope 0). The boolean mask ``Z > 0`` encodes exactly that,
    because True/False act as 1/0 when multiplied with numbers.
    """
    positive_mask = Z > 0
    return positive_mask

In [None]:
def back_prop(z1, A1, z2, A2, w2, X, Y):
    """Compute the gradients of both layers for one batch, printing each step.

    Parameters
    ----------
    z1, A1, z2, A2
        Outputs of ``forward_prop`` for the same ``X`` (``z2`` is accepted but
        not used here).
    w2 : ndarray
        Output-layer weights that produced ``z2``.
    X : array-like
        Input batch, samples in columns.
    Y : array-like
        One class label per sample; ``Y.size`` is the batch size ``m``.

    Returns
    -------
    tuple
        ``(dw1, db1, dw2, db2)``. The bias gradients are 1-D (summed over the
        sample axis); ``update_params`` reshapes them to columns.
    """
    print('Back Propogation')
    m = Y.size
    print("m = Y.size", Y.shape)
    inv_m = 1/m

    targets = one_hot(Y)
    print("ohY: ", targets.size)

    # Output layer: error between predicted probabilities and one-hot targets.
    # NOTE(review): this is the softmax + cross-entropy gradient; no loss is
    # computed in this notebook, so the intended loss is presumed - confirm.
    dz2 = A2 - targets
    print("dz2 = A2 - ohY")
    print(f"{dz2.shape} = {A2.shape} - {targets.shape}")

    # Average over the batch of (output error) x (hidden activations).
    dw2 = inv_m * dz2.dot(A1.T)
    print("dw2 = 1/m * dz2.dot(A1.T)")
    print(f"{dw2.shape} = {inv_m} * {dz2.shape} dot {A1.T.shape}")

    # Sum the error over the sample axis: one value per output neuron.
    db2 = inv_m * np.sum(dz2, axis=1)
    print("db2 = 1/m * np.sum(dz2)")
    print(f"{db2.shape} = {inv_m} * np.sum{dz2.shape}")

    # Chain rule: push the output error back through w2 to the hidden layer,
    # then zero it wherever ReLU was inactive (z1 <= 0), since its slope is 0 there.
    relu_slope = deriv_ReLU(z1)
    dz1 = w2.T.dot(dz2) * relu_slope
    print("dz1 = w2.T.dot(dz2) * deriv_ReLU(z1)")
    print(f"{dz1.shape} = {w2.T.shape} dot {dz2.shape} * deriv_ReLU{z1.shape}")

    dw1 = inv_m * dz1.dot(X.T)
    print("dw1 = 1/m * dz1.dot(X.T)")
    print(f"{dw1.shape} = {inv_m} * {dz1.shape} dot {X.T.shape}")

    db1 = inv_m * np.sum(dz1, axis=1)
    print("db1 = 1/m * np.sum(dz1)")
    print(f"{db1.shape} = {inv_m} * np.sum{dz1.shape}")
    return dw1, db1, dw2, db2

In [None]:
def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, lr):
    """Take one gradient-descent step and return the updated parameters.

    Each parameter moves against its gradient, scaled by the learning rate
    ``lr``. The bias gradients are reshaped to column vectors so they line up
    with the ``(n, 1)`` biases instead of broadcasting to a matrix.

    Prints the shape of every parameter and gradient before updating.

    Returns
    -------
    tuple
        ``(w1, b1, w2, b2)`` after the step; the inputs are not modified.
    """
    named_arrays = (
        ("w1", w1), ("b1", b1), ("w2", w2), ("b2", b2),
        ("dw1", dw1), ("db1", db1), ("dw2", dw2), ("db2", db2),
    )
    for label, arr in named_arrays:
        print(f"{label}: {arr.shape}")

    new_w1 = w1 - lr*dw1
    new_b1 = b1 - lr*db1.reshape(-1,1)
    new_w2 = w2 - lr*dw2
    new_b2 = b2 - lr*db2.reshape(-1,1)
    return new_w1, new_b1, new_w2, new_b2

In [None]:
def singleUpToA2(X, Y):
    """Initialise fresh parameters and run a single forward pass on ``X``.

    ``Y`` is accepted but not used. The parameters created here are not
    returned, so callers only get the activations ``(z1, A1, z2, A2)``.
    """
    fresh_params = init_params()
    return forward_prop(*fresh_params, X)

def singleBackprop(z1, A1, z2, A2, w2, X, Y):
    """Run one backward pass; returns ``(dw1, db1, dw2, db2)`` from ``back_prop``."""
    gradients = back_prop(z1, A1, z2, A2, w2, X, Y)
    return gradients

In [None]:
# Hand-made input: 6 feature rows x 3 sample columns, each sample in one column.
X_rand = pd.DataFrame([[1., 0., 0.],
          [0., 1., 0.],
          [0., 0., 1.],
          [1., 0., 0.],
          [0., 0., 1.],
          [0., 0., 1.]])

print(X_rand)
print(X_rand.shape)
print(Y_train)
print(Y_train.shape)
# Run the forward pass with the notebook-level weights. The back-propagation
# cell that follows passes the global w2, so the activations must come from
# that same w2 for the gradients to be consistent.
z1, A1, z2, A2 = forward_prop(w1, b1, w2, b2, X_rand)

In [None]:
# Three hand-picked labels, one per column of X_rand, as a Series named 0.
Y = pd.Series(np.array([1, 2, 3]), name=0)

# Single backward pass over the activations computed in the previous cell.
dw1, db1, dw2, db2 = singleBackprop(z1, A1, z2, A2, w2, X_rand, Y)

In [None]:
def get_predictions(A2):
    """Return, for each column of ``A2``, the row index holding the largest value."""
    predicted_classes = np.argmax(A2, axis=0)
    return predicted_classes

def get_accuracy(predictions, Y):
    """Fraction of entries in ``predictions`` that equal the labels ``Y``.

    Prints both arrays before computing the ratio.
    """
    print(predictions, '\n', Y)
    hits = predictions==Y
    return np.sum(hits) / Y.size


def gradient_descent(X, Y, iterations, alpha):
    """Train the two-layer network with plain gradient descent.

    Starts from freshly initialised parameters and repeats forward pass,
    backward pass and parameter update ``iterations`` times with learning
    rate ``alpha``. Every 10th iteration (starting at 0) it prints the
    accuracy of the predictions made before that iteration's update.

    Returns
    -------
    tuple
        The trained ``(w1, b1, w2, b2)``.
    """
    w1, b1, w2, b2 = init_params()
    for step in range(iterations):
        z1, A1, z2, A2 = forward_prop(w1, b1, w2, b2, X)
        gradients = back_prop(z1, A1, z2, A2, w2, X, Y)
        w1, b1, w2, b2 = update_params(w1, b1, w2, b2, *gradients, alpha)
        if step % 10 != 0:
            continue
        print("Iteration: ", step)
        accuracy = get_accuracy(get_predictions(A2), Y)
        print(f"Accuracy: {accuracy}")
        print()

    return w1, b1, w2, b2

In [None]:
# Train for a single iteration with learning rate 0.1; this replaces the notebook-level weights.
w1, b1, w2, b2 = gradient_descent(X_train, Y_train, iterations=1, alpha=0.1)