In [5]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [6]:
!unzip train.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               


In [7]:
data = pd.read_csv("train.csv")

In [8]:
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# converting to numpy array to be able to manipulate it
data = np.array(data)

m,n = data.shape
np.random.shuffle(data)


data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n] / 255

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n] / 255

In [10]:
def init_params():
  W1 = np.random.rand(10,784) - 0.5
  b1 = np.random.rand(10,1) - 0.5
  W2 = np.random.rand(10,10) - 0.5
  b2 = np.random.rand(10,1) - 0.5

  return W1,b1,W2,b2

def reLU(Z):
  return np.maximum(0,Z)

def softmax(Z):
  #Z = np.zeros(Z.shape)
  return np.exp(Z) / sum(np.exp(Z))


def forward_prop(W1,b1,W2,b2,X):
  Z1 = W1.dot(X) + b1
  A1 = reLU(Z1)
  Z2 = W2.dot(A1) + b2
  A2 = softmax(Z2)

  return Z1,A1,Z2,A2

def one_hot(Y):
  one_hot_Y = np.zeros((Y.size,Y.max()+1))
  one_hot_Y[np.arange(Y.size),Y] = 1
  one_hot_Y = one_hot_Y.T
  return one_hot_Y

def deriv_reLU(Z):
  return Z > 0

def back_prop(Z1,A1,Z2,A2,W2,Y,X):
  m = Y.size
  one_hot_Y = one_hot(Y)
  dZ2 = A2 - one_hot_Y
  dW2 = 1/m * dZ2.dot(A1.T)
  db2 = 1/m * np.sum(dZ2,1)
  dZ1 = W2.T.dot(dZ2) * deriv_reLU(Z1)
  dW1 = 1/m * dZ1.dot(X.T)
  db1 = 1/m * np.sum(dZ1,1)

  return dW1,db1,dW2,db2

def update_params(W1,b1,W2,b2,dW1,db1,dW2,db2,alpha):
  W1 = W1 - alpha*dW1
  b1 = b1 - alpha*np.reshape(db1,(10,1))
  W2 = W2 - alpha*dW2
  b2 = b2 - alpha*np.reshape(db2,(10,1))

  return W1,b1,W2,b2


In [11]:
def get_predictions(A2):
  return np.argmax(A2,0)

def get_accuracy(predictions,Y):
  print(predictions,Y)
  return np.sum(predictions==Y) / Y.size

def gradient_descent(X,Y,iterations,alpha):
  W1, b1, W2, b2 = init_params()
  for i in range(iterations):
    Z1,A1,Z2,A2 = forward_prop(W1,b1,W2,b2,X)
    dW1,db1,dW2,db2 = back_prop(Z1,A1,Z2,A2,W2,Y,X)
    W1,b1,W2,b2 = update_params(W1,b1,W2,b2,dW1,db1,dW2,db2,alpha)
    if i % 50 == 0:
      print(f'iteration {i}')
      print(f'accuracy {get_accuracy(get_predictions(A2),Y)}')
  return W1,b1,W2,b2



In [12]:
W1,b1,W2,b2 = gradient_descent(X_train,Y_train,10000,0.3)

iteration 0
[7 1 4 ... 4 1 7] [2 5 6 ... 7 2 2]
accuracy 0.10824390243902439
iteration 50
[2 3 6 ... 8 0 2] [2 5 6 ... 7 2 2]
accuracy 0.6898780487804878
iteration 100
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.8013902439024391
iteration 150
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.8403170731707317
iteration 200
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.8602926829268293
iteration 250
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.8725609756097561
iteration 300
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.8808536585365854
iteration 350
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.8865609756097561
iteration 400
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.8910487804878049
iteration 450
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.8949268292682927
iteration 500
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.8972439024390244
iteration 550
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.9004390243902439
iteration 600
[2 5 6 ... 8 2 2] [2 5 6 ... 7 2 2]
accuracy 0.90304