In [1]:
import argparse
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
digits = 10

test = np.load('test.npz')
train = np.load('train.npz')

y_test = test['label']
x_test = test['image']
y_train = train['label']
x_train = train['image']

x_train = x_train.reshape(-1, x_train.shape[1] * x_train.shape[2])
x_test = x_test.reshape(-1, x_test.shape[1] * x_test.shape[2])
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

X = np.vstack((x_train, x_test))
y = np.vstack((y_train, y_test))

examples = y.shape[0]
y = y.reshape(1, examples)
Y_new = np.eye(digits)[y.astype('int32')]
Y_new = Y_new.T.reshape(digits, examples)
  #print(y)

m = x_train.shape[0]
X_train, X_test = X[:m].T, X[m:].T
Y_train, Y_test = Y_new[:, :m], Y_new[:, m:]

shuffle_index = np.random.permutation(m)
X_train, Y_train = X_train[:, shuffle_index], Y_train[:, shuffle_index]

In [3]:
def sigmoid(z):
    s = 1. / (1. + np.exp(-z))
    return s


In [4]:
def sigmoid_gradient(z):
#     # To prevent from overflow
#     z = np.clip(z, 1e-15, 1 - 1e-15)
    s = sigmoid(z) * (1 - sigmoid(z))
    return s


In [5]:
def cross_entropy(Y, Y_hat):
    L_sum = np.sum(np.multiply(Y, np.log(Y_hat)))
    m = Y.shape[1]
    L = -(1./m) * L_sum

    return L

In [6]:
def cross_entropy_gradient(Y, Y_hat):
    L = Y_hat - Y
    return L

In [7]:
def softmax(z):
    s = np.exp(z) / np.sum(np.exp(z), axis=0)
    return s

In [8]:
def evaluation(predicts, golds):
    correct = 0
    total = len(predicts)
    assert len(predicts) == len(golds)
    for predict, gold in zip(predicts, golds):
        if predict == gold:
            correct += 1
    accurancy = correct / total
    return accurancy

In [9]:
class Layer:
    def __init__(self, input_, output):
        self.input = input_
        self.output = output  # number of layer node
        self.W = np.random.randn(self.output, self.input) * np.sqrt(1. / self.input)
        self.b = np.zeros((self.output, 1)) * np.sqrt(1. / self.input)
        
    def forward(self, last_layer):
        self.last_layer = last_layer
        layer_output = np.matmul(self.W, self.last_layer) + self.b
#         layer_output = sigmoid(layer_output_temp)
        return layer_output
    
    def back_propagation(self, CE_gradientorgradient, m_batch, learning_rate):
        W_temp = self.W
        W_gradient = (1. / m_batch) * np.matmul(CE_gradientorgradient, self.last_layer.T)
        b_gradient = (1. / m_batch) * np.sum(CE_gradientorgradient, axis=1, keepdims=True)
        self.W_new = self.W - learning_rate * W_gradient
        self.b_new = self.b - learning_rate * b_gradient
        self.W = self.W_new
        self.b = self.b_new
        gradient_temp = np.matmul(W_temp.T, CE_gradientorgradient)
        return gradient_temp
        
    
    

In [10]:
hiddenlayer1 = Layer(784, 400)
hiddenlayer2 = Layer(400, 400)
outputlayer = Layer(400, 10)

In [11]:
def SGD_train_epoch(X_train, Y_train, batch_size = 64, epoch = 10, learning_rate = 0.03):
    TrainError = []
    TestError = []
    for i in range(epoch):
       
        # shuffle training set
        permutation = np.random.permutation(X_train.shape[1])
        X_train_shuffled = X_train[:, permutation]
        Y_train_shuffled = Y_train[:, permutation]
    
        batch_num = len(X_train) // batch_size
        predicts = []
        golds = []
        predicts_test = []
        golds_test = []
        
        for j in range(batch_num):
            begin = j * batch_size
            end = min(begin + batch_size, X_train.shape[1] - 1)
            X = X_train[:, begin:end]
            Y = Y_train[:, begin:end]
            m_batch = end - begin
            
            output1_temp = hiddenlayer1.forward(X)
            output1 = sigmoid(output1_temp)
            output2_temp = hiddenlayer2.forward(output1)
            output2 = sigmoid(output2_temp)
            y_hat_temp = outputlayer.forward(output2)
            y_hat = softmax(y_hat_temp)
            #print(y_hat)
            
            predicts += np.argmax(y_hat, axis = 0).tolist()
            golds += np.argmax(Y, axis = 0).tolist()
            
            
            loss = cross_entropy(Y, y_hat)
            gradient = cross_entropy_gradient(Y, y_hat)
            
            back_output1 = outputlayer.back_propagation(gradient, m_batch, learning_rate)
            back_output2_temp = sigmoid_gradient(output2_temp) * back_output1
            back_output2 = hiddenlayer2.back_propagation(back_output2_temp, m_batch, learning_rate)
            back_output3_temp = sigmoid_gradient(output1_temp) * back_output2
            back_output3 = hiddenlayer1.back_propagation(back_output3_temp, m_batch, learning_rate)
            
            #---------test data-----------
            
            
            output1_temp = hiddenlayer1.forward(X_test)
            output1 = sigmoid(output1_temp)
            output2_temp = hiddenlayer2.forward(output1)
            output2 = sigmoid(output2_temp)
            y_hat_temp = outputlayer.forward(output2)
            y_hat = softmax(y_hat_temp)
#             print(y_hat.shape)
            #print(y_hat)
#             print(Y_test.shape)
            predicts_test += np.argmax(y_hat, axis=0).tolist()
            golds_test += np.argmax(Y_test, axis=0).tolist()
            
            
#             loss_test = cross_entropy(Y_test, y_hat)
            
        print('Epoch : ', i + 1, 'training_loss = ', loss, 'train_accur = ', evaluation(predicts, golds), 'test_accur = ', evaluation(predicts_test, golds_test))
            
        TrainError.append(1 - evaluation(predicts, golds))
        TestError.append(1 - evaluation(predicts_test, golds_test))    

    with open("Train_error_rate.json", mode="w") as stream:
        json.dump(TrainError, stream)

    with open("Test_error_rate.json", mode="w") as stream:
        json.dump(TestError, stream)

In [12]:
SGD_train_epoch(X_train, Y_train, batch_size = 64, epoch = 500, learning_rate = 0.03)

Epoch :  1 training_loss =  2.2731195478764694 train_accur =  0.13411458333333334 test_accur =  0.13793053629218677
Epoch :  2 training_loss =  2.181276184156391 train_accur =  0.2903645833333333 test_accur =  0.2922590152565881
Epoch :  3 training_loss =  2.1029575278879085 train_accur =  0.4427083333333333 test_accur =  0.40812817845584837
Epoch :  4 training_loss =  2.016173813268807 train_accur =  0.53125 test_accur =  0.49195272769301895
Epoch :  5 training_loss =  1.9342083408216966 train_accur =  0.60546875 test_accur =  0.5476623901987979
Epoch :  6 training_loss =  1.854205862288385 train_accur =  0.6393229166666666 test_accur =  0.5906004392048081
Epoch :  7 training_loss =  1.7769694915441547 train_accur =  0.6744791666666666 test_accur =  0.622298312528895
Epoch :  8 training_loss =  1.6987012433166848 train_accur =  0.7057291666666666 test_accur =  0.6495607951918632
Epoch :  9 training_loss =  1.619306601367331 train_accur =  0.73828125 test_accur =  0.6697439898289412
Ep

Epoch :  75 training_loss =  0.14052604653412532 train_accur =  0.9778645833333334 test_accur =  0.8300248497457235
Epoch :  76 training_loss =  0.13806456063736688 train_accur =  0.9778645833333334 test_accur =  0.8307761211280629
Epoch :  77 training_loss =  0.13551097462930614 train_accur =  0.9778645833333334 test_accur =  0.8309494914470643
Epoch :  78 training_loss =  0.13315077611192444 train_accur =  0.9791666666666666 test_accur =  0.8310795191863153
Epoch :  79 training_loss =  0.13126328770088413 train_accur =  0.9791666666666666 test_accur =  0.8309350439204808
Epoch :  80 training_loss =  0.12860784638982226 train_accur =  0.98046875 test_accur =  0.831166204345816
Epoch :  81 training_loss =  0.12628020693888653 train_accur =  0.9791666666666666 test_accur =  0.8313684697179843
Epoch :  82 training_loss =  0.12382991301142762 train_accur =  0.9791666666666666 test_accur =  0.831585182616736
Epoch :  83 training_loss =  0.12152788781731998 train_accur =  0.98046875 test_ac

Epoch :  147 training_loss =  0.05325343341498378 train_accur =  0.99609375 test_accur =  0.8391990291262136
Epoch :  148 training_loss =  0.05278102283838846 train_accur =  0.99609375 test_accur =  0.8393001618122977
Epoch :  149 training_loss =  0.052179819037454944 train_accur =  0.99609375 test_accur =  0.8393868469717984
Epoch :  150 training_loss =  0.05165711382397484 train_accur =  0.99609375 test_accur =  0.8394157420249653
Epoch :  151 training_loss =  0.05111789547510048 train_accur =  0.99609375 test_accur =  0.8395891123439667
Epoch :  152 training_loss =  0.0505426041581807 train_accur =  0.99609375 test_accur =  0.8398347202958854
Epoch :  153 training_loss =  0.05001743670923284 train_accur =  0.99609375 test_accur =  0.8399069579288025
Epoch :  154 training_loss =  0.04948942569624103 train_accur =  0.99609375 test_accur =  0.8400947757743874
Epoch :  155 training_loss =  0.04900691208769219 train_accur =  0.9973958333333334 test_accur =  0.8401814609338881
Epoch :  15

Epoch :  218 training_loss =  0.02979762031087293 train_accur =  0.9986979166666666 test_accur =  0.8416406611188164
Epoch :  219 training_loss =  0.02955447042183212 train_accur =  0.9986979166666666 test_accur =  0.841626213592233
Epoch :  220 training_loss =  0.02935792185220666 train_accur =  0.9986979166666666 test_accur =  0.8416984512251503
Epoch :  221 training_loss =  0.029158300433751904 train_accur =  0.9986979166666666 test_accur =  0.8417562413314841
Epoch :  222 training_loss =  0.028971198152364055 train_accur =  0.9986979166666666 test_accur =  0.8418284789644013
Epoch :  223 training_loss =  0.028787913565808287 train_accur =  0.9986979166666666 test_accur =  0.8419296116504854
Epoch :  224 training_loss =  0.028609662604047188 train_accur =  0.9986979166666666 test_accur =  0.8420162968099861
Epoch :  225 training_loss =  0.028434611386534657 train_accur =  0.9986979166666666 test_accur =  0.8420596393897365
Epoch :  226 training_loss =  0.028260757772075706 train_acc

Epoch :  291 training_loss =  0.019783760318498503 train_accur =  1.0 test_accur =  0.8435766296809986
Epoch :  292 training_loss =  0.01968950687403788 train_accur =  1.0 test_accur =  0.8435188395746648
Epoch :  293 training_loss =  0.019596555686779968 train_accur =  1.0 test_accur =  0.8435188395746648
Epoch :  294 training_loss =  0.019504651620923172 train_accur =  1.0 test_accur =  0.8435332871012483
Epoch :  295 training_loss =  0.019413551296870585 train_accur =  1.0 test_accur =  0.8435621821544151
Epoch :  296 training_loss =  0.019322653142479773 train_accur =  1.0 test_accur =  0.8435766296809986
Epoch :  297 training_loss =  0.019235269243582123 train_accur =  1.0 test_accur =  0.8436488673139159
Epoch :  298 training_loss =  0.01914673942449936 train_accur =  1.0 test_accur =  0.8437066574202496
Epoch :  299 training_loss =  0.019058650610262608 train_accur =  1.0 test_accur =  0.8437211049468331
Epoch :  300 training_loss =  0.018961189550055935 train_accur =  1.0 test_

Epoch :  371 training_loss =  0.013984622593213453 train_accur =  1.0 test_accur =  0.845295885344429
Epoch :  372 training_loss =  0.013937064696265518 train_accur =  1.0 test_accur =  0.8452814378178456
Epoch :  373 training_loss =  0.013884789028639159 train_accur =  1.0 test_accur =  0.8453536754507628
Epoch :  374 training_loss =  0.013834327111385455 train_accur =  1.0 test_accur =  0.8454259130836801
Epoch :  375 training_loss =  0.013784645607644887 train_accur =  1.0 test_accur =  0.8454548081368469
Epoch :  376 training_loss =  0.013735462245242535 train_accur =  1.0 test_accur =  0.8454403606102635
Epoch :  377 training_loss =  0.013686579926143822 train_accur =  1.0 test_accur =  0.8454403606102635
Epoch :  378 training_loss =  0.013637952729568832 train_accur =  1.0 test_accur =  0.8454403606102635
Epoch :  379 training_loss =  0.01358954028170153 train_accur =  1.0 test_accur =  0.8454692556634305
Epoch :  380 training_loss =  0.013541275420408902 train_accur =  1.0 test_

Epoch :  451 training_loss =  0.010655319342723835 train_accur =  1.0 test_accur =  0.8472029588534443
Epoch :  452 training_loss =  0.01062187993926046 train_accur =  1.0 test_accur =  0.8471740638002774
Epoch :  453 training_loss =  0.010589716975453713 train_accur =  1.0 test_accur =  0.8471162736939436
Epoch :  454 training_loss =  0.010558085010833234 train_accur =  1.0 test_accur =  0.8470873786407767
Epoch :  455 training_loss =  0.01052690326464513 train_accur =  1.0 test_accur =  0.8470440360610264
Epoch :  456 training_loss =  0.010496477827858562 train_accur =  1.0 test_accur =  0.8469862459546925
Epoch :  457 training_loss =  0.010466608538689571 train_accur =  1.0 test_accur =  0.8469862459546925
Epoch :  458 training_loss =  0.01043682500945311 train_accur =  1.0 test_accur =  0.8469862459546925
Epoch :  459 training_loss =  0.01040708444820234 train_accur =  1.0 test_accur =  0.8469429033749422
Epoch :  460 training_loss =  0.010377367782372202 train_accur =  1.0 test_ac

In [13]:
# parameters = {"W1": np.random.randn(400, 784) * np.sqrt(1. / 784),
#               "b1": np.zeros((400, 1)) * np.sqrt(1. / 784),
#               "W2": np.random.randn(400, 400) * np.sqrt(1. / 400),
#               "b2": np.zeros((400, 1)) * np.sqrt(1. / 400),
#               "W3": np.random.randn(digits, 400) * np.sqrt(1. / 400),
#               "b3": np.zeros((digits, 1)) * np.sqrt(1. / 400)}


In [14]:
# def forward(X, parameters):
#     inoutput = {}
#     inoutput["hiddenlayer1_output_temp"] = np.matmul(parameters["W1"], X) + parameters["b1"]
#     inoutput["hiddenlayer1_output"] = sigmoid(inoutput["hiddenlayer1_output_temp"])
    
#     inoutput["hiddenlayer2_output_temp"] = np.matmul(parameters["W2"], inoutput["hiddenlayer1_output"]) + parameters["b2"]
#     inoutput["hiddenlayer2_output"] = sigmoid(inoutput["hiddenlayer2_output_temp"])
    
#     inoutput["outputlayer_output_temp"] = np.matmul(parameters["W3"], inoutput["hiddenlayer2_output"]) + parameters["b3"]
#     inoutput["outputlayer_output"] = softmax(inoutput["outputlayer_output_temp"])
#     #inoutput["outputlayer_output"] = y_hat

#     return inoutput

In [15]:
# def back_propagation(X, Y, parameters, inoutput, m_batch):
#     CE_gradient = inoutput["outputlayer_output"] - Y

#     W3_gradient = (1. / m_batch) * np.matmul(CE_gradient, inoutput["hiddenlayer2_output"].T)
#     b3_gradient = (1. / m_batch) * np.sum(CE_gradient, axis=1, keepdims=True)

#     # ---

#     outputlayer_backward_output = np.matmul(parameters["W3"].T, CE_gradient)
#     hiddenlayer2_backward_input = outputlayer_backward_output * sigmoid_gradient(inoutput["hiddenlayer2_output_temp"])

#     W2_gradient = (1. / m_batch) * np.matmul(hiddenlayer2_backward_input, inoutput["hiddenlayer1_output"].T)
#     b2_gradient = (1. / m_batch) * np.sum(hiddenlayer2_backward_input, axis=1, keepdims=True)

#     # ---

#     hiddenlayer2_backward_output = np.matmul(parameters["W2"].T, hiddenlayer2_backward_input)
#     hiddenlayer1_backward_input = hiddenlayer2_backward_output * sigmoid_gradient(inoutput["hiddenlayer1_output_temp"])

#     W1_gradient = (1. / m_batch) * np.matmul(hiddenlayer1_backward_input, X.T)
#     b1_gradient = (1. / m_batch) * np.sum(hiddenlayer1_backward_input, axis=1, keepdims=True)

#     Wb_gradients = {"W1_gradient": W1_gradient, "b1_gradient": b1_gradient, "W2_gradient": W2_gradient,
#                      "b2_gradient": b2_gradient, "W3_gradient": W3_gradient, "b3_gradient": b3_gradient}

#     return Wb_gradients

In [16]:
# if __name__ == "__main__":
#     epoch = 3
#     batch_size = 64
#     TrainError = []
#     TestError = []
#     for i in range(epoch):
       
#         # shuffle training set
# #         permutation = np.random.permutation(X_train.shape[1])
# #         X_train_shuffled = X_train[:, permutation]
# #         Y_train_shuffled = Y_train[:, permutation]
    
#         batch_num = len(X_train) // batch_size
#         predicts = []
#         golds = []
#         predicts_test = []
#         golds_test = []
#         learning_rate = 0.03
        
        
#         for j in range(batch_num):
#             begin = j * batch_size
#             end = min(begin + batch_size, X_train.shape[1] - 1)
#             X = X_train[:, begin:end]
#             Y = Y_train[:, begin:end]
#             m_batch = end - begin
            
#             inoutput = forward(X, parameters)
#             Wb_gradients = back_propagation(X, Y, parameters, inoutput, m_batch)
            
#             W1_gradient = Wb_gradients["W1_gradient"]
#             b1_gradient = Wb_gradients["b1_gradient"]
#             W2_gradient = Wb_gradients["W2_gradient"]
#             b2_gradient = Wb_gradients["b2_gradient"]
#             W3_gradient = Wb_gradients["W3_gradient"]
#             b3_gradient = Wb_gradients["b3_gradient"]
            
#             parameters["W1"] = parameters["W1"] - learning_rate * W1_gradient
#             parameters["b1"] = parameters["b1"] - learning_rate * b1_gradient
#             parameters["W2"] = parameters["W2"] - learning_rate * W2_gradient
#             parameters["b2"] = parameters["b2"] - learning_rate * b2_gradient
#             parameters["W3"] = parameters["W3"] - learning_rate * W3_gradient
#             parameters["b3"] = parameters["b3"] - learning_rate * b3_gradient
            
            
          
#         inoutput = forward(X_train, parameters)
#         #print(Y.shape)
        
#         #print(inoutput["outputlayer_output"].shape)
#         train_loss = cross_entropy(Y_train, inoutput["outputlayer_output"])

#         predicts += np.argmax(inoutput["outputlayer_output"], axis=0).tolist()
#         golds += np.argmax(Y_train, axis=0).tolist()
        
#         inoutput = forward(X_test, parameters)
#         #print(Y.shape)
        
#         #print(inoutput["outputlayer_output"].shape)
#         test_loss = cross_entropy(Y_test, inoutput["outputlayer_output"])

#         predicts_test += np.argmax(inoutput["outputlayer_output"], axis=0).tolist()
#         golds_test += np.argmax(Y_test, axis=0).tolist()
        
#         print("Epoch {}: training loss = {},  test loss = {}, Train_accur = {},Test_accur = {}".format(
#             i + 1, train_loss, test_loss, evaluation(predicts, golds), evaluation(predicts_test, golds_test)))

#         TrainError.append(1 - evaluation(predicts, golds))
#         TestError.append(1 - evaluation(predicts_test, golds_test))
        
        
# #         new_x_axis = np.arange(0,500, 5)
# #         fig, ax = plt.subplots(1, 1)
# #         print(TrainError.shape)
# #         print(new_x_axis.shape)
# #         ax.plot(new_x_axis, TrainError)
              
        
# #     with open("Train_error_rate.json", mode="w") as stream:
# #         json.dump(TrainError, stream)

# #     with open("Test_error_rate.json", mode="w") as stream:
# #         json.dump(TestError, stream)