In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import scipy as sp
import copy
import pickle


from keras.datasets import mnist
# Load MNIST via Keras as two (images, labels) pairs: one for training, one for testing.
(train_data, train_labels), (test_data, test_labels) = mnist.load_data()



In [2]:
def hot_encoding(array):
    """One-hot encode a 1-D sequence of integer labels.

    The classes are the consecutive integers from ``min(array)`` to
    ``max(array)``; row ``k`` of the result corresponds to label
    ``min(array) + k``.

    Parameters
    ----------
    array : array-like of int, shape (n_samples,)
        Integer class labels. Both NumPy arrays and plain Python sequences
        are accepted.

    Returns
    -------
    numpy.ndarray of int, shape (n_classes, n_samples)
        Column ``i`` contains a single 1 in the row matching ``array[i]``.

    Raises
    ------
    ValueError
        If ``array`` is empty.
    """
    labels = np.asarray(array)
    if labels.size == 0:
        raise ValueError("hot_encoding() requires at least one label")

    # Cast the bounds to Python ints so narrow dtypes (e.g. uint8) cannot
    # overflow when 1 is added to the maximum.
    lowest, highest = int(labels.min()), int(labels.max())
    classes = np.arange(lowest, highest + 1)

    # Broadcast (n_classes, 1) against (1, n_samples) instead of looping over
    # samples in Python.
    return (classes[:, None] == labels[None, :]).astype(int)

def fake_quantize(tensor, bit_width, scale=None, min_val=None, max_val=None):
    """Quantize ``tensor`` to ``bit_width`` bits and immediately dequantize it.

    Values are mapped onto the integer grid ``[0, 2**bit_width - 1]`` with an
    affine transform, rounded, clipped, and mapped back to floating point, so
    the result carries the quantization error while staying a float array.
    Note that the zero point is kept as a float (it is not rounded).

    Parameters
    ----------
    tensor : array-like
        Values to quantize.
    bit_width : int
        Number of bits of the simulated integer representation (>= 1).
    scale : float, optional
        Quantization step. Derived from ``(max_val - min_val)`` when omitted.
    min_val, max_val : float, optional
        Range to quantize over. Default to the min / max of ``tensor``.

    Returns
    -------
    dequantized_tensor : numpy.ndarray
        Fake-quantized copy of ``tensor``.
    scale : float
        The quantization step that was used. For a degenerate (zero) step the
        tensor is returned unchanged together with a neutral step of ``1.0``.

    Raises
    ------
    ValueError
        If ``bit_width`` is smaller than 1.
    """
    if bit_width < 1:
        raise ValueError(f"bit_width must be >= 1, got {bit_width}")

    if min_val is None:
        min_val = np.min(tensor)
    if max_val is None:
        max_val = np.max(tensor)

    # Integer range of the simulated quantized representation.
    qmin = 0
    qmax = 2**bit_width - 1

    if scale is None:
        scale = (max_val - min_val) / (qmax - qmin)

    # A zero step (e.g. a constant tensor such as all-zero biases) would turn
    # the zero point into 0/0 and poison the result with NaN. Such a tensor
    # needs no quantization, so hand back an unchanged float copy.
    if scale == 0:
        return np.asarray(tensor) * 1.0, 1.0

    zero_point = qmin - min_val / scale

    # Quantize: project onto the integer grid and saturate at its ends.
    quantized_tensor = np.round(tensor / scale + zero_point)
    quantized_tensor = np.clip(quantized_tensor, qmin, qmax)

    # Dequantize back to the floating-point representation.
    dequantized_tensor = (quantized_tensor - zero_point) * scale

    return dequantized_tensor, scale

In [3]:
# Flatten every example into one column: (n_samples, ...) -> (n_features, n_samples).
# Passing -1 lets reshape collapse all remaining dimensions into a single axis.
train_x_flatten = train_data.reshape(len(train_data), -1).T
test_x_flatten = test_data.reshape(len(test_data), -1).T

# Rescale the features by 1/255 so they lie between 0 and 1.
train_x = train_x_flatten / 255.0
test_x = test_x_flatten / 255.0

# One-hot encode the labels; each column is the target vector of one sample.
train_y = hot_encoding(train_labels.squeeze())
test_y = hot_encoding(test_labels.squeeze())

(784, 60000)
(10, 60000)
(784, 10000)
(10, 10000)


In [4]:
def sigmoid(x):
  """Hidden-layer activation.

  Despite its name, this applies the hyperbolic tangent (output in [-1, 1]),
  which is what ``sigmoid_derivative`` in this file differentiates.
  """
  activated = np.tanh(x)
  return activated

def softmax(Z):
    """Column-wise softmax.

    Each column of ``Z`` is normalised independently (reductions run along
    axis 0), so every column of the result sums to 1.

    Parameters
    ----------
    Z : numpy.ndarray
        Pre-activations, one sample per column.

    Returns
    -------
    numpy.ndarray
        Probabilities with the same shape as ``Z``.
    """
    # Shift every column by its OWN maximum. Subtracting a single global
    # maximum lets columns far below it underflow to exp(.) == 0 everywhere,
    # which turns the normalisation into 0/0 = NaN.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    A = exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)
    assert (np.shape(A) == np.shape(Z))
    return A

def sigmoid_derivative(x):
  """Return ``1 - x**2``.

  This is the derivative of tanh expressed through its output, so ``x`` is
  expected to be an activation value rather than a pre-activation.
  """
  squared = x**2
  return 1 - squared

def mse(x, y):
  """Sum of squared differences between ``x`` and ``y``, divided by the
  number of columns of ``x`` (one column per sample)."""
  residual = x - y
  n_columns = x.shape[1]
  return np.sum(residual**2) / n_columns

def dot_sigmoid(W, b, h_prev):
  """Hidden-layer step: affine map ``W @ h_prev + b`` followed by ``sigmoid``."""
  pre_activation = np.dot(W, h_prev) + b
  return sigmoid(pre_activation)

def dot_softmax(W, b, h_prev):
  """Output-layer step: affine map ``W @ h_prev + b`` followed by ``softmax``."""
  pre_activation = np.dot(W, h_prev) + b
  return softmax(pre_activation)

def cost_bincentropy(h, y):
  """Binary cross-entropy between predictions ``h`` and targets ``y``.

  Computes ``-sum(y*log(h) + (1-y)*log(1-h))`` divided by the number of
  columns of ``h`` (one column per sample).

  Parameters
  ----------
  h : numpy.ndarray
      Predicted probabilities, shape (n_outputs, n_samples).
  y : numpy.ndarray
      Targets with the same shape as ``h``.

  Returns
  -------
  float
      The averaged cross-entropy cost.
  """
  # Keep probabilities strictly inside (0, 1) so neither log() sees 0.
  eps = 1e-12
  h_safe = np.clip(h, eps, 1 - eps)
  # The negative-class term is log(1 - h); the previous 1 - log(h) was wrong.
  per_entry = y * np.log(h_safe) + (1 - y) * np.log(1 - h_safe)
  return -np.sum(per_entry) / h.shape[1]

def predict(p):
  """Round every entry of ``p`` to the nearest integer value (NumPy rounding)."""
  rounded = np.round(p, decimals=0)
  return rounded

def cal_accuracy(pred, truth):
  """Percentage of columns in which ``pred`` matches ``truth`` exactly.

  A column counts as correct when the sum of absolute element-wise
  differences over that column is zero.
  """
  n_samples = pred.shape[1]
  # One absolute-error total per column, computed in a single reduction.
  column_error = np.sum(np.abs(pred - truth), axis=0)
  n_correct = int(np.count_nonzero(column_error == 0))
  return n_correct / n_samples * 100


def one_hot_max(prediction_probs):
    """Turn each column of scores into a one-hot vector marking its arg-max.

    Returns an array with the same shape and dtype as ``prediction_probs``
    holding a 1 at the row of each column's maximum and 0 elsewhere.
    """
    n_samples = prediction_probs.shape[1]
    winning_rows = np.argmax(prediction_probs, axis=0)

    encoded = np.zeros_like(prediction_probs)
    # Pair every column index with the row of its maximum.
    encoded[winning_rows, np.arange(n_samples)] = 1
    return encoded

In [5]:
def initialize_param_he(Layers):
  """Create the initial weights and biases for a fully connected network.

  ``Layers`` lists the layer widths from input to output. For each pair of
  consecutive layers ``i`` (1-based) the returned dict holds:

  * ``"F<i>"`` - weights of shape (fan_out, fan_in), drawn from a zero-mean
    normal distribution with std ``sqrt(2 / (fan_in + fan_out))``. Note that
    the std uses the sum of both widths, not fan_in alone.
  * ``"f<i>"`` - a zero bias column of shape (fan_out, 1).
  """
  param = {}

  for idx, (fan_in, fan_out) in enumerate(zip(Layers[:-1], Layers[1:]), start=1):
    std = np.sqrt(2.0 / (fan_out + fan_in))
    param[f"F{idx}"] = np.random.normal(0, std, (fan_out, fan_in))
    param[f"f{idx}"] = np.zeros((fan_out, 1))

  return param


In [6]:
def bp_grads_final(hp, hp_cap, h):
  """Gradients for the output layer.

  ``hp`` are the network predictions, ``hp_cap`` the target labels and ``h``
  the activations feeding the output layer (one sample per column).

  Returns ``(grad_F, grad_f, grad_Z)``: the weight gradient, the bias
  gradient (both averaged over the columns of ``h``) and the raw output
  error ``hp - hp_cap``.
  """
  batch_size = h.shape[1]
  output_error = hp - hp_cap                      # predictions - truth

  weight_grad = np.dot(output_error, h.T) / batch_size
  bias_grad = np.sum(output_error, axis=1, keepdims=True) / batch_size

  return weight_grad, bias_grad, output_error


def bp_grads(W, hp, h, grad_Zpp, bit_width):
  """Gradients for a hidden layer, with fake-quantized intermediate errors.

  ``W`` is the weight matrix the error is propagated back through, ``hp``
  the activations of the present layer, ``h`` the activations of the
  previous layer and ``grad_Zpp`` the error of the following layer.

  Returns ``(grad_F, grad_f, grad_Zp)``. The weight and bias gradients are
  built from the quantized error and averaged over the columns of ``h``;
  the returned ``grad_Zp`` is the error before that final quantization.
  """
  batch_size = h.shape[1]

  # Push the error back through W and quantize right after the dot product.
  backprop_error, _ = fake_quantize(np.dot(W.T, grad_Zpp), bit_width)
  grad_Zp = backprop_error * sigmoid_derivative(hp)

  # The parameter gradients use the quantized error ...
  quantized_error, _ = fake_quantize(grad_Zp, bit_width)
  grad_F = np.dot(quantized_error, h.T) / batch_size
  grad_f = np.sum(quantized_error, axis=1, keepdims=True) / batch_size

  # ... while the caller receives the unquantized one.
  return grad_F, grad_f, grad_Zp

In [7]:

def model_forward(PARAM, input_data):
  """Full-precision forward pass.

  ``PARAM`` holds one ``"F<i>"`` / ``"f<i>"`` pair per layer. Every layer but
  the last goes through ``dot_sigmoid``; the last one through ``dot_softmax``.
  Returns the output of the final layer.
  """
  n_layers = len(PARAM) // 2
  h = input_data

  for idx in range(1, n_layers + 1):
    layer_fn = dot_softmax if idx == n_layers else dot_sigmoid
    h = layer_fn(PARAM[f"F{idx}"], PARAM[f"f{idx}"], h)

  return h


def q_model_forward(PARAM, input_data, bit_width):
  """Forward pass with fake-quantized inputs, parameters and activations.

  Mirrors the forward pass used inside ``back_propagation``: the input, each
  layer's weights and biases, and each layer's output are passed through
  ``fake_quantize`` at ``bit_width`` bits. Every layer but the last uses
  ``dot_sigmoid``; the last one uses ``dot_softmax``.

  Parameters
  ----------
  PARAM : dict
      One ``"F<i>"`` (weights) / ``"f<i>"`` (bias) pair per layer.
  input_data : numpy.ndarray
      Network input, one sample per column.
  bit_width : int
      Bit width handed to ``fake_quantize``.

  Returns
  -------
  numpy.ndarray
      Fake-quantized output of the final layer.
  """
  n_layers = len(PARAM) // 2
  h, _ = fake_quantize(input_data, bit_width)

  for idx in range(1, n_layers + 1):
    q_F, scale_F = fake_quantize(PARAM["F"+str(idx)], bit_width)
    # Quantize the bias on the weight grid, exactly as the training loop does.
    # Using the bias' own range here would evaluate a different network than
    # the one that was trained, and yields NaN for a constant (e.g. all-zero)
    # bias vector.
    q_f, _ = fake_quantize(PARAM["f"+str(idx)], bit_width, scale=scale_F)

    layer_fn = dot_sigmoid if idx < n_layers else dot_softmax
    h = layer_fn(q_F, q_f, h)
    h, _ = fake_quantize(h, bit_width)

  return h

In [8]:
def back_propagation(input_data_main, target_output_main, hidden_layers, epochs, learning_rate, batches, bit_width):
    """Train a fully connected network with fake-quantized mini-batch gradient descent.

    In every mini-batch the inputs, weights, biases, activations and
    back-propagated errors are passed through ``fake_quantize`` at
    ``bit_width`` bits, while the updates are applied to the full-precision
    parameters. After each epoch the quantized model is scored on the whole
    training set and on the first 4000 columns of the module-level
    ``test_x`` / ``test_y`` arrays (used as a validation split); the
    parameters with the best validation accuracy so far are kept.

    Two figures are shown at the end: the per-mini-batch cost, and the
    per-epoch validation / training accuracy.

    Parameters
    ----------
    input_data_main : numpy.ndarray, shape (n_features, n_samples)
        Training inputs, one sample per column.
    target_output_main : numpy.ndarray, shape (n_outputs, n_samples)
        Training targets, one sample per column.
    hidden_layers : sequence of int
        Widths of the hidden layers.
    epochs : int
        Number of passes over the (re-shuffled) training data.
    learning_rate : float
        Step size of the parameter update.
    batches : int
        NUMBER of mini-batches per epoch (not the batch size). Each batch has
        ``n_samples // batches`` columns; leftover samples are not used in
        that epoch.
    bit_width : int
        Bit width handed to ``fake_quantize``.

    Returns
    -------
    dict
        Full-precision parameters (``"F<i>"`` / ``"f<i>"``) of the epoch with
        the best validation accuracy, or the initial parameters when
        ``epochs`` is 0.

    Raises
    ------
    ValueError
        If ``batches`` is not between 1 and the number of training samples.
    """
    Ns = target_output_main.shape[1]
    if not 1 <= batches <= Ns:
        # More batches than samples would produce empty batches and divide
        # the gradients by zero.
        raise ValueError(
            f"batches must be between 1 and the number of samples ({Ns}), got {batches}"
        )
    per_batch = Ns // batches

    # Network layout: input width, hidden widths, output width.
    n0 = input_data_main.shape[0]
    n_L = target_output_main.shape[0]
    L = len(hidden_layers) + 2   # hidden layers plus input and output layers
    parameters = initialize_param_he([n0] + list(hidden_layers) + [n_L])

    # Start from the initial weights so a zero-epoch run still returns a
    # model instead of failing on an unassigned variable.
    parameters_final = copy.deepcopy(parameters)
    accu_old = -1

    # Histories are collected in lists (cheap appends) and plotted at the end.
    cost_for = []
    train_accu = []          # validation accuracy per epoch
    train_accu_train = []    # training accuracy per epoch

    q_param = {}
    activations = {}
    q_activations = {}
    grad = {}

    for epoch in range(epochs):

        # Draw a fresh permutation of the samples for this epoch.
        random_new_indices = random.sample(range(0, Ns), Ns)
        input_data_ = input_data_main[:, random_new_indices]
        target_output_ = target_output_main[:, random_new_indices]

        for i in range(batches):
            batch_cols = slice(i * per_batch, (i + 1) * per_batch)
            input_data = input_data_[:, batch_cols]
            target_output = target_output_[:, batch_cols]

            # Quantize the input.
            activations["h0"] = input_data
            q_activations["h0"], _ = fake_quantize(input_data, bit_width)

            # Quantize weights, then biases on the same step as their weights.
            for loop in range(1, L):
                q_param["F"+str(loop)], scale_F = fake_quantize(parameters["F"+str(loop)], bit_width)
                q_param["f"+str(loop)], _ = fake_quantize(parameters["f"+str(loop)], bit_width, scale=scale_F)

            # Forward pass: hidden layers use dot_sigmoid, the last dot_softmax.
            for loop in range(1, L):
                layer_fn = dot_sigmoid if loop < L-1 else dot_softmax
                activations["h"+str(loop)] = layer_fn(q_param["F"+str(loop)],
                                                      q_param["f"+str(loop)],
                                                      q_activations["h"+str(loop-1)])
                q_activations["h"+str(loop)], _ = fake_quantize(activations["h"+str(loop)], bit_width)

            # Cost is tracked on the unquantized network output.
            cost_for.append(mse(activations["h"+str(L-1)], target_output))

            # Backward pass, from the output layer down to the first layer.
            for loop in reversed(range(1, L)):
                if loop == L-1:
                    grad["F"+str(loop)], grad["f"+str(loop)], grad_Z = bp_grads_final(
                        q_activations["h"+str(loop)], target_output, q_activations["h"+str(loop-1)])
                else:
                    grad["F"+str(loop)], grad["f"+str(loop)], grad_Z = bp_grads(
                        q_param["F"+str(loop+1)], q_activations["h"+str(loop)],
                        q_activations["h"+str(loop-1)], q_grad_Z, bit_width)
                # The error handed to the next (lower) layer is quantized too.
                q_grad_Z, _ = fake_quantize(grad_Z, bit_width)

            # Update the full-precision parameters.
            for loop in range(1, L):
                parameters["F"+str(loop)] -= grad["F"+str(loop)] * learning_rate
                parameters["f"+str(loop)] -= grad["f"+str(loop)] * learning_rate

        # Training accuracy of the quantized model on the full training set.
        final_layer = q_model_forward(parameters, input_data_main, bit_width)
        prediction = one_hot_max(final_layer)
        accu_new_train = cal_accuracy(prediction, target_output_main)
        train_accu_train.append(accu_new_train)

        # Validation accuracy on the first 4000 samples of the global test set.
        final_layer = q_model_forward(parameters, test_x[:, 0:4000], bit_width)
        prediction = one_hot_max(final_layer)
        accu_new = cal_accuracy(prediction, test_y[:, 0:4000])
        train_accu.append(accu_new)

        # Keep (and report) the parameters whenever validation does not get worse.
        if accu_new >= accu_old or accu_old == -1:
            parameters_final = copy.deepcopy(parameters)
            accu_old = accu_new

            print(f'Iteration {epoch} ---------')
            print(f'Training Accuracy: {accu_new_train}')
            print(f'Validation Accuracy: {accu_old}')

    plt.plot(cost_for)
    plt.xlabel("Mini-batch update")
    plt.ylabel("MSE cost")
    plt.show()

    plt.plot(train_accu, label="Validation")
    plt.plot(train_accu_train, label="Training")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy (%)")
    plt.legend()
    plt.show()

    return parameters_final

In [None]:
# Train the quantized network, then score the selected parameters on the test set.

# Hyper-parameters
learning_rate = 0.1          # step size of the parameter update
batch = 128                  # passed as `batches`: number of mini-batches per epoch
bit_width = 4                # bit width used for fake quantization
epochs = 100                 # passes over the training data
hidden_layers = [128, 64]    # widths of the hidden layers

parameters_final = back_propagation(
    input_data_main=train_x,
    target_output_main=train_y,
    hidden_layers=hidden_layers,
    epochs=epochs,
    learning_rate=learning_rate,
    batches=batch,
    bit_width=bit_width,
)

# Evaluate on the full test set. Note that its first 4000 samples were already
# used for model selection inside back_propagation.
input_test_data = test_x
final_layer_output = q_model_forward(parameters_final, input_test_data, bit_width)
prediction = one_hot_max(final_layer_output)
print("Test Accuracy:")
print(cal_accuracy(prediction, test_y))




