In [1]:
import numpy as np
import pandas as pd
import math

In [29]:
class network:
  """Fully connected feed-forward classifier with a softmax output layer.

  Samples are stored column-wise: a batch ``X`` has shape ``(il, n_samples)``
  and the labels ``Y`` are a 1-D array of integer class indices. The
  output-layer error used by ``back_prop`` is ``softmax - one_hot``, i.e. the
  gradient of the mean cross-entropy loss.

  Constructor arguments
    il         : no. of neurons in the input layer
    ol         : no. of neurons in the output layer (= no. of classes)
    hls        : hidden layer sizes, e.g. [128, 64, 32] (may be empty)
    activation : "sigmoid", "tanh" or "relu" (ValueError otherwise)
    optimizer  : "sgd", "momentum", "nesterov", "rmsprop", "adam", "nadam";
                 any other value falls back to plain gradient descent
    epochs     : no. of passes over the training data
    eta        : learning rate
    batch_size : mini-batch size
    w_decay    : L2 penalty coefficient, applied to weights and biases
    gamma      : momentum coefficient for momentum / nesterov
    beta1      : first-moment decay for adam / nadam
    beta2      : second-moment decay for adam / nadam / rmsprop
    eps        : small constant added under the square root of the
                 second-moment estimate to avoid division by zero
  """

  _ACTIVATIONS = ("sigmoid", "tanh", "relu")

  def __init__(self, il, ol, hls, activation, optimizer, epochs, eta, batch_size, w_decay = 0, gamma = 0.9, beta1=0.9, beta2 = 0.999, eps = 1e-8):

    # Fail fast instead of printing a message and crashing later in forward_prop.
    if activation not in self._ACTIVATIONS:
      raise ValueError(f"unknown activation {activation!r}; expected one of {self._ACTIVATIONS}")

    self.il_nodes = il
    self.ol_nodes = ol
    self.h_layers = len(hls)

    self.act = activation
    self.opt = optimizer
    self.epochs = epochs
    self.eta = eta
    self.w_decay = w_decay
    self.batch_size = batch_size

    # optimizer hyper-parameters
    self.gamma = gamma
    self.beta1 = beta1
    self.beta2 = beta2
    self.eps = eps

    # no. of update_para calls so far; drives the adam / nadam bias correction
    self.t = 0

    # A private generator seeded with 10 yields the same numbers as
    # np.random.seed(10) followed by np.random.rand, without resetting the
    # process-wide NumPy random state as a side effect.
    rng = np.random.RandomState(10)

    # w[i] has shape (fan_out, fan_in), b[i] has shape (fan_out, 1);
    # both are drawn uniformly from [-0.5, 0.5).
    layer_sizes = [il] + list(hls) + [ol]
    self.w = []
    self.b = []
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
      self.w.append( rng.rand(fan_out, fan_in) - 0.5 )
      self.b.append( rng.rand(fan_out, 1) - 0.5 )

    # momentum / nesterov: previous update
    self.updateW = [np.zeros_like(w) for w in self.w]
    self.updateB = [np.zeros_like(b) for b in self.b]

    # rmsprop / adam / nadam: running mean of squared gradients
    self.vW = [np.zeros_like(w) for w in self.w]
    self.vB = [np.zeros_like(b) for b in self.b]

    # adam / nadam: running mean of gradients
    self.mW = [np.zeros_like(w) for w in self.w]
    self.mB = [np.zeros_like(b) for b in self.b]

  def network_shape(self):
    """Print the number of nodes in every layer, input to output."""

    print('No. of nodes at each layer')
    print(f'{self.il_nodes}, ',end='')
    for i in range(self.h_layers):
      print(f'{self.w[i].shape[0]}, ',end='')
    print(f'{self.ol_nodes}')

  def weights_shape(self):
    """Print the shape of every weight matrix, first layer first."""

    print('Weight matrices shapes are')
    for w in self.w:
      print(f'{w.shape}')

  def biases_shape(self):
    """Print the shape of every bias vector, first layer first."""

    print('Bias vectors shapes are')
    for b in self.b:
      print(f'{b.shape}')

  def vSigmoid(self, v):
    """Element-wise logistic sigmoid.

    Uses the identity sigmoid(v) = (1 + tanh(v/2)) / 2, which cannot overflow.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(v, dtype=float)))

  def vSigmoid_deriv(self, v):
    """Element-wise derivative of the sigmoid, s * (1 - s).

    The exp(-v) / (1 + exp(-v))**2 form evaluates to inf/inf = NaN for large
    negative v.
    """
    s = self.vSigmoid(v)
    return s * (1.0 - s)

  def vTanh(self, v):
    """Element-wise tanh (np.tanh saturates to +-1 instead of producing NaN)."""
    return np.tanh(v)

  def vTanh_deriv(self, v):
    """Element-wise derivative of tanh, 1 - tanh(v)**2."""
    return 1.0 - np.tanh(v)**2

  def vRelu(self, v):
    """Element-wise max(v, 0)."""
    return np.maximum(v, 0)

  def vRelu_deriv(self, v):
    """Element-wise derivative of ReLU: 1.0 where v > 0, else 0.0."""
    return np.greater(v, 0).astype(float)

  def vSoftmax(self, v):
    """Softmax over axis 0 (one column per sample).

    The column maximum is subtracted first; this leaves the result unchanged
    mathematically and keeps exp() from overflowing.
    """
    v = np.asarray(v, dtype=float)
    e = np.exp(v - np.max(v, axis=0, keepdims=True))
    return e / np.sum(e, axis=0, keepdims=True)

  def _activation_pair(self):
    """Return (activation, derivative) for the configured hidden activation."""
    table = {
      "sigmoid": (self.vSigmoid, self.vSigmoid_deriv),
      "tanh": (self.vTanh, self.vTanh_deriv),
      "relu": (self.vRelu, self.vRelu_deriv),
    }
    if self.act not in table:
      raise ValueError(f"unknown activation {self.act!r}; expected one of {self._ACTIVATIONS}")
    return table[self.act]

  def forward_prop(self, X):
    """Run a forward pass without modifying the network.

    X is (il, n_samples). Returns (a, h): a[i] is the pre-activation and h[i]
    the activation of layer i; h[-1] holds the softmax class probabilities
    with shape (ol, n_samples).
    """
    activate, _ = self._activation_pair()

    a = []
    h = []
    layer_input = X
    for i in range(self.h_layers):
      a.append( self.w[i].dot(layer_input) + self.b[i] )
      h.append( activate(a[i]) )
      layer_input = h[i]

    a.append( self.w[self.h_layers].dot(layer_input) + self.b[self.h_layers] )
    h.append( self.vSoftmax(a[self.h_layers]) )
    return a, h

  ## The default of 10 is kept for existing callers; back_prop always passes
  ## the network's own output size.
  def one_hot(self, Y, one_hot_vector_size = 10):
    """Return a (one_hot_vector_size, len(Y)) one-hot matrix for integer labels Y."""

    labels = np.asarray(Y).ravel()
    one_hot_y = np.zeros((labels.size, one_hot_vector_size))
    one_hot_y[np.arange(labels.size), labels] = 1
    return one_hot_y.T

  def back_prop(self, a, h, X, Y):
    """Back-propagate the batch (X, Y) through the activations from forward_prop.

    Returns (dw, db): lists ordered first layer to last, each entry shaped
    like the matching self.w[i] / self.b[i]. Gradients are averaged over the
    batch and include the w_decay * parameter term.
    """
    _, activate_deriv = self._activation_pair()
    n_samples = X.shape[1]
    last = self.h_layers

    dw = [None] * (last + 1)
    db = [None] * (last + 1)

    # error at the softmax output
    delta = h[last] - self.one_hot(Y, self.ol_nodes)

    for i in range(last, -1, -1):
      layer_input = h[i-1] if i > 0 else X
      dw[i] = delta.dot(layer_input.T) / n_samples + self.w_decay * self.w[i]
      db[i] = np.sum(delta, axis=1, keepdims=True) / n_samples + self.w_decay * self.b[i]
      if i > 0:
        # push the error through layer i's weights and layer i-1's activation
        delta = self.w[i].T.dot(delta) * activate_deriv(a[i-1])

    return dw, db

  def _batch_gradients(self, X, Y):
    """Forward + backward pass for one training batch.

    For "nesterov" the gradient is taken at the look-ahead point
    w - gamma * previous_update; the real parameters are restored afterwards
    so update_para applies the step from the current position.
    """
    if self.opt != "nesterov":
      a, h = self.forward_prop(X)
      return self.back_prop(a, h, X, Y)

    current_w, current_b = self.w, self.b
    self.w = [w - self.gamma * u for w, u in zip(current_w, self.updateW)]
    self.b = [b - self.gamma * u for b, u in zip(current_b, self.updateB)]
    try:
      a, h = self.forward_prop(X)
      return self.back_prop(a, h, X, Y)
    finally:
      self.w, self.b = current_w, current_b

  def update_para(self, dW, dB):
    """Apply one optimizer step using gradients dW / dB (first layer first)."""

    layers = range(self.h_layers+1)
    weight_group = (self.w, dW, self.updateW, self.mW, self.vW)
    bias_group = (self.b, dB, self.updateB, self.mB, self.vB)

    if self.opt == "adam" or self.opt == "nadam":

      # Bias correction: m and v start at zero, so early estimates are scaled
      # up by 1 / (1 - beta**t).
      self.t += 1
      corr1 = 1 - self.beta1**self.t
      corr2 = 1 - self.beta2**self.t

      for params, grads, _, m, v in (weight_group, bias_group):
        for i in layers:
          m[i] = self.beta1 * m[i] + (1-self.beta1) * grads[i]
          v[i] = self.beta2 * v[i] + (1-self.beta2) * grads[i]**2
          m_hat = m[i] / corr1
          if self.opt == "nadam":
            # Nesterov-style look-ahead on the first-moment estimate
            m_hat = self.beta1 * m_hat + (1-self.beta1) * grads[i] / corr1
          params[i] = params[i] - self.eta/np.sqrt(v[i]/corr2 + self.eps) * m_hat

    elif self.opt == "rmsprop":

      for params, grads, _, _, v in (weight_group, bias_group):
        for i in layers:
          v[i] = self.beta2 * v[i] + (1-self.beta2) * grads[i]**2
          params[i] = params[i] - self.eta/np.sqrt(v[i] + self.eps) * grads[i]

    elif self.opt == "nesterov" or self.opt == "momentum":

      # Same rule for both; they differ only in where the gradient was
      # evaluated (see _batch_gradients).
      for params, grads, update, _, _ in (weight_group, bias_group):
        for i in layers:
          update[i] = self.gamma * update[i] + self.eta * grads[i]
          params[i] = params[i] - update[i]

    else:

      # plain (mini-batch) gradient descent
      for params, grads, _, _, _ in (weight_group, bias_group):
        for i in layers:
          params[i] = params[i] - self.eta * grads[i]

  def grad_descent(self, X, Y):
    """Train for self.epochs passes over (X, Y) in consecutive mini-batches.

    X is (il, n_samples), Y holds the n_samples integer labels. The last
    batch of an epoch is smaller when n_samples is not a multiple of
    batch_size.
    """

    print("\nOptimizer : ",self.opt)
    print("Batch size : ",self.batch_size)

    n_samples = X.shape[1]
    for _ in range(self.epochs):
      for start in range(0, n_samples, self.batch_size):
        stop = start + self.batch_size
        dW, dB = self._batch_gradients(X[:, start:stop], Y[start:stop])
        self.update_para(dW, dB)

  def accuracy(self, Y, y_hat):
    """Print and return the fraction of rows of y_hat whose argmax equals Y.

    y_hat is (n_samples, n_classes); only the first n_samples labels of Y are
    compared.
    """

    n_samples = y_hat.shape[0]
    predictions = np.argmax(y_hat, axis=1)
    count = int(np.count_nonzero(predictions == np.asarray(Y)[:n_samples]))

    accuracy = count/n_samples
    print(f'Accuracy : {accuracy}')
    return accuracy


In [None]:
# Load Fashion-MNIST through Keras; load_data() is unpacked into a train pair
# and a test pair (presumably images and integer labels — the cells below
# flatten x_* to 784 columns and use y_* as class indices).
from keras.datasets import fashion_mnist
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

In [36]:
# Flatten every training sample into a 784-long row and divide by 255
# (presumably mapping 8-bit pixel intensities into [0, 1]).
x_tr = x_train.reshape(len(x_train), 784) / 255
print("xtrain shape ", x_tr.shape)

# Same treatment for the test samples.
x_ts = x_test.reshape(len(x_test), 784) / 255
print("xtest  shape ", x_ts.shape)

xtrain shape  (60000, 784)
xtest  shape  (10000, 784)


In [37]:
m = 54000  # 90% of train data

# The first m rows (and their labels) are used for training; the remaining
# rows of x_tr / y_train serve as the validation hold-out in later cells.
x_tr1, y_tr1 = x_tr[:m], y_train[:m]

In [39]:
# Skip this section if you want to perform wandb sweeps.
# This section only tests a single hyper-parameter configuration (no wandb integration).

shl = 32   # neurons per hidden layer
hl = 3     # number of hidden layers
hl_nodes = hl*[shl]

# Other configurations that were tried (activation, optimizer, epochs, eta, batch_size):
#   "sigmoid", "sgd",      45, 0.8,    145
#   "sigmoid", "nesterov", 40, 0.8,    145
#   "sigmoid", "rmsprop",  45, 0.0005, 145
#   "sigmoid", "adam",     45, 0.0005, 145
#   "sigmoid", "nadam",    40, 0.0005, 145
n1 = network(x_tr1.shape[1], 10, hl_nodes, "sigmoid", "momentum", 10, 0.9,batch_size = 145)

n1.grad_descent(x_tr1.T, y_tr1)

# Accuracy on each split. Rows m.. of the training data (the last 10%) were
# not used for training and act as the validation set; slicing with m keeps
# this in sync with the split cell instead of repeating its literal bounds.
eval_splits = [
  ("Validation", x_tr[m:], y_train[m:]),
  ("Train", x_tr1, y_tr1),
  ("Test", x_ts, y_test),
]
split_accuracy = {}
for split_name, split_x, split_y in eval_splits:
  a, h = n1.forward_prop(split_x.T)
  print(f"{split_name} ",end='')
  # h[len(hl_nodes)] is the output layer; transpose to one row per sample
  split_accuracy[split_name] = n1.accuracy(split_y, h[len(hl_nodes)].T)

accuracy = split_accuracy["Validation"]


Optimizer :  momentum
Batch size :  145
Validation Accuracy : 0.8516666666666667
Train Accuracy : 0.8669814814814815
Test Accuracy : 0.8501


In [None]:
# Code from here is for logging wandb sweeps.
!pip install wandb --upgrade

In [None]:
# wandb is required by every cell below (sweep creation, train(), agent).
import wandb
# Authenticate this session with Weights & Biases before creating the sweep.
wandb.login()

In [None]:
# The sweep maximises the `valid_acc` value logged by train().
metric = {'name': 'valid_acc', 'goal': 'maximize'}

# Candidate values for each hyper-parameter; every entry is wrapped as
# {'values': [...]} in the sweep configuration.
parameters_dict = {
    name: {'values': choices}
    for name, choices in (
        ('epochs', [5, 10]),
        ('no_of_HL', [3, 4, 5]),
        ('HL_size', [32, 64, 128]),
        ('w_decay', [0, 0.5, 0.05, 0.005, 0.0005]),
        ('eta', [1e-3, 1e-4]),
        ('optimizer', ['sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam']),
        ('batch_size', [16, 32, 64, 128]),
        ('activation', ['sigmoid', 'tanh', 'relu']),
    )
}

# Random search over the grid above.
sweep_config = {
    'method': 'random',
    'metric': metric,
    'parameters': parameters_dict,
}

sweep_id = wandb.sweep(sweep_config, project="DL-Assignment-1")

In [30]:
# train function for wandb
def train(config = None):
  """Run one sweep trial: build a network from the wandb config, train it, log accuracies.

  Metrics are logged twice, once before training ("step" 0, "epochs" 0) and
  once after training ("step" 1, "epochs" config.epochs). Each stage is
  written with a single wandb.log call so all of its values share one history
  row. The "*_loss" keys hold 1 - accuracy (an error rate), kept under their
  existing names.

  Reads the notebook globals x_tr, y_train, x_tr1, y_tr1, x_ts, y_test and m.
  """

  # Initialize a new wandb run
  with wandb.init(config=config):
    # If called by wandb.agent, as below,
    # this config will be set by Sweep Controller
    config = wandb.config

    # list of no. of neurons in each hidden layer
    hl_nodes = config.no_of_HL*[config.HL_size]
    out_layer = len(hl_nodes)

    #(il, ol, hls, activation, optimizer, epochs, eta, batch_size, w_decay = 0, gamma = 0.9, beta1=0.9, beta2 = 0.999, eps = 1e-8)
    n1 = network(x_tr1.shape[1], 10, hl_nodes, config.activation, config.optimizer, config.epochs, config.eta, config.batch_size, config.w_decay)

    # rows m.. of the training data are the validation hold-out
    x_val, y_val = x_tr[m:], y_train[m:]

    def evaluate(x, y, label = None):
      # accuracy of n1 on (x, y); an optional label is printed before the result
      _, h = n1.forward_prop(x.T)
      if label is not None:
        print(f"{label} ",end='')
      return n1.accuracy(y, h[out_layer].T)

    def log_stage(step, epochs_done, valid_acc, test_acc):
      wandb.log({
        "valid_acc": valid_acc,
        "valid_loss": 1-valid_acc,
        "test_acc": test_acc,
        "test_loss": 1-test_acc,
        "epochs": epochs_done,
        "step": step,
      })

    # baseline with the freshly initialised weights
    accuracy = evaluate(x_val, y_val)
    acc = evaluate(x_ts, y_test)
    log_stage(0, 0, accuracy, acc)

    n1.grad_descent(x_tr1.T, y_tr1)

    accuracy = evaluate(x_val, y_val, "Validation")
    evaluate(x_tr1[0:m], y_tr1[0:m], "Train")
    acc = evaluate(x_ts, y_test, "Test")
    log_stage(1, config.epochs, accuracy, acc)

In [None]:
# Launch a wandb agent that calls train() for 10 trials of the sweep.
wandb.agent(sweep_id, function=train, count=10)