**Installing Wandb**

In [None]:
!pip install wandb

**Import statements**

In [None]:
import wandb

In [None]:
from keras.datasets import fashion_mnist
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [None]:
# Human-readable Fashion-MNIST label names; the position in the list is the
# integer class id used by the dataset labels.
class_names = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]
# Number of classes.
k = len(class_names)

# **Question 1**

In [None]:
def plotImagesOfEachClass():
  """Log one sample training image per class to Weights & Biases.

  Loads Fashion-MNIST, starts a W&B run named "Assignment1_sample_images",
  keeps the first training image encountered for each class name and logs
  them together as captioned images under a single key.
  """
  (trainX, trainy), _ = fashion_mnist.load_data()
  wandb.init(
      project="Assignment 1",
      entity="cs22m006",
      name="Assignment1_sample_images"
  )

  # class name -> first image seen with that label (dicts keep insertion order)
  first_image_by_label = {}
  for picture, label_id in zip(trainX, trainy):
    if len(first_image_by_label) == len(class_names):
      break
    label = class_names[label_id]
    if label not in first_image_by_label:
      first_image_by_label[label] = picture

  # one captioned wandb.Image per class, logged in a single call
  samples = [wandb.Image(picture, caption=label) for label, picture in first_image_by_label.items()]
  wandb.log({"Sample image for each class ": samples})

# Question 1: loads the dataset, starts a W&B run and logs the sample images.
plotImagesOfEachClass()

# **Questions 2 and 3**

In [None]:
# Label names of the Fashion-MNIST classes; k is the number of classes.
class_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]
k = len(class_names)

# Load the training and test portions of the dataset.
(x_train, y_train), (testX, testy) = fashion_mnist.load_data()

# Flatten every image into one row vector (28*28 -> 784 features) and
# divide the pixel values by 255.0.
x_train = x_train.reshape(x_train.shape[0], -1)
x_train = x_train/255.0
testX = testX.reshape(testX.shape[0], -1)
testX = testX/255.0

In [None]:
def initializeWeightAndBias(layer_dims, init_mode = "random_uniform"):
  """Create the weight matrices and bias column vectors for every layer.

  layer_dims lists the number of neurons per layer. For each consecutive pair
  (fan_in, fan_out) a weight of shape (fan_out, fan_in) and a bias of shape
  (fan_out, 1) are drawn. init_mode selects the distribution:
    "random_uniform": uniform samples between -0.7 and 0.7
    "xavier":         standard normal scaled by sqrt(2/(fan_in+fan_out)) for
                      weights and sqrt(2/fan_out) for biases
    anything else:    unscaled standard normal

  The global NumPy seed is reset to 3 on every call, so repeated calls with the
  same arguments give the same values. Returns the two lists (W, bias).
  """
  np.random.seed(3)
  W, bias = [], []
  for fan_in, fan_out in zip(layer_dims[:-1], layer_dims[1:]):
    # The weight is always drawn before the bias to keep the random stream order.
    if init_mode == "random_uniform":
      weight = np.random.uniform(-0.7, 0.7, (fan_out, fan_in))
      offset = np.random.uniform(-0.7, 0.7, (fan_out, 1))
    elif init_mode == "xavier":
      weight = np.random.randn(fan_out, fan_in)*np.sqrt(2/(fan_out+fan_in))
      offset = np.random.randn(fan_out, 1)*np.sqrt(2/(fan_out))
    else:  # random normal
      weight = np.random.randn(fan_out, fan_in)
      offset = np.random.randn(fan_out, 1)
    W.append(weight)
    bias.append(offset)
  return W, bias

In [None]:
def feedForward(W, bias, X, num_hidden_layers, layer_dims, activation_fun = "tanh"):
  """Run a forward pass through the network for a batch of inputs.

  Args:
    W, bias: per-layer lists of weight matrices and bias column vectors.
    X: batch of inputs with one sample per row; it is transposed so that every
      column of the intermediate matrices is one sample.
    num_hidden_layers: number of hidden layers that use activation_fun.
    layer_dims: not used by this function; kept so existing callers still work.
    activation_fun: "sigmoid", "tanh" or "reLU".

  Returns:
    (output, activation, preactivation): output is the softmax of the last
    layer; the two lists hold one entry per layer, with X.T stored at index 0.

  Raises:
    ValueError: if hidden layers have to be evaluated with an unknown
      activation_fun. Previously such a name was skipped silently, which left
      the activation list shorter than the preactivation list.
  """
  if num_hidden_layers >= 1 and activation_fun not in ("sigmoid", "tanh", "reLU"):
    raise ValueError("Unknown activation function: {!r}".format(activation_fun))

  inputs = X.T
  activation = [inputs]
  preactivation = [inputs]
  for i in range(1, num_hidden_layers+1):
    preactivation.append(bias[i-1] + np.matmul(W[i-1], activation[i-1]))
    if activation_fun == "sigmoid":
      activation.append(sigmoid(preactivation[i]))
    elif activation_fun == "tanh":
      activation.append(tanh(preactivation[i]))
    else:  # "reLU" (validated above)
      activation.append(reLU(preactivation[i]))
  # Output layer: affine map followed by softmax over the classes.
  preactivation.append(bias[-1] + np.dot(W[-1], activation[-1]))
  activation.append(softmax(preactivation[-1]))
  return activation[-1], activation, preactivation

In [None]:
def updateParam(W, gradientW, bias, gradientBias, learning_rate):
  """Apply one plain gradient-descent step to every layer.

  Each entry of W and bias is replaced (the lists themselves are modified) by
  its old value minus learning_rate times the matching gradient.
  Returns the same (W, bias) lists.
  """
  for layer, weight in enumerate(W):
    W[layer] = weight - learning_rate*gradientW[layer]
    bias[layer] = bias[layer] - learning_rate*gradientBias[layer]
  return W, bias

def updateParamMomentum(W, bias, gradientW, gradientBias, previous_updates_W, previous_updates_Bias, learning_rate, momentum):
  """Apply one momentum gradient-descent step to every layer.

  For each layer the running update ("velocity") is refreshed as
      velocity = momentum*velocity + gradient
  and the parameter is moved by learning_rate*velocity.

  Args:
    W, bias: per-layer parameter lists; entries are replaced in place.
    gradientW, gradientBias: per-layer gradients.
    previous_updates_W, previous_updates_Bias: per-layer velocities; these
      lists are updated in place so the caller keeps the history between calls.
    learning_rate: step size.
    momentum: decay factor applied to the previous velocity.

  Returns:
    (W, bias)
  """
  for idx in range(len(W)):
    previous_updates_W[idx] = momentum*previous_updates_W[idx] + gradientW[idx]
    previous_updates_Bias[idx] = momentum*previous_updates_Bias[idx] + gradientBias[idx]
    # Step along the accumulated velocity; stepping along the raw gradient
    # (as before) made this identical to plain SGD.
    W[idx] = W[idx] - learning_rate*previous_updates_W[idx]
    bias[idx] = bias[idx] - learning_rate*previous_updates_Bias[idx]
  return W, bias
  

def updateParamRMS(W, gradientW, bias, gradientBias, learning_rate, v_W, v_bias, beta):
  """Apply one RMSprop step to every layer.

  The running averages of squared gradients (v_W, v_bias) are updated with
  decay beta, and each gradient is divided by the square root of its average
  plus a small constant before being applied.
  Returns (W, bias, v_W, v_bias); all four lists are modified in place.
  """
  eps = 1e-6
  for idx in range(len(W)):
    grad_w, grad_b = gradientW[idx], gradientBias[idx]
    new_v_w = beta*v_W[idx] + (1-beta)*np.multiply(grad_w, grad_w)
    new_v_b = beta*v_bias[idx] + (1-beta)*np.multiply(grad_b, grad_b)
    W[idx] = W[idx] - learning_rate*grad_w/(np.sqrt(new_v_w)+eps)
    bias[idx] = bias[idx] - learning_rate*grad_b/(np.sqrt(new_v_b)+eps)
    v_W[idx], v_bias[idx] = new_v_w, new_v_b
  return W, bias, v_W, v_bias

def updateParamAdam(W, bias, gradientW, gradientBias, v_W, v_bias, m_W, m_bias, t, learning_rate, beta1, beta2):
  """Apply one Adam step to every layer.

  Args:
    W, bias: per-layer parameter lists; entries are replaced in place.
    gradientW, gradientBias: per-layer gradients.
    v_W, v_bias: running averages of squared gradients (second moment).
    m_W, m_bias: running averages of gradients (first moment).
    t: step counter used for bias correction; must be >= 1, because t = 0
      makes the correction terms divide by zero.
    learning_rate: step size.
    beta1, beta2: decay rates of the first and second moment.

  Returns:
    (W, bias, v_W, v_bias, m_W, m_bias) after all layers have been updated.
  """
  epsilon = 1e-6

  for i in range(0, len(W)):
    mdW = beta1*m_W[i] + (1-beta1)*gradientW[i]
    mdBias = beta1*m_bias[i] + (1-beta1)*gradientBias[i]
    vdW = beta2*v_W[i] + (1-beta2)*np.square(gradientW[i])
    vdBias = beta2*v_bias[i] + (1-beta2)*np.square(gradientBias[i])
    # bias-corrected moment estimates
    m_w_hat = mdW/(1.0 - beta1**t)
    v_w_hat = vdW/(1.0 - beta2**t)
    m_bias_hat = mdBias/(1.0 - beta1**t)
    v_bias_hat = vdBias/(1.0 - beta2**t)
    # epsilon is added under the square root to avoid dividing by zero
    W[i] = W[i] - (learning_rate * m_w_hat)/np.sqrt(v_w_hat + epsilon)
    bias[i] = bias[i] - (learning_rate * m_bias_hat)/np.sqrt(v_bias_hat + epsilon)

    v_W[i] = vdW
    m_W[i] = mdW
    v_bias[i] = vdBias
    m_bias[i] = mdBias

  # Return only after the loop: returning inside it updated layer 0 alone.
  return W, bias, v_W, v_bias, m_W, m_bias

In [None]:
def sigmoid(X):
  """Logistic function 1/(1+exp(-X)), applied element-wise."""
  denominator = 1. + np.exp(-X)
  return 1.0/denominator

def sigmoid_derivative(x):
  """Derivative of the logistic function at x: sigmoid(x)*(1-sigmoid(x))."""
  s = sigmoid(x)
  return s*(1-s)

def reLU(x):
  """Rectified linear unit: element-wise maximum of 0 and x."""
  rectified = np.maximum(0, x)
  return rectified

def reLU_derivative(x):
  """Derivative of reLU: 1 where x is strictly positive, 0 elsewhere."""
  is_positive = x > 0
  return 1*is_positive

def tanh(x):
  """Hyperbolic tangent, applied element-wise (thin wrapper around np.tanh)."""
  squashed = np.tanh(x)
  return squashed

def tanh_derivative(x):
  """Derivative of tanh at x: 1 - tanh(x)**2."""
  t = np.tanh(x)
  return 1 - (t**2)

def softmax(a):
  """Softmax along axis 0, i.e. every column of a matrix is normalised to sum to 1.

  The maximum along axis 0 is subtracted before exponentiating. This leaves the
  result mathematically unchanged but keeps np.exp from overflowing to inf
  (and the quotient from becoming NaN) when the inputs are large.
  """
  shifted = a - np.max(a, axis=0, keepdims=True)
  exponentials = np.exp(shifted)
  return exponentials/np.sum(exponentials, axis=0)

def softmax_derivative(a):
  """Element-wise s*(1-s) with s = softmax(a), the diagonal of the softmax Jacobian."""
  s = softmax(a)
  return s*(1-s)

In [None]:
def backward_propogation(y_one_hot, x, y, W, bias, activation, preactivation, num_hidden_layers, batch_size, activation_fun, weight_decay, loss_fun):
  """Compute the gradients of the loss for every weight matrix and bias vector.

  Args:
    y_one_hot: one-hot targets with one column per sample.
    x, y, bias: not used by this function; kept so existing callers still work.
    W: per-layer list of weight matrices.
    activation, preactivation: per-layer lists as returned by feedForward
      (index 0 holds the transposed input batch).
    num_hidden_layers: number of hidden layers.
    batch_size: divisor applied to the summed gradients. It is used exactly as
      passed, even if the batch holds fewer columns.
    activation_fun: "sigmoid", "tanh" or "reLU" (used for hidden layers).
    weight_decay: L2 coefficient; weight_decay*W/batch_size is added to each
      weight gradient.
    loss_fun: "cross_entropy"; any other value selects the squared-error loss.

  Returns:
    (gradientWeight, gradientBias): lists ordered from the first to the last layer.

  Raises:
    ValueError: if hidden layers exist and activation_fun is not recognised.
      Previously the stale output-layer delta was silently reused.
  """
  if num_hidden_layers >= 1 and activation_fun not in ("sigmoid", "tanh", "reLU"):
    raise ValueError("Unknown activation function: {!r}".format(activation_fun))

  L = num_hidden_layers+1
  output = activation[L]
  error = output - y_one_hot
  if loss_fun == "cross_entropy":
    # softmax + cross entropy: the gradient w.r.t. the logits is (y_hat - y)
    delta = error
  else:
    # Squared error through softmax needs the full Jacobian product:
    #   dL/dz_j = y_hat_j * (e_j - sum_i e_i * y_hat_i),  e = y_hat - y
    # Using only the diagonal term y_hat*(1 - y_hat) gives a wrong gradient.
    weighted = error*output
    delta = weighted - output*np.sum(weighted, axis=0, keepdims=True)

  gradientWeight = []
  gradientBias = []
  for layer in range(L, 0, -1):
    gradientWeight.append(np.matmul(delta, activation[layer-1].T)/batch_size + (weight_decay*W[layer-1])/batch_size)
    gradientBias.append(np.sum(delta, axis=1, keepdims=True)/batch_size)
    if layer == 1:
      break
    # Push the error back through the weights, then through the activation.
    back = np.matmul(W[layer-1].T, delta)
    if activation_fun == "sigmoid":
      slope = sigmoid_derivative(preactivation[layer-1])
    elif activation_fun == "tanh":
      slope = tanh_derivative(preactivation[layer-1])
    else:  # "reLU" (validated above)
      slope = reLU_derivative(preactivation[layer-1])
    delta = np.multiply(back, slope)
  # Gradients were collected from the last layer to the first; reverse them.
  return gradientWeight[::-1], gradientBias[::-1]

In [None]:
def cross_entropy(y, y_hat, W, weight_decay):
  """Summed cross-entropy loss plus an L2 penalty on the weights.

  Args:
    y: one-hot targets.
    y_hat: predicted probabilities with the same shape as y.
    W: list of weight matrices used for the penalty.
    weight_decay: coefficient multiplying the sum of squared weights.

  Returns:
    -sum(y*log(y_hat)) + weight_decay*sum(W**2). The value is summed over all
    entries, not averaged.
  """
  # Floor the probabilities: an exact 0 would give log(0) = -inf and
  # 0*(-inf) = NaN, even for classes whose target is 0.
  tiny = 1e-12
  targets = np.asarray(y, dtype=float)
  probabilities = np.maximum(np.asarray(y_hat, dtype=float), tiny)
  loss = -1.0*np.sum(targets*np.log(probabilities))
  #L2 regularization
  penalty = sum(np.sum(w**2) for w in W)
  return loss + weight_decay*penalty

def mean_squared_error(y, y_hat, W, weight_decay):
  """Half the summed squared error plus weight_decay times the sum of squared weights."""
  residual = y-y_hat
  data_term = 0.5 * np.sum(residual**2)
  # L2 regularization: accumulate the squared entries of every weight matrix
  penalty = 0
  for weights in W:
    penalty += np.sum(weights**2)
  return data_term + weight_decay*penalty

In [None]:
def calculate_accuracy_and_loss(W, bias, X, y, num_hidden_layers, layer_dims, activation_fun, weight_decay, loss_function, y_one_hot):
  """Evaluate the network on X and return (accuracy in percent, total loss).

  The predicted class of a sample is the row index of the largest value in its
  output column. "cross_entropy" selects cross_entropy as the loss; any other
  loss_function value selects mean_squared_error.
  """
  probabilities, _, _ = feedForward(W, bias, X, num_hidden_layers, layer_dims, activation_fun)
  # predicted class per column
  predicted = np.argmax(probabilities, axis = 0)
  # share of samples whose prediction equals the true label, in percent
  n_correct = np.sum(y == predicted)
  acc = n_correct/predicted.shape[0]*100
  loss_fn = cross_entropy if loss_function == "cross_entropy" else mean_squared_error
  loss = loss_fn(y_one_hot, probabilities, W, weight_decay)
  return acc, loss

In [None]:
def generate_one_hot(n, true_label, num_classes = 10):
  """Build a one-hot label matrix with one column per sample.

  Args:
    n: number of samples to encode (the first n entries of true_label).
    true_label: sequence of integer class ids.
    num_classes: number of rows of the result; defaults to 10, the value that
      used to be hard-coded.

  Returns:
    Array of shape (num_classes, n) that is 0 everywhere except for a 1 at
    [true_label[i], i] in every column i.

  Raises:
    IndexError: if true_label has fewer than n entries or contains a class id
      outside the valid range.
  """
  labels = np.asarray(true_label)[:n].astype(np.intp)
  if labels.shape[0] < n:
    raise IndexError("true_label has fewer than n entries")
  y_one_hot = np.zeros((num_classes, n))
  # one vectorised assignment instead of a Python loop over all samples
  y_one_hot[labels, np.arange(n)] = 1
  return y_one_hot

In [None]:
def plotConfusionMatrix(trainy, y_pred, class_names):
  """Start a W&B run named "Confusion matrix" and log a confusion-matrix chart.

  trainy holds the true labels, y_pred the predicted labels and class_names the
  display names passed on to wandb.plot.confusion_matrix.
  """
  run_settings = dict(project="Assignment 1", entity="cs22m006", name="Confusion matrix")
  wandb.init(**run_settings)
  chart = wandb.plot.confusion_matrix(y_true=trainy, preds=y_pred, class_names=class_names)
  wandb.log({"conf_mat" : chart})

In [None]:
def calculateTestAccuracy(testX, testy, layer_dims, num_hidden_layers, neurons_in_each_layer, batch_size, W, bias, activation_fun):
  """Print (and return) the classification accuracy on the test set in percent.

  Args:
    testX: test inputs, one sample per row.
    testy: true class ids of the test samples.
    layer_dims, num_hidden_layers, activation_fun: passed on to feedForward.
    neurons_in_each_layer: not used by this function; kept for callers.
    batch_size: number of samples evaluated per forward pass.
    W, bias: trained parameters.

  Returns:
    The accuracy in percent. The function used to return None, so callers that
    ignore the result are unaffected.
  """
  total = len(testX)
  labels = np.asarray(testy)
  correct = 0
  for start in range(0, total, batch_size):
    stop = start + batch_size
    # Slicing clamps at the end of the data, so the last (possibly smaller)
    # batch is evaluated completely. The old manual count was off by one and
    # dropped the final sample(s).
    hL, _, _ = feedForward(W, bias, testX[start:stop], num_hidden_layers, layer_dims, activation_fun)
    predictions = np.argmax(hL, axis=0)
    correct += int(np.sum(predictions == labels[start:stop]))
  accuracy = (100.0*correct)/total
  print("Accuracy on test data", accuracy)
  return accuracy

In [None]:
def optimizers():
  """Train the network with the configured optimizer and log metrics to W&B.

  Hyperparameters are read from wandb.config, which is initialised with
  config_defaults (presumably overridden by the sweep agent when this function
  is run through wandb.agent -- confirm against the W&B docs).

  The function relies on module-level names defined in other cells: trainX,
  trainy, validationX, validationy, num_images, testX, testy and k.

  After every epoch the training/validation accuracy and the per-sample loss
  are printed and logged. After the last epoch the test accuracy is printed and
  the W&B run is renamed and finished.

  Returns:
    y_pred: list with the predicted class index of each training sample,
    collected batch by batch during the final epoch (i.e. with the forward
    pass outputs computed during training, before each batch's update).
  """
  #setting the default parameters
  #change the parameters if you want to test for other parameters as well
  config_defaults = {
        'epochs': 20,
        'batch_size': 32,
        'learning_rate': 0.1,
        'activation_fun': 'tanh',
        'optimizer': 'sgd',
        'init_mode': 'random_normal',
        'weight_decay': 0.05,
        'neurons_in_each_layer': 128,
        'num_hidden_layers': 4,
        'loss_function': 'cross_entropy',
        'momentum': 0.9,
        'beta': 0.9,
        'beta1': 0.9,
        'beta2': 0.999
    }
  wandb.init(project = 'Assignment 1', entity = 'cs22m006', config=config_defaults)
  # copy the run configuration into local variables
  learning_rate = wandb.config.learning_rate
  activation_fun = wandb.config.activation_fun
  init_mode = wandb.config.init_mode
  optimizer = wandb.config.optimizer
  batch_size = wandb.config.batch_size
  epochs = wandb.config.epochs
  weight_decay = wandb.config.weight_decay
  neurons_in_each_layer = wandb.config.neurons_in_each_layer
  num_hidden_layers = wandb.config.num_hidden_layers
  loss_function = wandb.config.loss_function
  momentum = wandb.config.momentum
  beta = wandb.config.beta
  beta1 = wandb.config.beta1
  beta2 = wandb.config.beta2

  # layer sizes: input features, num_hidden_layers equally sized hidden layers, k outputs
  layer_dims = [trainX.shape[1]]
  for i in range(num_hidden_layers):
    layer_dims.append(neurons_in_each_layer)
  layer_dims.append(k)
  #initializing weights and biases
  W, bias = initializeWeightAndBias(layer_dims, init_mode)

  # one-hot targets for the training and validation sets
  y_one_hot, y_one_hot_val = generate_one_hot(num_images, trainy), generate_one_hot(len(validationy), validationy)
  
  #initializing the per-layer state used by the optimizers; every list starts
  #as num_hidden_layers+1 zeros (plain ints that broadcast against arrays)
  v_W = [0]*(num_hidden_layers+1)
  v_bias, m_W, m_bias, gradientW, gradientBias, look_ahead_W, look_ahead_bias, previous_updates_W, previous_updates_Bias = v_W.copy(), v_W.copy(), v_W.copy(), v_W.copy(), v_W.copy(), v_W.copy(), v_W.copy(), v_W.copy(), v_W.copy()
  t = 1 #step counter for adam's bias correction
  #setting the run name for wandb
  run_name = "lr_{}_ac_{}_in_{}_op_{}_bs_{}_L2_{}_ep_{}_nn_{}_nh_{}_loss_{}".format(learning_rate, activation_fun, init_mode, optimizer, batch_size, weight_decay, epochs, neurons_in_each_layer, num_hidden_layers, loss_function)
  y_pred = []
  for iterationNumber in range(epochs):
    loss=0
    val_loss = 0
    batch_count = batch_size
    for i in range(0, num_images, batch_size):
      #if we are left with lesser data points compared to batch size, still we don't want to ignore those data points
      if(i+batch_size >= num_images):
        batch_count = num_images-i

      # NOTE(review): in every backward_propogation call below batch_size (not
      # batch_count) is passed as the divisor, so the final, smaller batch is
      # averaged over the full batch size.
      if(optimizer == "nag"):
        # NOTE(review): the look-ahead point is built from the previous batch's
        # gradient scaled by momentum (zeros on the first batch), not from an
        # accumulated velocity, and the step itself is the plain updateParam
        # rule -- confirm this is the intended NAG variant.
        for idx in range(len(W)):
          look_ahead_W[idx] = W[idx] - momentum * gradientW[idx]
          look_ahead_bias[idx] = bias[idx] - momentum * gradientBias[idx]

        hL, activation, preactivation = feedForward(look_ahead_W, look_ahead_bias, trainX[i:i+batch_count], num_hidden_layers, layer_dims, activation_fun)
        gradientW, gradientBias = backward_propogation(y_one_hot[:,i:i+batch_count], trainX[i:i+batch_count], trainy[i:i+batch_count], look_ahead_W, look_ahead_bias, activation, preactivation, num_hidden_layers, batch_size, activation_fun, weight_decay, loss_function)
        W, bias = updateParam(W, gradientW, bias, gradientBias, learning_rate)

      elif(optimizer == "nadam"):
        # same look-ahead construction as "nag", followed by an Adam step
        for idx in range(len(W)):
          look_ahead_W[idx] = W[idx] - momentum * gradientW[idx]
          look_ahead_bias[idx] = bias[idx] - momentum * gradientBias[idx]

        hL, activation, preactivation = feedForward(look_ahead_W, look_ahead_bias, trainX[i:i+batch_count], num_hidden_layers, layer_dims, activation_fun)
        gradientW, gradientBias = backward_propogation(y_one_hot[:,i:i+batch_count], trainX[i:i+batch_count], trainy[i:i+batch_count], look_ahead_W, look_ahead_bias, activation, preactivation, num_hidden_layers, batch_size, activation_fun, weight_decay, loss_function)
        W, bias, v_W, v_bias, m_W, m_bias = updateParamAdam(W, bias, gradientW, gradientBias, v_W, v_bias, m_W, m_bias, t, learning_rate, beta1, beta2)
        t += 1

      elif optimizer == 'insert your optimizer here':
        #write the update rules and calling of feedforward and backprop here
        # NOTE(review): this placeholder never assigns hL, which the y_pred
        # collection below reads in the last epoch.
        pass

      else:
        # optimizers that evaluate the gradient at the current parameters
        hL, activation, preactivation = feedForward(W, bias, trainX[i:i+batch_count], num_hidden_layers, layer_dims, activation_fun)

        gradientW, gradientBias = backward_propogation(y_one_hot[:,i:i+batch_count], trainX[i:i+batch_count], trainy[i:i+batch_count], W, bias, activation, preactivation, num_hidden_layers, batch_size, activation_fun, weight_decay, loss_function)
  
        # NOTE(review): an optimizer name that matches none of the branches
        # below computes gradients but applies no update.
        if(optimizer == "sgd"):
          W, bias = updateParam(W, gradientW, bias, gradientBias, learning_rate)

        elif(optimizer == "momentum"):
          W, bias = updateParamMomentum(W, bias, gradientW, gradientBias, previous_updates_W, previous_updates_Bias, learning_rate, momentum)
        
        elif(optimizer == "rmsprop"):
          W, bias, v_W, v_bias = updateParamRMS(W, gradientW, bias, gradientBias, learning_rate, v_W, v_bias, beta)

        elif(optimizer == "adam"):
          W, bias, v_W, v_bias, m_W, m_bias = updateParamAdam(W, bias, gradientW, gradientBias, v_W, v_bias, m_W, m_bias, t, learning_rate, beta1, beta2)
          t += 1
      #calculate predicted label for each datapoint in last epoch
      if(iterationNumber==epochs-1):
        for j in range(i, i+batch_count):
          y_pred.append(np.argmax(hL[:,(j-i)]))
    # end of epoch: evaluate on the full training and validation sets
    train_acc, loss = calculate_accuracy_and_loss(W, bias, trainX, trainy, num_hidden_layers, layer_dims, activation_fun, weight_decay, loss_function, y_one_hot)
    val_acc, val_loss = calculate_accuracy_and_loss(W, bias, validationX, validationy, num_hidden_layers, layer_dims, activation_fun, weight_decay, loss_function, y_one_hot_val)
    # losses are reported per sample (total loss divided by the set size)
    print("training_accuracy:", train_acc, "validation_accuracy:", val_acc, "training_loss:", loss/(len(trainX)), "validation loss:", val_loss/len(validationX), "epoch:", iterationNumber)
    wandb.log({"training_accuracy": train_acc, "validation_accuracy": val_acc, "training_loss": loss/(len(trainX)), "validation loss": val_loss/len(validationX), 'epoch': iterationNumber})
  calculateTestAccuracy(testX, testy, layer_dims, num_hidden_layers, neurons_in_each_layer, batch_size, W, bias, activation_fun)
  # rename the run to the descriptive hyperparameter string, then close it
  wandb.run.name = run_name
  wandb.run.save()
  wandb.run.finish()
  return y_pred

In [None]:
if __name__ == '__main__':
  # Hold out 10% of the training data for validation; the fixed random_state
  # keeps the split identical between runs.
  trainX, validationX, trainy, validationy = train_test_split(
      x_train, y_train, test_size=0.1, shuffle=True, random_state=104)
  num_images, image_size = len(trainy), trainX.shape[1]
  # Train with the default configuration and keep the training-set predictions.
  y_pred = optimizers()

In [None]:
# Logs a confusion matrix of the training labels against y_pred.
# Run the training cell above first: it defines trainy and y_pred.
plotConfusionMatrix(trainy, y_pred, class_names)

In [None]:
# Run this cell only if you want to launch the hyperparameter sweep.
# The same train/validation split as in the single-run cell is recreated here.
trainX, validationX, trainy, validationy = train_test_split(
    x_train, y_train, test_size=0.1, shuffle=True, random_state=104)
num_images, image_size = len(trainy), trainX.shape[1]

# Search space: every entry under "parameters" lists the candidate values.
sweep_config = {
  "name": "CS6910 Assignment 1 - Cross Entropy Loss",
  "metric": {"name": "validation_accuracy", "goal": "maximize"},
  "method": "bayes",
  "parameters": {
    "learning_rate": {"values": [0.1, 0.01, 1e-3, 1e-4]},
    "activation_fun": {"values": ["sigmoid", "tanh", "reLU"]},
    "init_mode": {"values": ["xavier", "random_uniform", "random_normal"]},
    "optimizer": {"values": ["sgd", "momentum", "nag", "rmsprop", "adam", "nadam"]},
    "batch_size": {"values": [16, 32]},
    "epochs": {"values": [5, 10, 20]},
    "weight_decay": {"values": [0, 0.0005, 0.05]},
    "neurons_in_each_layer": {"values": [32, 64, 128]},
    "num_hidden_layers": {"values": [3, 4, 5]}
  }
}

# Register the sweep, then let the agent call optimizers() up to 300 times.
sweep_id = wandb.sweep(sweep_config, entity="cs22m006", project="Assignment 1")
wandb.agent(sweep_id, optimizers, count = 300)