## TODO:
- Try to modify the network to be able to use with cross entropy (same as notebook 1)
    * here the problem is loss becomes nan immediately, probably there is a bug in backprop, it is not clear how to propagate the error
- Try to add bias term
    * Here need to check how to update these terms
- Initialize the weights properyly
    * I did it but performance was worse
- Try to have same performance for k=3, now with existing setup, it doesn't learn

- Works for BCE
- Try to switch to Cross entropy with softmax

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import torch
from torch.utils.data import DataLoader
import torchvision
from torchvision.transforms import ToTensor, Normalize, Compose
from torch.optim import lr_scheduler

In [2]:
from scripts.mnistParity import MNISTParity
from scripts.architecture import MLP
from scripts.train_utils import accuracy
from scripts.train import train_epoch, train_model, test_model
from scripts.plot_utils import plot_loss_accuracy, plotValAccuracy

In [3]:
print(torch.__version__)
print(np.__version__)

1.9.0
1.20.3


## Create Parity Data Iterator

In [4]:
transforms = Compose([
    ToTensor(),
    Normalize((0.1307,), (0.3081,))
])

In [5]:
# doesn't perform and transformation until we call the loader
trainset = torchvision.datasets.MNIST(root='data', train=True, download=True, transform=transforms)
testset = torchvision.datasets.MNIST(root='data', train=False, download=True, transform=transforms)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [6]:
learn_rate = 0.05
num_epochs = 20
batch_size = 128
loss_fn = torch.nn.BCELoss()

### MLP Scratch

In [7]:
# add bias terms --> a bit confusing now with batch_size and it's not clear how to update them at the moment, ignoring for now
# try to have same structure with other training --> I had (2 outputs) with cross entropy loss,
# try to have the same performance for k = 3 --> doesn't perform well at all

import math

class MLP_Manual(torch.nn.Module):
    def __init__(self, k, device):
        super().__init__()

        self.input_dim = 28 * 28 * k
        self.hidden_dim = 512
        self.output_dim = 1           # TODO: make this part 2 with cross-entropy loss
        self.learning_rate = 0.001
        self.flat = torch.nn.Flatten() # when input comes as 28x28, this'll convert to 784
        # WEIGHTS
        # initialize the weights as pytorch does by default --> IT DIVERGES and perform worse (90%) for k=1
        # e.g. 784 x 512
        self.w1 = torch.zeros(self.input_dim, self.hidden_dim).to(device)
        stdv1 = 1. / math.sqrt(self.w1.size(1))
        self.w1.uniform_(-stdv1, +stdv1)
        #  e.g. 512 x 1
        self.w2 = torch.zeros(self.hidden_dim, self.output_dim).to(device)
        stdv2 = 1. / math.sqrt(self.w2.size(1))
        self.w2.uniform_(-stdv2, +stdv2)
        
    def sigmoid(self, s):
        return 1 / (1 + torch.exp(-s))
     
    def reLU(self, s):
        s[s < 0] = 0
        return s.float()
    
    def reLUPrime(self, s):
        s[s < 0] = 0
        s[s > 0] = 1
        return s.float()

    # Forward propagation
    def forward(self, X):
        X = self.flat(X)
        # a_k = W_k @ h_{k-1} + b_k, h_k = f(a_k) where h_0 = X and f is the non linearity, a_2 = y^
        self.a1 = torch.matmul(X, self.w1) # e.g. k=1 --> 128x784 @ 784x512
        self.h1 = self.reLU(self.a1)       # f is the reLU
        self.a2 = torch.matmul(self.h1, self.w2) #
        y_hat = self.sigmoid(self.a2)
        return y_hat # some loss functions handle output layer non-linearity

    # Backward propagation
    def backward(self, X, y, y_hat):
        X = self.flat(X)
        # gradients of W2 --> dBCE/dW2 = dE/dy^.dy^/da2. da2/dW2 = (y^ - y) h1 
        self.e = y_hat - y.reshape(len(y),1) # e - 128x1, h1.t - 512,128 for k=1
        self.w2_grads = torch.matmul(self.h1.t(), self.e)
        # gradients of W1 --> dBCE/dW1 = dE/dh1 . dh1/da1 . da1/dW1
        # where dE/dh1 = dE/dy^ . dy^/da2 . da2/dh1
        self.dBCE_da1 = torch.matmul(self.e, self.w2.t()) * self.reLUPrime(self.a1) # e - 128x1, w2.t - 1,512 , a1 - 128,512
        self.w1_grads = torch.matmul(X.t(), self.dBCE_da1) # x.t - 784,128, dBCE_da1 128,512
        
        # Implement SGD here
        self.w1 -= self.learning_rate * self.w1_grads
        self.w2 -= self.learning_rate * self.w2_grads

    def train(self, X, y_hat):
        # Forward propagation
        y_hat = self.forward(X)
        # Backward propagation and gradient descent
        self.backward(X, y, y_hat)
        
        
def predict(nn_output: torch.Tensor):
    nn_output[nn_output > 0.5] = 1
    nn_output[nn_output < 0.5] = 0
    return nn_output.reshape(len(nn_output)).int()

def predict2(nn_output: torch.Tensor):
    return torch.argmax(nn_output, dim=1)
    
def accuracy(nn_output: torch.Tensor, ground_truth: torch.Tensor):
    # nn_out_classes = torch.argmax(nn_output, dim=1)
    nn_out_classes = predict(nn_output)
    # nn_out_classes = predict2(nn_output)
    # produce tensor of booleans - at which position of the nn output is the correct class located?
    correct_items = (nn_out_classes == ground_truth)
    # now getting the accuracy is easy, we just operate the sum of the tensor and divide it by the number of examples
    acc = correct_items.sum().item() / nn_output.shape[0]
    return acc

In [8]:
# training loop for MLP_Manual

k=3
device = "cuda:0"
model_a = MLP_Manual(k, device)
loss_fn = torch.nn.BCELoss() # or BCELoss with sigmoid activarion in last layer
#loss_fn = torch.nn.CrossEntropyLoss() # TODO: fix loss becomes nan
from scripts.train_utils import AverageMeter

for epoch in range(20):
    trainData = MNISTParity(trainset, k, batch_size)
    loss_meter = AverageMeter()
    performance_meter = AverageMeter()

    for X,y in trainData.loader:
        X = X.to(device)
        y = y.to(device)
        y_hat = model_a(X)
        loss = loss_fn(y_hat,y.reshape(len(y),1).float())
        #loss = loss_fn(y_hat,y).float()
        acc = accuracy(y_hat, y)
        loss_meter.update(val=loss, n=X.shape[0])
        performance_meter.update(val=acc, n=X.shape[0])
        model_a.train(X,y)
     
    #print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum:.4f} - average: {loss_meter.avg:.4f}")   
    print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum:.4f} - average: {loss_meter.avg:.4f}; Performance: {performance_meter.avg:.4f}")   

Epoch 1 completed. Loss - total: 49285.7461 - average: 0.8214; Performance: 0.5141
Epoch 2 completed. Loss - total: 39075.3945 - average: 0.6513; Performance: 0.5927
Epoch 3 completed. Loss - total: 36375.4375 - average: 0.6063; Performance: 0.6489
Epoch 4 completed. Loss - total: 34783.4062 - average: 0.5797; Performance: 0.6701
Epoch 5 completed. Loss - total: 34081.9805 - average: 0.5680; Performance: 0.6875
Epoch 6 completed. Loss - total: 33428.3711 - average: 0.5571; Performance: 0.6904
Epoch 7 completed. Loss - total: 33025.4414 - average: 0.5504; Performance: 0.6982
Epoch 8 completed. Loss - total: 32518.9297 - average: 0.5420; Performance: 0.7028
Epoch 9 completed. Loss - total: 32328.6992 - average: 0.5388; Performance: 0.7077
Epoch 10 completed. Loss - total: 32225.4688 - average: 0.5371; Performance: 0.7103
Epoch 11 completed. Loss - total: 31806.0059 - average: 0.5301; Performance: 0.7136
Epoch 12 completed. Loss - total: 31628.4824 - average: 0.5271; Performance: 0.7163
E

## For Cross Entropy

In [None]:
def crossEntropy(y, y_hat):
    """y : is the ground truth one-hot encoded
       y_hat is the softmax output of the network"""
    return -torch.sum(y * torch.log(y_hat))

def BCE(y, y_hat):
    return -torch.sum(y*torch.log(y_hat) + (1-y)*torch.log(1-y_hat))

def softmax(out):
    e = torch.exp(out)
    return e / torch.sum(e)

## MISSING ONE PART, LEFT IT LIKE THIS FOR NOW
def softmaxPrime(out):
    e = torch.exp(out)
    S = e / torch.sum(e)
    return S * (1-S)


def sigmoid(s):
    return 1 / (1 + torch.exp(-s))

In [None]:
trainData = MNISTParity(trainset, 1, 128)
X,y = next(iter(trainData.loader))

In [None]:
## WRITE THE CROSS ENTROPY BP PROCESS IN PAPER
## THERE IS A MISTAKE BP PROCESS
## BE CAREFUL ABOUT NOTATION AND MATRIX SIZES
## https://www.ics.uci.edu/~pjsadows/notes.pdf
import math

def reLU(s):
    s[s < 0] = 0
    return s.float()

def reLUPrime(s):
    s[s < 0] = 0
    s[s > 1] = 1
    return s.float()

w1 = torch.rand(784, 512)
w2 = torch.rand(512, 2)
stdv1 = 1. / math.sqrt(w1.size(1))
stdv2 = 1. / math.sqrt(w2.size(1))

w1.uniform_(-stdv1, +stdv1)
w2.uniform_(-stdv2, +stdv2)

for i in range(5):
    #Forward Pass
    flat = torch.nn.Flatten()
    X = flat(X)
    a1 = torch.matmul(X, w1)
    h1 = reLU(a1)
    ay = torch.matmul(h1, w2)
    y_hat = softmax(ay)
    #Backward
    e = y_hat - torch.nn.functional.one_hot(y)
#     print(torch.max(e), torch.min(e))
#     print(e.shape, X.shape)
    delta_a1 = torch.matmul(e, w2.t()) * softmaxPrime(h1)
    w1 -= 0.001 * torch.matmul(X.t(), delta_a1)
    w2 -= 0.001 * torch.matmul(h1.t(), e)
#   print(w1.shape)
#   print(torch.max(y_hat), torch.min(y_hat))
    print(crossEntropy(torch.nn.functional.one_hot(y), y_hat))
    print(accuracy(predict2(y_hat), y))

#     print(torch.max(delta_a1), torch.min(delta_a1))
    

In [None]:
torch.nn.functional.softmax(ay[0],dim=0)

In [None]:
deneme = ay[2]

In [None]:
deneme

In [None]:
torch.exp(deneme)

In [None]:
deneme

In [None]:

torch.nn.functional.softmax(deneme, dim = 0)