# Backpropagation

## We have to consider the following steps

## How to start now?

# MyImporting

In [2]:
import numpy as np
import random
import pandas as pd
import torch
import torch.nn as nn
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

## 1 Prepare your dataset

### 1.1 Data Split in train and test

In [5]:
# Your code
mnist = load_digits()
head = pd.DataFrame(mnist.data).head()
target = pd.DataFrame(mnist.target)

X = mnist.data
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = torch.tensor(y_train.values, dtype=torch.long)
y_test = torch.tensor(y_test.values, dtype=torch.long)

y_train = torch.nn.functional.one_hot(y_train)
y_test = torch.nn.functional.one_hot(y_test)

# CONTENT AND DIMENTION CHECK

# print(y_train.shape)

# mnist.keys() -> dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

# print(f'Pure Instance 1018 of the dataset: {X_train[1018]} \n with \n {y_train.iloc[1018]} \n and the 427 insatnce: {X_train[427]} \n with \n {y_train.iloc[427]}')

AttributeError: 'DataFrame' object has no attribute 'data'

### 1.2 Initialize weights and biases

In [9]:
# Initialization

W1 = nn.init.xavier_normal_(torch.empty(8,64))
W2 = nn.init.xavier_normal_(torch.empty(10,8))

b1 = np.random.randn(8)
b2 = np.random.randn(10)

b1 = torch.tensor(b1,dtype=torch.float32)
b2 = torch.tensor(b2, dtype=torch.float32)

# CONTENT AND DIMENTION CHECK

# print(f'Matrix W1: {W1} \n Matrix W2: {W2} \n Bias1: {b1} \n Bias2: {b2}')
print(f'Matrix W1 shape: {W1.shape} \n Matrix W2 shape: {W2.shape} \n Bias1 shape: {b1.shape} \n Bias2 shape: {b2.shape}')

# print(f'{W1.shape} X {X.shape}')
# print(f'Result is {W1.T * X}')

# print(f'Layer 1: \n\t Weights: \n \tNormal [first ten rows only]: \n\n {W1[:10,:]} \n\n \t Transposed: \n\n {W1.T}')
# print(f'Layer 1: \n\t Weights: \n \tNormal [first ten rows only]: \n\n {W2[:10,:]} \n\n \t Transposed: \n\n {W2.T}')

Matrix W1 shape: torch.Size([8, 64]) 
 Matrix W2 shape: torch.Size([10, 8]) 
 Bias1 shape: torch.Size([8]) 
 Bias2 shape: torch.Size([10])


## Building your neural network

In [None]:
# Your code

## Implement your loss function(s)

In [5]:
# Your code

# cel = torch.nn.CrossEntropyLoss()

def MyCrossEntropyLoss(y,y_hat):
    # lst = []
    # for i in range(len(y)):
    #     element = y[i] * np.log(yhat[i])
    #     lst.append(element)
    # return -np.sum(lst)

    return torch.sum(-y * torch.log(y_hat))

def sigmoid_deriv(grad_output, sigmoid_output):
    return grad_output * sigmoid_output * (1 - sigmoid_output)

def softmax_deriv(grad_output, softmax_output):
    return grad_output * softmax_output * (1 - softmax_output)

## Implement the training loop

In [8]:
# Your code

# Consider the following steps:
# 1) Loop through your training data
#   1. 1) Choose number of epochs (How often do you want to loop through your complete dataset?)
# 2) Forward the data through your network
# 3) Calculate the loss
# 4) Perform backpropagation with SGD and update the weights
#   4. 1) Choose a learning rate to update your weights
# Repeat 1, 2, 3, 4 until the training converges or maximum epochs are reached

# IMPLEMENTATION

learning_rate = 0.05
learning_rate = torch.tensor(learning_rate, dtype=torch.float32)

for i in range(1):

    # VISUALIZATION
    # print(f'BEGGIN: \n X: {x_rand, x_rand.shape} \n W1: {W1,W1.shape} \n b1: {b1, b1.shape} \n W2: {W2, W2.shape} \n b2: {b2, b2.shape}')

    # FORWARD
    random.seed(25)
    rnd = random.randint(0,len(X_train)) # -> 772th image
    # print(f'Random indice {rnd}')

    x_rand = X_train[rnd]
    x_rand = torch.tensor(x_rand, dtype=torch.float32) # Shape (1,64)

    target_rand = y_train[rnd] # -> target = 5 for random.seed(25)
    # print(target_rand)

    # MODEL
    Z1 = torch.matmul(x_rand, W1.T) + b1
    A1 = torch.sigmoid(Z1)
    Z2 = torch.matmul(A1, W2.T) + b2
    OUTPUT = nn.functional.softmax(Z2)

    # LOSS
    L = MyCrossEntropyLoss(target_rand,OUTPUT)
    # print(f'Z1: {Z1.shape} \n A1: {A1.shape} \n Z2: {Z2.shape} \n Output: {OUTPUT.shape} \n Loss: {L}')
    # # print(L)

    # # BACKPROPAGATION (Layer 2-1)
    dLdout = (OUTPUT - target_rand) / torch.matmul(OUTPUT,(1 - OUTPUT)) # entropy derivative 1x10
    doutdZ2 = OUTPUT * (1-OUTPUT) # Softmax derivative
    dZ2dW2 = A1.T # shape 8,
    dZ2db2 = 1 # no shape as its an int
    # print(f'DERIVATIVES 1: \n dLdout:{dLdout} with shape {dLdout.shape} -> \n doutdZ2:{doutdZ2} with shape {doutdZ2.shape} -> \n dZ2dW2:{dZ2dW2} with shape {dZ2dW2.shape} -> \n dZ2db2:{dZ2db2} with no shape as its an int \n')

    # GRADIENTS
    result_of_grads = dLdout * doutdZ2
    gradb2 = dZ2db2 * result_of_grads # Shape 1,
    gradW2 = dZ2dW2 * result_of_grads.T # Shape 8,
    # # print(f'GRADIENTS 1: \n Gradb2: {gradb2} and shape {gradb2.shape} \n GradW2: {gradW2} and shape {gradW2.shape}')

    # # BACKPROPAGATION (Layer 1-0)
    dLdz2 = dLdout * doutdZ2 # 1x10
    dZ2dA1 = W2 # 10x8
    dA1dZ1 = A1 * (1-A1) # 1x8
    dZ1dW1 = x_rand # 1x64
    dZ1db1 = 1
    # # print(f'DERIVATIVES 2: \n dLdZ2:{dLdz2} with shape {dLdz2.shape} -> \n dZ2dA1:{dZ2dA1} with shape {dZ2dA1.shape} -> \n dA1dZ1:{dA1dZ1} with shape {dA1dZ1.shape} -> \n dZ1dW1:{dZ1dW1} with shape {dZ1dW1.shape} -> \n dZ1db1:{dZ1db1} with no shape as its an int\n')

    # # GRADIENTS
    gradb1 = ((dZ1db1 * dA1dZ1) @ dZ2dA1.T) * dLdz2 # 1x10
    gradW1 = ((dZ1dW1 * dA1dZ1) @ dZ2dA1.T) * dLdz2 
    # # print(f'GRADIENTS 2: \n gradb1: {gradb1} and shape {gradb1.shape} \n gradW1: {gradW1} and shape {gradW1.shape}')

    # # UPDATES
    b2 = b2 - (learning_rate * gradb2) # 1x10 - (1x10) = 1x10
    W2 = W2 - (learning_rate * gradW2)
    # print(f'UPDATES 1: \n b2: {b2} with shape {b2.shape} \n W2: {W2} with shape {W2.shape}')
    
    # # UPDATES
    b1 = b1 - (learning_rate * gradb1) # Shape
    W1 = W1 - (learning_rate * gradW1) # Shape
    # print(f'UPDATES 2: \n b1: {b1} with shape {b1.shape} \n W1: {W1} with shape {W1.shape}')

    # print(f'END: \n X: {x_rand, x_rand.shape} \n W1: {W1,W1.shape} \n b1: {b1, b1.shape} \n W2: {W2, W2.shape} \n b2: {b2, b2.shape}')
[random_point,:]

DERIVATIVES 1: 
 dLdout:tensor([[ 0.2007, -0.8818,  0.1681,  0.0089,  0.0467,  0.0616,  0.2160,  0.1172,
          0.0211,  0.0416]]) with shape torch.Size([1, 10]) -> 
 doutdZ2:0.8347920775413513 with shape torch.Size([]) -> 
 dZ2dW2:tensor([9.8556e-01, 5.2566e-01, 3.6379e-01, 1.6674e-03, 1.0929e-02, 1.4696e-11,
        1.0000e+00, 1.3319e-02]) with shape torch.Size([8]) -> 
 dZ2db2:1 with no shape as its an int 



  OUTPUT = nn.functional.softmax(Z2)
