<a href="https://colab.research.google.com/github/davidraamirez/GradientWithoutBackpropagation/blob/main/NN_fwd_gradient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
import tqdm
import torch.distributions as distr

In [2]:
%pip install torchmetrics --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/517.2 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m481.3/517.2 KB[0m [31m15.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.2/517.2 KB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import torchmetrics
import torchvision
from torchvision import transforms as T

Loading and preprocessing the data

In [4]:
#Load the dataset
train_data = torchvision.datasets.KMNIST('./data', train=True, download=True)

Downloading http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz
Downloading http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz to ./data/KMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/18165135 [00:00<?, ?it/s]

Extracting ./data/KMNIST/raw/train-images-idx3-ubyte.gz to ./data/KMNIST/raw

Downloading http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz
Downloading http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz to ./data/KMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29497 [00:00<?, ?it/s]

Extracting ./data/KMNIST/raw/train-labels-idx1-ubyte.gz to ./data/KMNIST/raw

Downloading http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz
Downloading http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz to ./data/KMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/3041136 [00:00<?, ?it/s]

Extracting ./data/KMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/KMNIST/raw

Downloading http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz
Downloading http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz to ./data/KMNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/5120 [00:00<?, ?it/s]

Extracting ./data/KMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/KMNIST/raw



In [5]:
# This loads data with both data conversion.
train_data = torchvision.datasets.KMNIST('./data', train=True, transform=T.ToTensor())

In [6]:
# Loaders are used to shuffle, batch, and possibly sample the elements of the dataset
train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)

In [7]:
xb, yb = next(iter(train_loader))
print(xb.shape)
print(yb.shape)

torch.Size([8, 1, 28, 28])
torch.Size([8])


In [8]:
# Loading the test data is similar, but (a) we do not apply data augmentation,
# and (b) we do not shuffle when building the mini-batches.
test_data = torchvision.datasets.KMNIST('./data', train=False, transform=T.ToTensor())
test_loader = torch.utils.data.DataLoader(test_data, batch_size=8, shuffle=False)

Define MultiLayer Neural Network

In [9]:
from torch import nn
from torch.nn import functional as F

In [22]:
class MNN(nn.Module):
  def __init__(self, input_size, fc1w, fc1b, fc2w, fc2b, fc3w, fc3b):
    super().__init__()
    input_size = 1

    self.fc1 = nn.Linear(input_size, 1024)
    self.fc1.weight = torch.nn.Parameter(fc1w)
    self.fc1.bias = torch.nn.Parameter(fc1b)
    
    self.fc2 = nn.Linear(1024, 1024)
    self.fc2.weight = torch.nn.Parameter(fc2w)
    self.fc2.bias = torch.nn.Parameter(fc2b)

    self.fc3 = nn.Linear(1024, 10)
    self.fc3.weight = torch.nn.Parameter(fc3w)
    self.fc3.bias = torch.nn.Parameter(fc3b)

  def forward(self, x):
    x = x.reshape((-1, 1*28*28))
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    return self.fc3(x)

In [23]:
# We check if CUDA is available. If you do not see it,
# activate a GPU from Runtime >> Change runtime type and 
# restart the notebook.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


Initialize the parameters

In [36]:
# We initialize the parameters randomly and the model with an input size
fc1w = torch.FloatTensor(1024, 28*28).uniform_(-1, 1)
fc1b = torch.FloatTensor(1024).uniform_(-1, 1)
fc2w = torch.FloatTensor(1024, 1024).uniform_(-1, 1)
fc2b = torch.FloatTensor(1024).uniform_(-1, 1)
fc3w = torch.FloatTensor(10, 1024).uniform_(-1, 1)
fc3b = torch.FloatTensor(10).uniform_(-1, 1)
mnn = MNN(1, fc1w, fc1b, fc2w, fc2b, fc3w, fc3b).to(device)

In [25]:
# Note: we also need to move data when asking for a prediction
print(mnn(xb.to(device)).shape)

torch.Size([8, 10])


Train and evaluate the network with forward gradient

In [27]:
def accuracy(net, loader, device):
  # A function that aggregates the accuracy over all mini-batches in the loader.
  # See here for a quick-start on torchmetrics: https://torchmetrics.readthedocs.io/en/stable/pages/quickstart.html.
  #acc = torchmetrics.Accuracy().to(device)
  acc = torchmetrics.Accuracy('multiclass', num_classes=10).to(device)
  for xb, yb in loader:
      xb, yb = xb.to(device), yb.to(device)
      ypred = mnn(xb)
      _ = acc(ypred, yb)
  return acc.compute()

In [28]:
# Average accuracy at initialization is 10% (random guessing).
accuracy(mnn, test_loader, device)

tensor(0.0844)

In [29]:
# Note: it is important to move the CNN to the device before initializing the optimizer,
# since the optimizer also has a state that must be moved to the GPU.
loss = nn.CrossEntropyLoss()

In [30]:
def beale_function(x):
  return (torch.pow(torch.tensor([1.5])-x[0]+x[0]*x[1],2) + torch.pow(torch.tensor([2.25])-x[0]+x[0]*torch.pow(x[1],2),2)+torch.pow(torch.tensor([2.625])-x[0]+x[0]*torch.pow(x[1],3),2))

In [31]:
def rosenbrock_function(x):
  sum=0
  for p in x.size():
    for i in range (x.size(1)-1):
      sum += (100*torch.pow(x[i+1] - torch.pow(x[i], 2), 2) + torch.pow(x[i]-1, 2))
  return sum

In [32]:
from functorch import jvp

In [40]:
def train_fwd_gradient(x, y):
  x, y = x.to(device), y.to(device)
  x = x / 255

  l_rate0 = 0.025
  f = rosenbrock_function

  #Parameters
  fc1w = torch.randn((1024, 28*28), requires_grad=False)
  fc1b = torch.randn(1024, requires_grad=False)
  fc2w = torch.randn((1024, 1024), requires_grad=False)
  fc2b = torch.randn(1024, requires_grad=False)
  fc3w = torch.randn((10, 1024), requires_grad=False)
  fc3b = torch.randn(10, requires_grad=False) 

  fc1w1 = fc1w.reshape(-1)
  fc2w1 = fc2w.reshape(-1)
  fc3w1 = fc3w.reshape(-1)

  mnn = MNN(1, fc1w, fc1b, fc2w, fc2b, fc3w, fc3b).to(device)
  print(mnn(x).shape)
  print(y.shape)
  error = torch.norm(mnn(x)-y, 2)

  t=torch.tensor([0])

  while (error>1e-3) :

    t=t+1

    vfc1w1=torch.diagonal(torch.normal(torch.zeros_like(fc1w1),torch.eye(fc1w1.shape[0])))
    vfc1b=torch.diagonal(torch.normal(torch.zeros_like(fc1b),torch.eye(fc1b.shape[0])))
    vfc2w1=torch.diagonal(torch.normal(torch.zeros_like(fc2w1),torch.eye(fc2w1.shape[0])))
    vfc2b=torch.diagonal(torch.normal(torch.zeros_like(fc2b),torch.eye(fc2b.shape[0])))
    vfc3w1=torch.diagonal(torch.normal(torch.zeros_like(fc3w1),torch.eye(fc3w1.shape[0])))
    vfc3b=torch.diagonal(torch.normal(torch.zeros_like(fc3b),torch.eye(fc3b.shape[0])))

    ftfc1w1=f(fc1w1)
    ftfc1b=f(fc1b)
    ftfc2w1=f(fc2w1)
    ftfc2b=f(fc2b)
    ftfc3w1=f(fc3w1)
    ftfc3b=f(fc3b)

    dtfc1w1=torch.tensor(jvp(f,(fc1w1, ), (vfc1w1, ))[1])
    dtfc1b=torch.tensor(jvp(f,(fc1b, ), (vfc1b, ))[1])
    dtfc2w1=torch.tensor(jvp(f,(fc2w1, ), (vfc2w1, ))[1])
    dtfc2b=torch.tensor(jvp(f,(fc2b, ), (vfc2b, ))[1])
    dtfc3w1=torch.tensor(jvp(f,(fc3w1, ), (vfc3w1, ))[1])
    dtfc3b=torch.tensor(jvp(f,(fc3b, ), (vfc3b, ))[1])

    gtfc1w1 = vfc1w1*dtfc1w1
    gtfc1b = vfc1b*dtfc1b
    gtfc2w1 = vfc2w1*dtfc2w1
    gtfc2b = vfc2b*dtfc2b
    gtfc3w1 = vfc3w1*dtfc3w1
    gtfc3b = vfc3b*dtfc3b

    fc1w1 -= l_rate0*gtfc1w1
    fc1b -= l_rate0*gtfc1b
    fc2w1 -= l_rate0*gtfc2w1
    fc2b -= l_rate0*gtfc2b
    fc3w1 -= l_rate0*gtfc3w1
    fc3b -= l_rate0*gtfc3b

    fc1w = fc1w1.reshape(-1, 28*28)
    fc2w = fc2w1.reshape(-1, 1024)
    fc3w = fc3w1.reshape(-1, 1024)

    mnn = MNN(1, fc1w, fc1b, fc2w, fc2b, fc3w, fc3b).to(device)
    error = torch.norm(mnn(x)-y, 2)

  return fc1w, fc1b, fc2w, fc2b, fc3w, fc3b 

In [41]:
for epoch in range(1):

  mnn.train()
  for i in range(1):
    xb, yb = next(iter(train_loader))
    xb = xb.to(device)
    yb = yb.to(device)

    fc1w, fc1b, fc2w, fc2b, fc3w, fc3b = train_fwd_gradient(xb, yb)
    mnn = MNN(1, fc1w, fc1b, fc2w, fc2b, fc3w, fc3b).to(device)

    #Update cnn parameters
    #Recalculate ypred and loss
    #MIRAR NN_LAB_LOGISITC_REGRESSION
    #CALCULAR G(THETA) QUE ES EL GRADIENTE Y APLICARLO A LOS PARAMETROS DEL CNN, LOS WEIGHTS

torch.Size([8, 10])
torch.Size([8])


RuntimeError: ignored