<a href="https://colab.research.google.com/github/davidraamirez/GradientWithoutBackpropagation/blob/main/LogisticRegression_fwd_gradient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
import tqdm
import torch.distributions as distr

In [2]:
%pip install torchmetrics --quiet

In [3]:
import torchmetrics
import torchvision
from torchvision import transforms as T

Loading and preprocessing the data

In [4]:
#Load the dataset
train_data = torchvision.datasets.KMNIST('./data', train=True, download=True)

In [5]:
# This loads data with both data conversion.
train_data = torchvision.datasets.KMNIST('./data', train=True, transform=T.ToTensor())

In [6]:
# Loaders are used to shuffle, batch, and possibly sample the elements of the dataset
train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)

In [7]:
xb, yb = next(iter(train_loader))
print(xb.shape)
print(yb.shape)

torch.Size([8, 1, 28, 28])
torch.Size([8])


In [8]:
# Loading the test data is similar, but (a) we do not apply data augmentation,
# and (b) we do not shuffle when building the mini-batches.
test_data = torchvision.datasets.KMNIST('./data', train=False, transform=T.ToTensor())
test_loader = torch.utils.data.DataLoader(test_data, batch_size=8, shuffle=False)

Define Logistic Regression

In [9]:
from torch import nn
from torch.nn import functional as F

In [10]:
class SimpleLogisticRegression(nn.Module):
  def __init__(self, input_size, w, b):
    super(SimpleLogisticRegression, self).__init__()
    input_size = 28
    self.linear = nn.Linear(input_size, 28*28)
    self.linear.weight = nn.Parameter(w)
    self.linear.bias = nn.Parameter(b)


  def forward(self, x):
    x = x.reshape(-1, 28*28)
    return torch.softmax(self.linear(x), 1)

In [11]:
# We check if CUDA is available. If you do not see it,
# activate a GPU from Runtime >> Change runtime type and 
# restart the notebook.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


Initialize the parameters

In [43]:
# We initialize the parameters randomly and the model with an input size
w = torch.randn((10, 28*28), requires_grad=False)
b = torch.randn(10, requires_grad=False)
LG = SimpleLogisticRegression(1, w, b).to(device)

In [44]:
# Note: we also need to move data when asking for a prediction
LG(xb.to(device)).shape

torch.Size([8, 10])

Train and evaluate the network with forward gradient

In [45]:
def accuracy(net, loader, device):
  # A function that aggregates the accuracy over all mini-batches in the loader.
  # See here for a quick-start on torchmetrics: https://torchmetrics.readthedocs.io/en/stable/pages/quickstart.html.
  #acc = torchmetrics.Accuracy().to(device)
  acc = torchmetrics.Accuracy('multiclass', num_classes=10).to(device)
  for xb, yb in loader:
      xb, yb = xb.to(device), yb.to(device)
      ypred = LG(xb)
      _ = acc(ypred, yb)
  return acc.compute()

In [46]:
# Average accuracy at initialization is 10% (random guessing).
accuracy(LG, test_loader, device)

tensor(0.0812, device='cuda:0')

Define Cross Entropy

In [47]:
# Note: it is important to move the CNN to the device before initializing the optimizer,
# since the optimizer also has a state that must be moved to the GPU.
loss = nn.CrossEntropyLoss()

In [48]:
def beale_function(x):
  return (torch.pow(torch.tensor([1.5])-x[0]+x[0]*x[1],2) + torch.pow(torch.tensor([2.25])-x[0]+x[0]*torch.pow(x[1],2),2)+torch.pow(torch.tensor([2.625])-x[0]+x[0]*torch.pow(x[1],3),2))

In [49]:
def rosenbrock_function(x):
  sum=0
  for i in range (x.size(0) -1):
    sum += (100*torch.pow(x[i+1] - torch.pow(x[i], 2), 2) + torch.pow(x[i]-1, 2))
  return sum

In [95]:
def train_fwd_gradient(x, y):
  x, y = x.to(device), y.to(device)

  l_rate0 = 0.0025
  f = rosenbrock_function

  #Parameters
  w = torch.rand((8, 28*28), requires_grad=False)
  w = torch.div(w, torch.norm(w, 2))

  b = torch.rand(8, requires_grad=False)
  b = torch.div(b, torch.norm(b, 2))

  LG = SimpleLogisticRegression(1, w, b).to(device)
  w1 = w.reshape(-1)
  #w1 = torch.cat((w1, b), 0)

  error = torch.norm(LG(x) - y, 2)
  t=torch.tensor([0])

  while (error>1e-3) :

    t=t+1
    v=torch.diagonal(torch.normal(torch.zeros_like(w1),torch.eye(w1.shape[0])))

    ftw1 = f(w1)
    ftb = f(b)
    print('ftw1', ftw1)
    print('ftb', ftb)
    dt=torch.tensor(torch.autograd.functional.jvp(f,w1,v)[1])
    print('dt', dt)
    gt=v*dt
    w1 = w1 - l_rate0*gt

    w = w1[:8*28*28]
    w = w.reshape(-1, 28*28)
    b = w1[(8*28*28):]

    LG = SimpleLogisticRegression(1, w, b).to(device)
    error = torch.norm(LG(x) - y, 2)
    print('error', error)

  return w, b

In [96]:
for epoch in range(1):

  LG.train()
  for i in range(1):
    xb, yb = next(iter(train_loader))
    xb = xb.to(device)
    yb = yb.to(device)

    w, b = train_fwd_gradient(xb, yb)
    LG = SimpleLogisticRegression(1, w, b)

ft tensor(6314.2305)


  dt=torch.tensor(torch.autograd.functional.jvp(f,w1,v)[1])


dt tensor(12.9284)
error tensor(44.9298, device='cuda:0', grad_fn=<NormBackward1>)
ft tensor(6951.1831)
dt tensor(-596.4167)
error tensor(44.7646, device='cuda:0', grad_fn=<NormBackward1>)
ft tensor(10729264.)
dt tensor(914927.)
error tensor(44.6990, device='cuda:0', grad_fn=<NormBackward1>)
ft tensor(5.0900e+19)
dt tensor(-1.2355e+15)
error tensor(45.0111, device='cuda:0', grad_fn=<NormBackward1>)
ft tensor(inf)
dt tensor(nan)
error tensor(nan, device='cuda:0', grad_fn=<NormBackward1>)
