In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset


In [11]:
# Make sure model and data are both on either cpu or gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
# Preparing the MNIST dataset
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# Load full MNIST dataset
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)

# Reduce training dataset to 20.000 examples
num_examples = 20000
indices = torch.randperm(len(train_dataset))[:num_examples] 
train_subset = Subset(train_dataset, indices)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)


In [13]:
class MLP_RFA(nn.Module):
    def __init__(self, input_size=784, hidden1=256, hidden2=128, output_size=10): # 784 because MNIST images are 28x28 pixels
        super(MLP_RFA, self).__init__() # run all setup code of the parent class before adding custom parts, to make sure my class inherits its full functionality
        self.fc1 = nn.Linear(input_size, hidden1)  # first hidden layer
        self.fc2 = nn.Linear(hidden1, hidden2)     # second hidden layer
        self.fc3 = nn.Linear(hidden2, output_size) # output layer

    def forward(self, x):
        x = x.view(-1, 784)
        h1 = torch.relu(self.fc1(x))
        h2 = torch.relu(self.fc2(h1))
        out = self.fc3(h2)
        return out, h1, h2  # return activations for RFA


In [14]:
# Instantiate model
model = MLP_RFA().to(device)

In [15]:
# Create Feedback matrices: 10x256 tensor with random values from standard normal distribution with mean 0 and sd 0.1
B1 = torch.randn(10, 256, device=device) * 0.1 # Feedback Matrix for first hidden layer
B2 = torch.randn(10, 128, device=device) * 0.1 # Feedback Matrix for second hidden layer


In [17]:
# Training Loop with RFA

criterion = nn.CrossEntropyLoss() # Loss Function
lr = 1e-3 # Learning rate

for epoch in range(5):
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        
        # Forward pass
        output, h1, h2 = model(data)
        loss = criterion(output, target)
        
        # Zero gradients manually
        for param in model.parameters():
            param.grad = None

        # Compute output error
        e = torch.zeros_like(output)
        e[range(target.shape[0]), target] = 1
        e = output.softmax(dim=1) - e  # cross-entropy gradient approximation

        # Compute gradients via random feedback alignment
        # Output layer gradients (standard)
        grad_fc3_w = torch.matmul(e.T, h2)
        grad_fc3_b = e.sum(0)

        # Hidden layer 2 gradients via B2
        delta_h2 = torch.matmul(e, B2) * (h2 > 0).float()
        grad_fc2_w = torch.matmul(delta_h2.T, h1)
        grad_fc2_b = delta_h2.sum(0)

        # Hidden layer 1 gradients via B1
        delta_h1 = torch.matmul(e, B1) * (h1 > 0).float()
        x_flat = data.view(-1, 784)
        grad_fc1_w = torch.matmul(delta_h1.T, x_flat)
        grad_fc1_b = delta_h1.sum(0)

        # Update weights manually
        model.fc3.weight.data -= lr * grad_fc3_w / data.size(0)
        model.fc3.bias.data   -= lr * grad_fc3_b / data.size(0)

        model.fc2.weight.data -= lr * grad_fc2_w / data.size(0)
        model.fc2.bias.data   -= lr * grad_fc2_b / data.size(0)

        model.fc1.weight.data -= lr * grad_fc1_w / data.size(0)
        model.fc1.bias.data   -= lr * grad_fc1_b / data.size(0)

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 0.5432
Epoch 2, Loss: 0.5051
Epoch 3, Loss: 0.1577
Epoch 4, Loss: 0.3688
Epoch 5, Loss: 0.3634
