<a href="https://colab.research.google.com/github/dimidagd/gists/blob/main/Polyak_Ruppert_averaging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Polyak-Ruppert averaging



> Polyak-Ruppert averaging is a technique used in optimization that, instead of keeping only the final iterate, computes an average of the iterates (parameter vectors) obtained during the optimization process. Averaging the parameter vectors can improve generalization, stability, and robustness to hyperparameter choices. The technique is widely used in machine learning and optimization research and is applicable to various optimization algorithms. The example below uses an exponentially weighted variant: after every optimizer step the averaged parameters are updated with decay rate `alpha`.



In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
# Define a small feed-forward regression model (despite the class name it is not linear: two sigmoid hidden layers)
class LinearRegression(nn.Module):
    """Small feed-forward regressor from one input feature to one output.

    Despite the name, the network is non-linear: two hidden layers of 100
    units, each followed by a sigmoid, and a final linear read-out.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in this fixed order so that parameter
        # initialisation consumes the random generator identically.
        self.linear1 = nn.Linear(in_features=1, out_features=100)
        self.linear2 = nn.Linear(in_features=100, out_features=100)
        self.linear3 = nn.Linear(in_features=100, out_features=1)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Return the network output for input ``x`` (last dimension 1)."""
        # The raw input is added to the first layer's pre-activation; its
        # trailing dimension of 1 broadcasts across the 100 hidden units.
        first_hidden = self.activation(self.linear1(x) + x)
        second_hidden = self.activation(self.linear2(first_hidden))
        return self.linear3(second_hidden)

# Generate some dummy data (seeded so the run is reproducible)
torch.manual_seed(42)
x = torch.randn(1000, 1)
x_test = torch.randn(100, 1)


def F(input_features):
    """Return noisy regression targets for ``input_features``.

    The target is ``y = 3x + x^2 + eps^2`` where ``eps`` is drawn from a
    standard normal, one sample per row. The noise is squared, so it is
    non-negative and does not have zero mean.
    """
    noise = torch.randn(input_features.shape[0], 1) ** 2
    return 3 * input_features + input_features**2 + noise


y = F(x)
y_test = F(x_test)
# Initialize model, optimizer, and Polyak-Ruppert averaging parameters
model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.1)
# The running average starts from a detached copy of the initial weights
avg_params = [p.clone().detach() for p in model.parameters()]
alpha = 0.91  # Decay rate for averaging

# Training loop: full-batch gradient descent, one optimizer step per epoch
num_epochs = 1000
new_weight = 1 - alpha  # share of the running average given to the newest iterate
for epoch in tqdm(range(num_epochs)):
    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass on the whole training set
    outputs = model(x)
    loss = nn.functional.mse_loss(outputs, y)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Fold the freshly updated weights into the running average, in place:
    # avg <- alpha * avg + (1 - alpha) * p
    with torch.no_grad():
        for p, avg_p in zip(model.parameters(), avg_params):
            avg_p.mul_(alpha)
            avg_p.add_(p, alpha=new_weight)
# Compare the last-epoch weights with the averaged weights on the test set.
# Everything runs under no_grad: evaluation needs no autograd graph.
with torch.no_grad():
    # Predict with the last-epoch weights first, before they are overwritten
    y_test_pred_no_avg = model(x_test)

    # Set model parameters to the averaged parameters
    for p, avg_p in zip(model.parameters(), avg_params):
        p.copy_(avg_p)

    # Evaluate the model on some test data
    y_test_pred = model(x_test)
    # Signed sum of the prediction differences (positive and negative gaps can cancel)
    print("Difference between models",(y_test_pred_no_avg - y_test_pred).sum().item())
    criterion = nn.MSELoss()
    # Both reported values are root-mean-squared errors on the test targets
    loss = torch.sqrt(criterion(y_test_pred, y_test))
    loss_no_avg = torch.sqrt(criterion(y_test_pred_no_avg, y_test))
    print("loss with averaged parameters:",loss.item())
    print("loss in last epoch:",loss_no_avg.item())

100%|██████████| 1000/1000 [00:03<00:00, 300.92it/s]

Difference between models -18.67245864868164
loss with averaged parameters: 1.3458468914031982
loss in last epoch: 1.3584779500961304



