<a href="https://colab.research.google.com/github/giuliottnl99/2017-07-10/blob/master/Distributed_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision
!pip install matplotlib numpy pandas



In [2]:
#IMPORT SECTION

import torch
import torchvision
from torchvision.transforms import Compose, ToTensor, Normalize
from torchvision import datasets
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import sys

In [3]:
#LeNet model definition SECTION

import torch
import torch.nn as nn
import torch.nn.functional as F

class LeNet5(nn.Module):
    def __init__(self):
        super(LeNet5, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5)  # CIFAR-100 has 3 input channels (RGB)
        self.pool = nn.MaxPool2d(2, 2)              # 2x2 Max pooling
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)       # Fully connected layer
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 100)               # Output for CIFAR-100 (100 classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))        # Convolution + ReLU + Pooling
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)                  # Flatten for fully connected layers
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)                             # No activation (applied in loss function)
        return x


In [4]:
# Defining transforms for data preprocessing SECTION
transform = Compose([
    ToTensor(),
    Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Downloading and loading the CIFAR-100 dataset
train_dataset = datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169M/169M [00:03<00:00, 48.3MB/s]


Extracting ./data/cifar-100-python.tar.gz to ./data
Files already downloaded and verified


In [10]:
#model architecture SECTION
from torchvision.models import resnet18

#ResNet model
model = LeNet5()  # initialize model

In [6]:
#using SGD for a baseline , for optimizers we want to test
#executed for basic SGD optimization
criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9) # better to reduce lr because it is not working a so high lr
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.0)

In [None]:
#training SECTION
best_model_path = 'best_model.pth'
last_model_path = 'last_model.pth'
starting_model='model_5.pth'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.load_state_dict(torch.load(starting_model, weights_only=True, map_location=torch.device(device)))
model.to(device)
best_loss = float('inf')

for epoch in range(200):  # Train for x epochs
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss +=  loss.item()
        torch.save(model.state_dict(), last_model_path)

    epoch_loss = loss.item()
    if best_loss > epoch_loss:
        best_loss = epoch_loss
        torch.save(model.state_dict(), best_model_path)
        print(f"Best model saved with loss = {best_loss}")

    print(f"Epoch {epoch+1}: Loss = {loss.item()}")
print("Best model loaded.")


Best model saved with loss = 0.9293094277381897
Epoch 1: Loss = 0.9293094277381897
Epoch 2: Loss = 1.8812881708145142
Epoch 3: Loss = 1.416412591934204
Epoch 4: Loss = 2.3195698261260986
Epoch 5: Loss = 1.3503525257110596
Epoch 6: Loss = 1.6474361419677734
Epoch 7: Loss = 1.2421700954437256
Epoch 8: Loss = 1.0555882453918457
Epoch 9: Loss = 1.4998509883880615
Best model saved with loss = 0.7105939388275146
Epoch 10: Loss = 0.7105939388275146
Epoch 11: Loss = 1.2011522054672241
Epoch 12: Loss = 1.0918103456497192
Epoch 13: Loss = 2.4309310913085938
Epoch 14: Loss = 1.7491779327392578
Epoch 15: Loss = 1.1952110528945923
Epoch 16: Loss = 2.225224256515503
Epoch 17: Loss = 0.8707416653633118
Epoch 18: Loss = 0.8042885661125183
Epoch 19: Loss = 1.4188737869262695
Epoch 20: Loss = 1.485187292098999
Epoch 21: Loss = 1.1987634897232056
Epoch 22: Loss = 1.227553129196167
Best model saved with loss = 0.6860207915306091
Epoch 23: Loss = 0.6860207915306091
Epoch 24: Loss = 1.3345681428909302
Epoch

In [None]:
#training fedAVG SECTION
best_model_path = 'best_model.pth'
last_model_path = 'last_model.pth'
starting_model='model_2.pth'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model.load_state_dict(torch.load(starting_model, weights_only=True))
model.to(device)
local_models = [model.to(device) for _ in range(10)]  # 10 local models (machines)
best_loss = float('inf')

for epoch in range(100):  # Train for x epochs
  model.train()
  local_losses = []
  for i, local_model in enumerate(local_models):
      local_model.load_state_dict(model.state_dict())  # Sync with the global model before each round
      local_optimizer = torch.optim.SGD(local_model.parameters(), lr=0.01, momentum=0.9)  # Local optimizer
      local_loss = 0.0

      for inputs, targets in train_loader:
          inputs, targets = inputs.to(device), targets.to(device)
          local_optimizer.zero_grad()
          outputs = local_model(inputs)
          loss = criterion(outputs, targets)
          loss.backward()
          local_optimizer.step()
          torch.save(model.state_dict(), last_model_path)

      local_losses.append(loss.item())
      # torch.save(local_model.state_dict(), f'local_model_{i}_last.pth') #better not to save!
  global_dict = model.state_dict()
  for key in global_dict:
    # Average the weights of all local models for each parameter
    global_dict[key] = torch.mean(torch.stack([local_models[i].state_dict()[key].float() for i in range(10)]), dim=0)
  model.load_state_dict(global_dict)

  # Save the updated global model after aggregation
  torch.save(model.state_dict(), last_model_path)

  epoch_loss = np.mean(local_losses) #this is just an average, but can be useful to find the best set
  if best_loss > epoch_loss:
      best_loss = epoch_loss
      torch.save(model.state_dict(), best_model_path)
      print(f"Best model saved with loss = {best_loss}")

      print(f"Epoch {epoch+1}: Loss = {loss.item()}")
print("Best model loaded.")


KeyboardInterrupt: 

In [None]:
#implement current best or last model:
starting_model='model_5.pth'
model.load_state_dict(torch.load(starting_model, weights_only=True))


<All keys matched successfully>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#evaluating SECTION
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

print(f"Test Accuracy: {100 * correct / total}%")

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [None]:
#computing accuracy on training set SECTION

#just to see hwo accurate the model is on the training data (to understand if a low accuracy on the test set is given by overfitting or underfitting)
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

print(f"Test Accuracy on training set: {100 * correct / total}%")

Test Accuracy on training set: 60.354%


In [None]:
#ENV variable SECTION
import os
import torch.distributed as dist

# Set environment variables for a single-node simulation
os.environ['RANK'] = '0'
os.environ['WORLD_SIZE'] = '1'
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'

# Initialize the distributed process
dist.init_process_group(backend='nccl', init_method='env://')

# Wrap your model in DistributedDataParallel
from torch.nn.parallel import DistributedDataParallel as DDP
model = DDP(model)

In [None]:
torch.save(model.state_dict(), 'model.pth')  # Save it locally in Colab