# LeNet-5 Evaluation on Fashion-MNIST

Classic LeNet-5 architecture (1998) for baseline comparison:
- 2 convolutional layers with average pooling
- 3 fully connected layers
- ~60K parameters

In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

batch_size = 64
num_classes = 10
learning_rate = 0.001

num_epochs = 20

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
all_transforms = transforms.Compose([transforms.Resize((32,32)),
                                     transforms.ToTensor(),
                                     transforms.Normalize(mean=[0.5],
                                                          std=[0.5])
                                     ])
train_dataset = torchvision.datasets.FashionMNIST(root = './data',
                                             train = True,
                                             transform = all_transforms,
                                             download = True)

test_dataset = torchvision.datasets.FashionMNIST(root = './data',
                                            train = False,
                                            transform = all_transforms,
                                            download=True)

train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                           batch_size = batch_size,
                                           shuffle = True)


test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                           batch_size = batch_size,
                                           shuffle = True)

In [3]:
class LeNet5(nn.Module):
    """Classic LeNet-5 architecture from 1998"""
    def __init__(self, num_classes):
        super(LeNet5, self).__init__()
        # First convolutional block
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.avgpool1 = nn.AvgPool2d(kernel_size=2, stride=2)
        
        # Second convolutional block
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.avgpool2 = nn.AvgPool2d(kernel_size=2, stride=2)
        
        # Fully connected layers
        # Input: 32x32 -> conv1: 28x28 -> pool: 14x14 -> conv2: 10x10 -> pool: 5x5
        # Flattened size: 16 * 5 * 5 = 400
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)
    
    def forward(self, x):
        # First conv block with tanh activation (original LeNet used tanh)
        out = torch.tanh(self.conv1(x))
        out = self.avgpool1(out)
        
        # Second conv block
        out = torch.tanh(self.conv2(out))
        out = self.avgpool2(out)
        
        # Flatten
        out = out.reshape(out.size(0), -1)
        
        # Fully connected layers
        out = torch.tanh(self.fc1(out))
        out = torch.tanh(self.fc2(out))
        out = self.fc3(out)
        return out

In [4]:
model = LeNet5(num_classes)
model = model.to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
print(f'Total parameters: {total_params:,}')

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.005, momentum = 0.9)  

total_step = len(train_loader)

Total parameters: 61,706


In [5]:
# Initial training run to test the model
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):  
        images = images.to(device)
        labels = labels.to(device)
        
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

Epoch [1/20], Loss: 1.0510
Epoch [2/20], Loss: 0.6688
Epoch [3/20], Loss: 0.5379
Epoch [4/20], Loss: 0.6628
Epoch [5/20], Loss: 0.7042
Epoch [6/20], Loss: 0.7732
Epoch [7/20], Loss: 0.5343
Epoch [8/20], Loss: 0.6167
Epoch [9/20], Loss: 0.7109
Epoch [10/20], Loss: 0.6341
Epoch [11/20], Loss: 0.2118
Epoch [12/20], Loss: 0.2745
Epoch [13/20], Loss: 0.3669
Epoch [14/20], Loss: 0.4451
Epoch [15/20], Loss: 0.3616
Epoch [16/20], Loss: 0.4710
Epoch [17/20], Loss: 0.2176
Epoch [18/20], Loss: 0.3557
Epoch [19/20], Loss: 0.3381
Epoch [20/20], Loss: 0.5895


In [6]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    print('Accuracy of the network on the {} test images: {:.2f} %'.format(10000, 100 * correct / total))

Accuracy of the network on the 10000 test images: 83.91 %


## Grid Search for Hyperparameter Tuning

Testing different combinations of:
- Optimizers: SGD, Adam
- Learning rates: 0.001, 0.01, 0.1
- Weight decay: 0, 0.005

Total configurations: 2 × 3 × 2 = 12

In [7]:
# Grid search hyperparameters
optimizers_to_test = ['sgd', 'adam']
learning_rates = [0.001, 0.01, 0.1]
weight_decays = [0, 0.005]

# Store results
results = []

print(f"Total configurations to test: {len(optimizers_to_test) * len(learning_rates) * len(weight_decays)}")

Total configurations to test: 12


In [8]:
import time

config_num = 0
for optimizer_name in optimizers_to_test:
    for lr in learning_rates:
        for wd in weight_decays:
            config_num += 1
            print(f"\n{'='*60}")
            print(f"Configuration {config_num}/12")
            print(f"Optimizer: {optimizer_name}, LR: {lr}, Weight Decay: {wd}")
            print(f"{'='*60}")
            
            # Create fresh model
            model = LeNet5(num_classes).to(device)
            
            # Create optimizer based on type
            if optimizer_name == 'sgd':
                optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9)
            else:  # adam
                optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
            
            criterion = nn.CrossEntropyLoss()
            
            # Training
            start_time = time.time()
            for epoch in range(num_epochs):
                model.train()
                for i, (images, labels) in enumerate(train_loader):
                    images = images.to(device)
                    labels = labels.to(device)
                    
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                    
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                
                if (epoch + 1) % 5 == 0:  # Print every 5 epochs
                    print(f'  Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
            
            training_time = time.time() - start_time
            
            # Evaluation on test set
            model.eval()
            with torch.no_grad():
                correct = 0
                total = 0
                for images, labels in test_loader:
                    images = images.to(device)
                    labels = labels.to(device)
                    outputs = model(images)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
                
                test_accuracy = 100 * correct / total
            
            print(f"  Test Accuracy: {test_accuracy:.2f}%")
            print(f"  Training Time: {training_time:.2f}s")
            
            # Store results
            results.append({
                'optimizer': optimizer_name,
                'learning_rate': lr,
                'weight_decay': wd,
                'test_accuracy': test_accuracy,
                'training_time': training_time
            })

print(f"\n{'='*60}")
print("Grid search completed!")
print(f"{'='*60}")


Configuration 1/12
Optimizer: sgd, LR: 0.001, Weight Decay: 0
  Epoch [5/20], Loss: 0.4220
  Epoch [10/20], Loss: 0.3296
  Epoch [15/20], Loss: 0.5577
  Epoch [20/20], Loss: 0.6093
  Test Accuracy: 86.26%
  Training Time: 91.79s

Configuration 2/12
Optimizer: sgd, LR: 0.001, Weight Decay: 0.005
  Epoch [5/20], Loss: 0.5239
  Epoch [10/20], Loss: 0.4589
  Epoch [15/20], Loss: 0.5131
  Epoch [20/20], Loss: 0.4805
  Test Accuracy: 84.43%
  Training Time: 92.16s

Configuration 3/12
Optimizer: sgd, LR: 0.01, Weight Decay: 0
  Epoch [5/20], Loss: 0.3436
  Epoch [10/20], Loss: 0.3595
  Epoch [15/20], Loss: 0.2057
  Epoch [20/20], Loss: 0.3170
  Test Accuracy: 89.95%
  Training Time: 91.45s

Configuration 4/12
Optimizer: sgd, LR: 0.01, Weight Decay: 0.005
  Epoch [5/20], Loss: 0.7747
  Epoch [10/20], Loss: 0.1770
  Epoch [15/20], Loss: 0.2289
  Epoch [20/20], Loss: 0.5715
  Test Accuracy: 85.71%
  Training Time: 91.58s

Configuration 5/12
Optimizer: sgd, LR: 0.1, Weight Decay: 0
  Epoch [5/20

In [9]:
# Display all results sorted by test accuracy
print("\n" + "="*80)
print("GRID SEARCH RESULTS - Sorted by Test Accuracy")
print("="*80)
print(f"{'Rank':<6} {'Optimizer':<10} {'LR':<10} {'Weight Decay':<13} {'Test Acc':<12} {'Time (s)':<10}")
print("-"*80)

# Sort results by test accuracy (descending)
sorted_results = sorted(results, key=lambda x: x['test_accuracy'], reverse=True)

for rank, result in enumerate(sorted_results, 1):
    print(f"{rank:<6} {result['optimizer']:<10} {result['learning_rate']:<10} "
          f"{result['weight_decay']:<13} {result['test_accuracy']:<12.2f} {result['training_time']:<10.2f}")

print("="*80)
print("\nBEST CONFIGURATION:")
best = sorted_results[0]
print(f"  Optimizer: {best['optimizer']}")
print(f"  Learning Rate: {best['learning_rate']}")
print(f"  Weight Decay: {best['weight_decay']}")
print(f"  Test Accuracy: {best['test_accuracy']:.2f}%")
print(f"  Training Time: {best['training_time']:.2f}s")
print("="*80)


GRID SEARCH RESULTS - Sorted by Test Accuracy
Rank   Optimizer  LR         Weight Decay  Test Acc     Time (s)  
--------------------------------------------------------------------------------
1      sgd        0.01       0             89.95        91.45     
2      adam       0.001      0             89.14        92.62     
3      sgd        0.001      0             86.26        91.79     
4      sgd        0.1        0             85.98        91.61     
5      sgd        0.01       0.005         85.71        91.58     
6      adam       0.001      0.005         85.18        93.66     
7      sgd        0.001      0.005         84.43        92.16     
8      adam       0.01       0.005         79.86        93.17     
9      sgd        0.1        0.005         78.95        91.97     
10     adam       0.01       0             77.16        92.89     
11     adam       0.1        0             51.26        93.05     
12     adam       0.1        0.005         43.66        93.31     



## Retrain Best Configuration and Save Model

Retraining the best configuration and saving the model for later use.

In [10]:
# Best configuration from grid search
best_optimizer = best['optimizer']
best_lr = best['learning_rate']
best_wd = best['weight_decay']

print("Training with best configuration...")
print(f"Optimizer: {best_optimizer}, LR: {best_lr}, Weight Decay: {best_wd}\n")

# Create fresh model
best_model = LeNet5(num_classes).to(device)

# Create optimizer
if best_optimizer == 'sgd':
    optimizer = torch.optim.SGD(best_model.parameters(), lr=best_lr, weight_decay=best_wd, momentum=0.9)
else:  # adam
    optimizer = torch.optim.Adam(best_model.parameters(), lr=best_lr, weight_decay=best_wd)

criterion = nn.CrossEntropyLoss()

# Training
for epoch in range(num_epochs):
    best_model.train()
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        outputs = best_model(images)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if (epoch + 1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Save the best model
torch.save(best_model.state_dict(), 'best_lenet5_model.pth')
print(f"\nModel saved to 'best_lenet5_model.pth'")

# Final evaluation
best_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = best_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    final_accuracy = 100 * correct / total
    print(f'Final Test Accuracy: {final_accuracy:.2f}%')

Training with best configuration...
Optimizer: sgd, LR: 0.01, Weight Decay: 0

Epoch [5/20], Loss: 0.2495
Epoch [10/20], Loss: 0.3509
Epoch [15/20], Loss: 0.3547
Epoch [20/20], Loss: 0.0842

Model saved to 'best_lenet5_model.pth'
Final Test Accuracy: 89.91%


## Comparison Notes

**LeNet-5 (1998)** vs **Custom CNN (2020s)**

Architecture differences:
- LeNet: 2 conv layers with AvgPool, tanh activation, ~60K params
- Custom CNN: 4 conv layers with MaxPool, ReLU activation, ~1M params

Expected results:
- LeNet-5: ~87-89% accuracy (classic baseline)
- Custom CNN: ~90%+ accuracy (modern improvements)

This demonstrates the impact of architectural evolution over 25+ years.