# Lesson 6: Tricks of the trade

In this course we will cover:
1. Learning rate scheduler/Dynamic learning rate
2. Dropout (improves validation performance)
3. Residual connections (useful everywhere)
4. Batch normalisation (CNN), layer normalisation (Transformer) → all kinds of normalisation
5. initialisation
6. early stopping
7. Transfer learning
8. Augmentation



In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import scipy.signal as signal
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# Define device: prefer CUDA, then Apple MPS, then MTIA, and fall back to CPU.
# `torch.backends.mps` and `torch.mtia` are not present in every PyTorch
# release, so they are looked up defensively; accessing them directly would
# raise AttributeError on builds that lack them.
_mps_backend = getattr(torch.backends, "mps", None)
_mtia_backend = getattr(torch, "mtia", None)

if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif _mtia_backend is not None and _mtia_backend.is_available():
    device = torch.device("mtia")
else:
    device = torch.device("cpu")

In [None]:
# Creating the data
def generate_waveform(wave_type, length, fs):
    """Return one noisy waveform of the requested shape as a 1-D numpy array.

    Frequency (1-10), amplitude (0.5-1.5) and phase (0-2*pi) are drawn from
    NumPy's global random state; Gaussian noise with std 0.05 is added on top.

    Args:
        wave_type: 'sine', 'triangle' or 'square'.
        length: number of samples; the time axis always covers [0, 1).
        fs: accepted but not used by this function.

    Raises:
        ValueError: if wave_type is not one of the supported names.
    """
    time_axis = np.linspace(0, 1, length, endpoint=False)

    # The draw order (frequency, amplitude, phase, then noise) is fixed so
    # that a seeded run always yields the same samples.
    frequency = np.random.uniform(1, 10)
    amplitude = np.random.uniform(0.5, 1.5)
    phase_offset = np.random.uniform(0, 2*np.pi)
    phase = 2 * np.pi * frequency * time_axis + phase_offset

    shape_functions = {
        'sine': np.sin,
        # width 0.5 turns the sawtooth into a symmetric triangle
        'triangle': lambda p: signal.sawtooth(p, 0.5),
        'square': signal.square,
    }
    if wave_type not in shape_functions:
        raise ValueError("Unknown wave type")

    clean = amplitude * shape_functions[wave_type](phase)
    return clean + np.random.normal(0, 0.05, length)

# Parameters
num_samples = 200      # samples per waveform type
length = 128           # number of points per waveform
fs = 128               # sampling frequency

# Build the whole dataset in memory, one class after the other.
# The position of a name in wave_types is its class label:
# 0=sine, 1=triangle, 2=square
wave_types = ['sine', 'triangle', 'square']
X = []
y = []

for idx, wave in enumerate(wave_types):
    X.extend(generate_waveform(wave, length, fs) for _ in range(num_samples))
    y.extend([idx] * num_samples)

# Stack into numpy arrays
X = np.array(X)   # shape: (600, 128)
y = np.array(y)   # shape: (600,)

In [None]:
# Dataset wrapper (torch.utils.data.Dataset) around the in-memory waveforms
class WaveformDataset(Dataset):
    """Stores waveforms and labels as tensors and serves (waveform, label) pairs."""

    def __init__(self, X, y):
        # float32 samples with a new axis inserted at position 1 (channel axis)
        samples = torch.tensor(X, dtype=torch.float32)
        self.X = torch.unsqueeze(samples, 1)
        # integer class labels as int64
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label
    
dataset = WaveformDataset(X, y)

# 60 % training, 20 % validation, the remainder is the test split
n_total = len(dataset)
train_size = int(0.6 * n_total)
valid_size = int(0.2 * n_total)
test_size = n_total - train_size - valid_size
split_sizes = [train_size, valid_size, test_size]
train_dataset, valid_dataset, test_dataset = random_split(dataset, split_sizes)

# All loaders use the same batch size; only the training loader shuffles.
loader_options = {"batch_size": 32}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
valid_loader = DataLoader(valid_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

# Our CNN model for waveform classification
class Waveform_Classification_CNN(nn.Module):
    """Two strided Conv1d + max-pool stages followed by a two-layer classifier.

    fc1 expects 224 flattened features (32 channels x 7 positions), which is
    what the two conv/pool stages produce for a single-channel input of
    length 128.
    """

    def __init__(self, num_classes=3):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=5, stride=2)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=5, stride=2)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.fc1 = nn.Linear(in_features=224, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        # Both stages follow the same pattern: convolution -> ReLU -> max-pool.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))

        # Flatten everything except the batch dimension.
        flat = x.view(x.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

In [None]:
def train_epoch(model, train_loader, criterion, optimizer, lr_scheduler, device):
    """Run one optimisation pass over train_loader.

    Args:
        model: network to train; it is switched to training mode.
        train_loader: iterable of (inputs, labels) batches that supports len().
        criterion: loss function called as criterion(outputs, labels).
        optimizer: optimizer stepped once per batch.
        lr_scheduler: optional scheduler, stepped once after the whole epoch.
        device: device the batches are moved to.

    Returns:
        Tuple (mean loss per batch, accuracy in percent) for this epoch.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # Standard update: reset gradients, forward, backward, parameter step.
        optimizer.zero_grad()
        logits = model(batch_inputs)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

        # Predicted class = index of the largest output along dim 1.
        predicted = logits.max(1)[1]
        n_seen += batch_labels.size(0)
        n_correct += predicted.eq(batch_labels).sum().item()

    # The scheduler advances per epoch, not per batch.
    if lr_scheduler:
        lr_scheduler.step()
        print(f'Learning Rate after epoch: {lr_scheduler.get_last_lr()[0]:.6f}')

    mean_loss = running_loss / len(train_loader)
    accuracy_percent = 100. * n_correct / n_seen
    return mean_loss, accuracy_percent

def validate(model, valid_loader, criterion, device):
    """Evaluate model on valid_loader without computing gradients.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        valid_loader: iterable of (inputs, labels) batches that supports len().
        criterion: loss function called on the flattened outputs and labels.
        device: device the batches are moved to.

    Returns:
        Tuple (mean loss per batch, accuracy in percent).
    """
    model.eval()
    loss_sum, hits, count = 0, 0, 0

    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            logits = model(inputs)

            # Collapse all leading dimensions so that outputs are 2-D and
            # targets are 1-D before computing loss and accuracy.
            logits_2d = logits.reshape(-1, logits.size(-1))
            targets = labels.reshape(-1)

            loss_sum += criterion(logits_2d, targets).item()

            predicted = logits_2d.max(1)[1]
            count += targets.size(0)
            hits += predicted.eq(targets).sum().item()

    mean_loss = loss_sum / len(valid_loader)
    accuracy_percent = 100. * hits / count
    return mean_loss, accuracy_percent

In [None]:
# Training loop
def train_model(model, train_loader, valid_loader, criterion, optimizer, lr_scheduler=None, n_epochs=15, device='cpu'):
    """Train for n_epochs, validating after every epoch, then plot the loss curves.

    Args:
        model: network to train.
        train_loader: batches used for training.
        valid_loader: batches used for validation.
        criterion: loss function.
        optimizer: optimizer used for the parameter updates.
        lr_scheduler: optional scheduler forwarded to train_epoch.
        n_epochs: number of epochs to run.
        device: device forwarded to train_epoch and validate.

    Returns:
        Four per-epoch lists: training losses, validation losses,
        training accuracies and validation accuracies.
    """
    print("Starting training...")

    history = {'train_loss': [], 'valid_loss': [], 'train_acc': [], 'valid_acc': []}

    # Progress is printed every 5 epochs for runs of up to 30 epochs and every
    # 10 epochs for longer runs; the last epoch is always printed.
    report_interval = 5 if n_epochs <= 30 else 10
    final_epoch = n_epochs - 1

    for epoch in range(n_epochs):
        # Train for one epoch
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, lr_scheduler, device)
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)

        # Evaluate on validation set
        val_loss, val_acc = validate(model, valid_loader, criterion, device)
        history['valid_loss'].append(val_loss)
        history['valid_acc'].append(val_acc)

        if epoch % report_interval == 0 or epoch == final_epoch:
            print(f'Epoch [{epoch+1}/{n_epochs}] - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}% | Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')

    print('-' * 50)

    print("Training completed!")

    # Loss curves for both splits on a shared epoch axis
    epoch_axis = range(n_epochs)
    plt.figure(figsize=(10, 4))
    plt.plot(epoch_axis, history['train_loss'], label='Train Loss')
    plt.plot(epoch_axis, history['valid_loss'], label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

    return history['train_loss'], history['valid_loss'], history['train_acc'], history['valid_acc']

In [None]:
n_epochs = 15

model = Waveform_Classification_CNN(num_classes=3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Baseline run: constant learning rate, no scheduler is passed
history = train_model(
    model,
    train_loader,
    valid_loader,
    criterion,
    optimizer,
    n_epochs=n_epochs,
    device=device,
)
train_losses, valid_losses, train_accs, valid_accs = history


## Using a Scheduler or Dynamic Learning Rate

One way to improve the performance of the model is to use a learning rate scheduler, i.e. a dynamic learning rate that changes during training.

### What is a Learning Rate Scheduler?

A learning rate scheduler is a technique that adjusts the learning rate during training according to a predefined strategy. Instead of keeping the learning rate constant throughout the entire training process, the scheduler modifies it at specific intervals (e.g., per epoch or per batch).

Common scheduling strategies include:
- **Step Decay**: Reduces the learning rate by a factor every few epochs
- **Exponential Decay**: Gradually decreases the learning rate exponentially
- **Cosine Annealing**: Varies the learning rate following a cosine curve
- **ReduceLROnPlateau**: Reduces the learning rate when a metric stops improving

### Why Does It Make Training More Effective? 

1. **Better Convergence**: Starting with a higher learning rate allows the model to make large steps toward the optimal solution early in training.  As training progresses, a lower learning rate helps fine-tune the parameters and converge to a better minimum.

2. **Escape Local Minima**: A dynamic learning rate can help the model escape shallow local minima in the early stages while settling into deeper, better minima as the rate decreases.

3. **Prevents Overshooting**: A constant high learning rate might cause the optimizer to overshoot the optimal point. Reducing it over time ensures more precise updates near convergence.

4. **Improved Generalization**: Gradually lowering the learning rate can lead to flatter minima, which often generalize better to unseen data. 

5. **Faster Training**: By adapting the learning rate to the training dynamics, schedulers can achieve better results in fewer epochs compared to a fixed learning rate.


In [None]:
n_epochs = 15

model = Waveform_Classification_CNN(num_classes=3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
# Step decay: the learning rate is multiplied by 0.7 every 5 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.7)

# Train the model with the scheduler enabled
history = train_model(
    model,
    train_loader,
    valid_loader,
    criterion,
    optimizer,
    lr_scheduler=scheduler,
    n_epochs=n_epochs,
    device=device,
)
train_losses, valid_losses, train_accs, valid_accs = history


In [None]:
# Evaluate the current model on the test split and keep the inputs, labels
# and predictions of every sample for later inspection.
model.eval()
all_preds, all_labels, all_inputs = [], [], []
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        # index of the largest output along dim 1 = predicted class
        predicted = torch.max(outputs, 1)[1]

        # Move results back to the CPU before storing them as numpy values
        for store, batch in ((all_preds, predicted), (all_labels, labels), (all_inputs, inputs)):
            store.extend(batch.cpu().numpy())

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {100 * correct / total:.2f}%")

In [None]:
# Plot a few randomly chosen test waveforms in a 2x3 grid; the title is
# green when the prediction matches the true label and red otherwise.
wave_types = ['sine', 'triangle', 'square']
n_examples = 6
idxs = np.random.choice(len(all_inputs), n_examples, replace=False)

plt.figure(figsize=(12, 8))
for i, idx in enumerate(idxs):
    plt.subplot(2, 3, i+1)
    plt.plot(all_inputs[idx][0], color='black')  # [0] selects the single channel

    true_label = all_labels[idx]
    pred_label = all_preds[idx]
    title_color = "green" if true_label == pred_label else "red"
    plt.title(f"True: {wave_types[true_label]}\nPred: {wave_types[pred_label]}",
              color=title_color)
    plt.tight_layout()
plt.show()

## Dropout

One effective technique to prevent overfitting and improve model generalization is by using dropout during training. 

Dropout is a regularization technique where random neurons are temporarily "dropped" (set to zero) during training with a specified probability. This means that during each training iteration, a different subset of neurons is active, forcing the network to learn more robust features. Dropout is only applied during training; during evaluation/inference, all neurons are active. To keep the expected activation magnitude the same in both modes, the activations have to be rescaled: PyTorch's `nn.Dropout` does this during training by multiplying the surviving activations by 1/(1 − p), so no extra scaling is needed at inference time.

Common dropout rates range from 0.2 to 0.5. Higher rates provide stronger regularization but may hurt learning if too high. For example, with a dropout rate of 0.5, each neuron has a 50% chance of being deactivated during any given training step.

 
### Why Does It Make Training More Effective?

1.  **Prevents Overfitting**: By randomly dropping neurons, dropout prevents the network from relying too heavily on specific neurons or learning complex co-adaptations between neurons that only work on the training data.

2. **Ensemble Effect**: Dropout can be viewed as training an ensemble of multiple sub-networks simultaneously. At inference time, using all neurons approximates averaging the predictions of all these sub-networks, leading to better generalization.

3.  **Forces Redundancy**: Since any neuron can be dropped at any time, the network learns to distribute information across multiple neurons rather than concentrating it in a few.  This creates more robust and redundant representations.

4. **Reduces Co-adaptation**: Without dropout, neurons can develop complex interdependencies that don't generalize well.  Dropout breaks these dependencies, forcing each neuron to learn more independently useful features.

5. **Improves Generalization**: Models trained with dropout typically perform better on unseen data because they learn more general patterns rather than memorizing the training set.


In [None]:
# Our CNN model for waveform classification with dropout
class CNN_with_dropout(nn.Module):
    """Same conv/pool/classifier layout as the plain CNN, plus dropout layers.

    Args:
        num_classes: number of output classes.
        dropout_rate_conv: dropout probability applied after each conv block.
        dropout_rate_lin: dropout probability applied after the first
            fully connected layer.
    """

    def __init__(self, num_classes=3, dropout_rate_conv=0.2, dropout_rate_lin=0.5):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=5, stride=2)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=5, stride=2)
        self.pool = nn.MaxPool1d(kernel_size=2)

        # One dropout module is shared by both conv blocks, a second one
        # serves the fully connected part.
        self.dropout_conv = nn.Dropout(p=dropout_rate_conv)
        self.dropout_fc = nn.Dropout(p=dropout_rate_lin)

        self.fc1 = nn.Linear(in_features=224, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        # Conv block = convolution -> ReLU -> max-pool -> dropout
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
            x = self.dropout_conv(x)

        flat = x.view(x.size(0), -1)  # flatten everything but the batch dim
        hidden = self.dropout_fc(F.relu(self.fc1(flat)))
        return self.fc2(hidden)

In [None]:
n_epochs = 100

# Reference run: both dropout probabilities are 0, so nothing is dropped
dropout_rate_conv = 0
dropout_rate_lin = 0

model = CNN_with_dropout(3, dropout_rate_conv, dropout_rate_lin).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = None

# Train the model without dropout
training_args = dict(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=scheduler,
    n_epochs=n_epochs,
    device=device,
)
train_losses, valid_losses, train_accs, valid_accs = train_model(**training_args)

In [None]:
n_epochs = 100

# Dropout enabled: light dropout after the conv blocks, stronger in the FC part
dropout_rate_conv = 0.1
dropout_rate_lin = 0.3

model = CNN_with_dropout(3, dropout_rate_conv, dropout_rate_lin).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = None

# Train the model with dropout
training_args = dict(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=scheduler,
    n_epochs=n_epochs,
    device=device,
)
train_losses, valid_losses, train_accs, valid_accs = train_model(**training_args)

## Batch normalization

One powerful technique to improve training stability and speed up convergence is by using batch normalization.  Batch normalization has become a standard component in modern deep neural networks. 

Batch normalization is a technique that normalizes the inputs of each layer by adjusting and scaling the activations.  For each mini-batch during training, it:

1. Calculates the mean and variance of the activations
2. Normalizes the activations using these statistics
3. Scales and shifts the normalized values using learnable parameters (γ and β)

The normalization formula for a batch:
```
x_norm = (x - μ_batch) / √(σ²_batch + ε)
output = γ * x_norm + β
```

Where:
- **μ_batch**: Mean of the batch
- **σ²_batch**: Variance of the batch
- **ε**: Small constant for numerical stability (e.g., 1e-5)
- **γ, β**: Learnable parameters for scale and shift

Typically the normalization is placed **after** the linear/convolutional layer and **before** the activation function:
   ```
   Conv/Linear → BatchNorm → Activation (ReLU)
   ```
It works best with reasonably sized batches (≥16).  Very small batches can cause instability. 


### Why Does It Make Training More Effective?

1. **Faster Training**: By normalizing activations, batch normalization allows for higher learning rates without the risk of divergence.  This can speed up training by 2-10x in some cases.

2. **Reduces Internal Covariate Shift**: As the network learns, the distribution of inputs to each layer changes. Batch normalization stabilizes these distributions, making training more stable and predictable.

3. **Acts as Regularization**: Batch normalization introduces a slight noise (because statistics are computed per batch), which has a mild regularization effect similar to dropout.  This can reduce the need for other regularization techniques.

4. **Reduces Sensitivity to Initialization**: Networks with batch normalization are less sensitive to the initial weights, making training more robust and reproducible.

5. **Helps Gradient Flow**: By keeping activations in a reasonable range, batch normalization prevents vanishing or exploding gradients, allowing gradients to flow more effectively through deep networks.

6. **Enables Deeper Networks**: The stabilizing effect of batch normalization makes it possible to train much deeper networks that would otherwise be difficult to optimize.


In [None]:
# Our CNN model for waveform classification with batch normalization
class CNN_with_batch_normalization(nn.Module):
    """Plain CNN layout with a BatchNorm1d layer after each convolution.

    Each conv stage runs convolution -> batch norm -> ReLU -> max-pool. Both
    batch-norm layers are created with momentum=0.01 for their running
    statistics.
    """

    def __init__(self, num_classes=3):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm1d(num_features=16, momentum=0.01)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm1d(num_features=32, momentum=0.01)
        self.pool = nn.MaxPool1d(kernel_size=2)

        self.fc1 = nn.Linear(in_features=224, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        # Normalisation sits between the convolution and its activation.
        stages = ((self.conv1, self.bn1), (self.conv2, self.bn2))
        for conv, norm in stages:
            x = self.pool(F.relu(norm(conv(x))))

        flat = x.view(x.size(0), -1)  # flatten everything but the batch dim
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

In [None]:
n_epochs = 50

model = Waveform_Classification_CNN(num_classes=3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0008)
scheduler = None

# Reference run: the plain CNN without batch normalization
training_args = dict(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=scheduler,
    n_epochs=n_epochs,
    device=device,
)
train_losses, valid_losses, train_accs, valid_accs = train_model(**training_args)

In [None]:
n_epochs = 50

model = CNN_with_batch_normalization(num_classes=3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0008)
scheduler = None

# Train the model with batch normalization
training_args = dict(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=scheduler,
    n_epochs=n_epochs,
    device=device,
)
train_losses, valid_losses, train_accs, valid_accs = train_model(**training_args)

## Using Residual Connections

One powerful architectural technique to improve deep neural network training is by using residual connections (also known as skip connections). 

Residual connections create shortcuts that allow the input of a layer (or block of layers) to bypass those layers and be added directly to the output.  Instead of learning a direct mapping H(x), the layers learn a residual mapping F(x), and the final output becomes F(x) + x. The input and output dimensions must match for the addition operation.  If they don't, use a projection (typically a 1x1 convolution) on the skip connection. Typically, the activation function (ReLU) is applied **after** the addition of the residual connection. Residual connections combine well with batch normalization, dropout, and other regularization techniques. 

This concept was introduced in ResNet (Residual Networks) and has become a fundamental building block in modern deep learning architectures.

### Why Do They Make Training More Effective?

1. **Solves Vanishing Gradient Problem**: In very deep networks, gradients can become extremely small during backpropagation, making it difficult for early layers to learn.  Residual connections provide direct gradient pathways, allowing gradients to flow backward through the network more easily.

2. **Enables Deeper Networks**: Before residual connections, making networks deeper often degraded performance due to optimization difficulties. Residual connections make it possible to train networks with hundreds or even thousands of layers effectively.

3. **Easier Optimization**: Learning the residual (the difference between input and desired output) is often easier than learning the complete transformation.  If the optimal function is close to an identity mapping, the network can simply learn to make F(x) ≈ 0.

4. **Identity Mapping**: In the worst case, if additional layers aren't helpful, the network can learn to pass the input through unchanged (identity function) by setting the residual to zero.  This ensures deeper models perform at least as well as shallower ones.

5. **Feature Reuse**: Skip connections allow the network to reuse features from earlier layers, combining low-level and high-level features for better representations.

6. **Faster Convergence**: Networks with residual connections often converge faster during training because the gradient signal is stronger and more stable.

In [None]:
# Our CNN model for waveform classification with residual connections
class CNN_with_Residual_Connections(nn.Module):
    """CNN with batch normalization and two additive skip connections.

    * Convolutional skip: the raw input is sent through `conv_1x1` and added
      to the output of the main conv/BN/ReLU/pool path, followed by a ReLU.
    * Fully connected skip: `fc_projection` maps the flattened features to
      64 units and is added to relu(fc1(...)), followed by a ReLU.

    NOTE: despite its name, `conv_1x1` is not a 1x1 convolution. It repeats
    the kernel sizes, strides and pooling of the main path (without batch
    norm or ReLU), so its output has the same shape as the main path. The
    attribute name is kept so that existing state dicts keep loading.

    Args:
        num_classes: number of output classes.
    """

    def __init__(self, num_classes=3):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm1d(16)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.MaxPool1d(2)

        # Projection for the skip path: brings the raw input to the channel
        # count and length of the main path so the two can be added.
        self.conv_1x1 = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=5, stride=2),  # Match conv1
            nn.MaxPool1d(2),                             # Match first pool
            nn.Conv1d(16, 32, kernel_size=5, stride=2),  # Match conv2
            nn.MaxPool1d(2)                              # Match second pool
        )

        self.fc_projection = nn.Linear(224, 64)
        self.fc1 = nn.Linear(224, 64)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        residual = x

        # Main path: two conv -> batch norm -> ReLU -> max-pool stages
        x = self.bn1(self.conv1(x))
        x = F.relu(x)
        x = self.pool(x)

        x = self.bn2(self.conv2(x))
        x = F.relu(x)
        x = self.pool(x)

        # Skip path. The projection already yields the shape of the main
        # path, so it is added as is. Pooling it further (as an earlier
        # version did) shrinks it to length 1, and the addition then only
        # works through broadcasting, adding one value per channel to every
        # position instead of a position-wise residual.
        residual = self.conv_1x1(residual)  # Shape: [batch, 32, matching_length]

        x = x + residual
        x = F.relu(x)  # activation after the addition

        x = x.view(x.size(0), -1)  # flatten

        # Second skip connection around the first fully connected layer
        residual_fc = self.fc_projection(x)
        x = F.relu(self.fc1(x))
        x = x + residual_fc
        x = F.relu(x)

        x = self.fc2(x)
        return x

In [None]:
n_epochs = 50

model = CNN_with_batch_normalization(num_classes=3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0008)
scheduler = None

# Reference run: batch-norm CNN without residual connections
training_args = dict(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=scheduler,
    n_epochs=n_epochs,
    device=device,
)
train_losses, valid_losses, train_accs, valid_accs = train_model(**training_args)

In [None]:
n_epochs = 50

model = CNN_with_Residual_Connections(num_classes=3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0008)
scheduler = None

# Train the model with residual connections
training_args = dict(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=scheduler,
    n_epochs=n_epochs,
    device=device,
)
train_losses, valid_losses, train_accs, valid_accs = train_model(**training_args)

## Initialisation


## Early stopping

## Transfer learning


## Augmentation