# Generalization effects of linear transformations

Let us take a step back and have a look at a theoretical framework in order to understand data augmentation better.

This theoretical chapter will eventually lead to another algorithm that strikes a balance between heavy Reinforcement Learning-based methods and purely random ones.

The algorithm uses biased sampling in order to select useful transformations more efficiently.

## Problem formulation

### Setting

### Question

How does adding transformed samples impact the ridge estimator's generalization error?

We will consider three categories of transformations:

    1. label-invariant transformations
    2. label-mixing transformations
    3. composition of transformations

### Label-invariant transformations

#### Examples

#### Formulation

#### Generalization effects

### Label-mixing transformations

#### Examples

#### Formulation

#### Generalization effects

### Composition of transformations

#### Examples

#### Formulation

#### Generalization effects

# Uncertainty-based sampling of transformations

Let us try to implement this algorithm, referred to as UBS (Uncertainty-Based Sampling) in the following.

Default transformations are a fixed set of transformations applied on top of the sampled transformations. They act as a **baseline augmentation strategy** that is always used. Their role is most likely to **maintain consistency across transformations**, ensuring that each sample undergoes a minimum level of augmentation.

In [6]:
import torch
import torchvision.transforms as transforms
import random
import numpy as np
import utils
from torch.utils.data import Subset, DataLoader
import torchvision
import torch.optim as optim

In [None]:
def _as_transform_list(transformations):
    """Return *transformations* as a flat list of callables.

    Accepts ``None`` (no transformation), a single callable (for example an
    already composed pipeline) or any iterable of callables.
    """
    if transformations is None:
        return []
    if callable(transformations):
        return [transformations]
    return list(transformations)


def _iter_labelled_images(batch, device):
    """Yield ``(image, label)`` pairs, with the label as a tensor on *device*.

    *batch* is either a pair ``(images, labels)`` of stacked tensors or an
    indexable dataset whose items are ``(image, label)`` pairs.
    """
    is_tensor_pair = (
        isinstance(batch, (tuple, list))
        and len(batch) == 2
        and torch.is_tensor(batch[0])
    )
    if is_tensor_pair:
        images, labels = batch
        labels = torch.as_tensor(labels).to(device)
        yield from zip(images, labels)
        return

    for index in range(len(batch)):
        image, label = batch[index]
        yield image, torch.as_tensor(label).to(device)


def uncertainty_based_sampling(
    batch,  # B data points (x1,y1), ..., (xB, yB)
    F_transformations,  # K transformations F1, ..., FK
    G_default_transformations,  # G default transformations
    model,
    loss_function,
    L,  # the number of composition steps
    C,  # number of augmented data per input data
    S,  # number of selected data points used for training
    device
):
    """Select, for every input image, its S highest-loss augmented versions.

    For each image, C augmented candidates are built by applying L
    transformations drawn without replacement from ``F_transformations``,
    followed by the default transformations. Each candidate is scored with
    ``loss_function(model(candidate), label)`` and the S candidates with the
    largest loss are kept.

    Args:
        batch: Either a pair ``(images, labels)`` of tensors stacked along
            the first dimension, or an indexable dataset (e.g. a
            ``torch.utils.data.Subset``) whose items are ``(image, label)``.
        F_transformations: Sequence of K callables to sample from.
        G_default_transformations: Default transformations applied after the
            sampled ones: a list of callables (possibly empty), a single
            callable, or ``None``.
        model: Model used to score the candidates; it is switched to
            evaluation mode and left in that mode.
        loss_function: Callable ``(output, target) -> scalar tensor``.
        L: Number of transformations composed per candidate.
        C: Number of augmented candidates generated per input image.
        S: Number of candidates kept per input image (at most C are kept).
        device: Device on which the model is evaluated.

    Returns:
        A list of image tensors, each with a leading batch dimension of 1 and
        located on ``device``. Images are grouped per input image and ordered
        by decreasing loss within each group.

    Raises:
        ValueError: If ``L`` exceeds ``len(F_transformations)`` (raised by
            ``random.sample``).

    NOTE(review): only the augmented images are returned, not their labels;
    confirm that the downstream training code does not need the labels.
    """
    model.eval()

    # Previously the default transformations were nested as a single pipeline
    # element, which fails as soon as they are given as a list (even an empty
    # one). Flatten them once, outside the loops.
    default_steps = _as_transform_list(G_default_transformations)

    selected_samples = []  # High-uncertainty samples gathered over all images

    with torch.no_grad():
        for image, label in _iter_labelled_images(batch, device):
            target = label.unsqueeze(0)  # Add batch dim to the label
            # Transformations are applied on CPU, as in the original code.
            source = image.cpu() if torch.is_tensor(image) else image

            candidates = []
            losses = []

            for _ in range(C):  # Create C augmented versions
                candidate = source
                # L random transformations followed by the defaults, applied
                # one after another (same effect as a composed pipeline).
                for step in random.sample(F_transformations, L) + default_steps:
                    candidate = step(candidate)
                candidate = candidate.to(device).unsqueeze(0)  # Add batch dim

                loss = loss_function(model(candidate), target)

                candidates.append(candidate)
                losses.append(loss.item())

            # Keep the S candidates with the highest loss for this image
            ranking = np.argsort(-np.array(losses))[:S]
            selected_samples.extend(candidates[idx] for idx in ranking)

    return selected_samples

In [None]:
# Local directory used as the CIFAR-10 root (the dataset is downloaded there if missing).
path_to_data = "./cifar10"

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# path_to_data = "drive/MyDrive/SDD/data_augmentation/cifar10"

In [None]:
# Preprocessing applied to every CIFAR-10 image when it is loaded:
# resize to 224, convert to a tensor, then normalize with ImageNet statistics.
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet normalization
    # transforms.Normalize((0.5,), (0.5,))
])

# CIFAR-10 training split, downloaded into `path_to_data` if not already present.
trainset = torchvision.datasets.CIFAR10(root=path_to_data, train=True, download=True, transform=transform)

batch_length = int(0.5*len(trainset))  # the training set will be enriched by 50%

# Randomly select indices (without replacement)
indices = np.random.choice(len(trainset), batch_length, replace=False)

# Create the batch: a lazy view on the selected training samples
batch = Subset(trainset, indices)

In [None]:
# Pool of K candidate transformations from which UBS samples its compositions.
# NOTE(review): the two crops produce 16x16 outputs while the loading pipeline resizes
# images to 224, so augmented samples may end up with different spatial sizes — confirm
# that downstream batching copes with this.
# NOTE(review): the colour operations (ColorJitter, RandomInvert, RandomSolarize, ...)
# receive tensors that were already normalized at load time — confirm this is intended.
F_transformations = [
    transforms.CenterCrop(size=16),
    transforms.RandomCrop(size=16),
    transforms.ColorJitter(brightness=0.3, contrast=0.3),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(45),
    transforms.RandomAffine(degrees=45, translate=(0.2, 0.2), shear=15),
    transforms.GaussianBlur(kernel_size=3),
    transforms.RandomInvert(p=0.5),
    transforms.RandomSolarize(-0.1, p=0.5),
    transforms.RandomAutocontrast(p=0.5),
    transforms.RandomAdjustSharpness(sharpness_factor=2, p=0.5)
]

In [None]:
# Left empty on purpose: the default transformations are already applied when the data
# is loaded, via torchvision.datasets.CIFAR10(..., transform=transform).
G_default_transformations = []

In [4]:
from torchvision.models import wide_resnet50_2
import torch.nn as nn

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load a pre-trained Wide ResNet-50-2 model
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed version.
wrn = wide_resnet50_2(pretrained=True)
# Freeze every parameter of the pre-trained network; the classification head
# created below is a fresh layer and therefore stays trainable
for param in wrn.parameters():
    param.requires_grad = False
# Get the number of input features for the original last layer:
num_feature = wrn.fc.in_features
# Replace the final classification layer with a 10-class head (CIFAR-10):
wrn.fc = nn.Linear(num_feature, 10)
# View the structure of the new final layer (optional):
print(wrn.fc)
# Move the model to the selected device (GPU if available, else CPU):
wrn = wrn.to(device)
print(f"Using device: {device}")



Linear(in_features=2048, out_features=10, bias=True)
Using device: cpu


In [None]:
# Cross-entropy loss, passed to uncertainty_based_sampling to score each augmented sample
loss_function = nn.CrossEntropyLoss()

In [None]:
# Run UBS to collect the highest-loss augmented versions of the selected samples.
# NOTE(review): `wrn_trained_on_raw_data` is not defined in the cells shown here —
# presumably it comes from an earlier training run; confirm before executing.
# NOTE(review): `batch` is a `Subset` of (image, label) items rather than a pair of stacked
# tensors — confirm that `uncertainty_based_sampling` accepts it in that form.
# NOTE(review): C and S are derived from the subset size (len(batch)/2 and len(batch)/4)
# although they are per-image counts — confirm these magnitudes are intended.
augmented_samples = uncertainty_based_sampling(
    batch=batch,
    F_transformations=F_transformations,
    G_default_transformations=G_default_transformations,
    model=wrn_trained_on_raw_data, 
    loss_function=loss_function,
    L=2, # the number of composition steps
    C=int(round(len(batch)/2)), # number of augmented data per input data
    S=int(round(len(batch)/4)), # number of selected data points used for training
    device=device
    ) 

In [None]:
# Serve the selected augmented samples in shuffled mini-batches of 32.
# NOTE(review): `augmented_samples` is a list of image tensors only (no labels), each with a
# leading batch dimension of 1 — confirm the training routine can consume it as is.
UBS_train_loader_aug = DataLoader(augmented_samples, batch_size=32, shuffle=True)

Now let's train the model on the augmented data.

In [None]:
# Adam over the parameters of the final `fc` layer of `wrn` only.
# NOTE(review): the training call below passes `wrn_trained_on_raw_data` as the model —
# confirm it is the same module as `wrn`, otherwise this optimizer updates another network.
optimizer = optim.Adam(wrn.fc.parameters(), lr=0.001)
# Learning rate scheduler for better convergence
# NOTE(review): T_max=200 while the training below runs for num_epochs=5 — confirm.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
criterion = nn.CrossEntropyLoss()

In [5]:
# Continue training the raw-data model on the UBS-selected samples for 5 epochs.
# NOTE(review): `utils.train_WideResNet` is a project helper not shown here; presumably it
# returns the trained model — confirm its contract and the loader format it expects.
wrn_trained_on_UBS_transformed_data = utils.train_WideResNet(model=wrn_trained_on_raw_data, trainloader=UBS_train_loader_aug, num_epochs=5, batch_size=32, optimizer=optimizer, criterion=criterion, device=device, scheduler=scheduler)

8