In [None]:
# ---- Hyperparameters ----
# Stopping criterion: number of previous epochs to look at, and its threshold.
k, alpha = 5, 10
# Smallest and largest batch size.
s_min, s_max = 16, 128
# Target training time per epoch, in seconds.
target_train_time = 60
# Presumably the relative weights of the loss and time terms in a combined
# score -- TODO confirm; the formula is not visible in this cell.
loss_weight, time_weight = 0.7, 0.3

# Use the first CUDA device when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

class RandomResizeTransform:
    """Randomly rescale an image, then pad or center-crop it back to a fixed size.

    A scale factor is drawn uniformly from ``[min_scale, max_scale]`` on every
    call. The image is resized to ``int(target_size * scale)`` and then either
    padded (when it became smaller) or center-cropped (otherwise) to
    ``target_size``.

    Args:
        min_scale: Lower bound of the random scale factor.
        max_scale: Upper bound of the random scale factor.
        target_size: Edge length of the output, in pixels. Defaults to 32.

    Note:
        ``transforms.Resize`` is given a single int, which torchvision applies
        to the shorter edge, so the output is only guaranteed to be
        ``target_size`` x ``target_size`` for square inputs.
    """

    def __init__(self, min_scale=0.5, max_scale=1.5, target_size=32):
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.target_size = target_size

    @staticmethod
    def _padding(new_size, target_size):
        """Return ``(left, top, right, bottom)`` padding that grows ``new_size`` to ``target_size``."""
        deficit = target_size - new_size
        near = deficit // 2
        # When the deficit is odd the extra pixel goes on the right/bottom, so
        # the padded edge is exactly ``target_size`` (symmetric padding would
        # come out one pixel short).
        far = deficit - near
        return (near, near, far, far)

    def __call__(self, img):
        """Apply one random rescale followed by pad/crop to ``img`` and return the result."""
        scale = random.uniform(self.min_scale, self.max_scale)
        # Never ask Resize for a zero-pixel image, even for very small scales.
        new_size = max(1, int(self.target_size * scale))
        if new_size < self.target_size:
            fit = transforms.Pad(self._padding(new_size, self.target_size))
        else:
            fit = transforms.CenterCrop(self.target_size)
        pipeline = transforms.Compose([
            transforms.Resize(new_size),  # rescale to the randomly chosen size
            fit,                          # restore the target size
        ])
        return pipeline(img)

# Load the CIFAR-10 training split.
#
# Alternative pipeline using the random-resize augmentation (currently disabled):
#transform = transforms.Compose([
#    RandomResizeTransform(),
#    transforms.ToTensor()
#])
transform = transforms.Compose([
    transforms.Resize(224),  # upscale the CIFAR-10 images to 224 px
    # No Grayscale step: CIFAR-10 images are already 3-channel RGB, and
    # Grayscale(num_output_channels=3) would discard their colour information.
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalization values for pre-trained models
])


dataset = CIFAR10(root='./data', train=True, download=True, transform=transform)

# Module-level state; adjust_batch_size() appends to / writes into these.
prev_train_losses = []
prev_train_time = []
prev_combined_score = 1000
training_times = {}  # maps batch size -> most recent epoch training time
def adjust_batch_size(epoch, prev_combined_score, rank, loss, train_time, world_size, batch_size,
                      selection_epoch=4):
    """Record this epoch's timing and propose the batch size for the next epoch.

    The measured ``train_time`` is stored in the module-level ``training_times``
    dict under ``batch_size``. On every call except the one where
    ``epoch == selection_epoch`` the batch size is doubled; on that call the
    batch size with the smallest recorded training time is returned instead.

    Args:
        epoch: Epoch number as counted by the caller.
        prev_combined_score: Accepted for interface compatibility; not used.
        rank: When equal to 0, ``loss`` and ``train_time`` are also appended to
            the module-level ``prev_train_losses`` / ``prev_train_time`` lists.
        loss: Training loss to record.
        train_time: Training time measured for the epoch.
        world_size: Accepted for interface compatibility; not used.
        batch_size: Batch size that was used for the epoch.
        selection_epoch: Epoch at which the fastest recorded batch size is
            chosen instead of doubling. Defaults to 4.

    Returns:
        The proposed batch size. No bounds are applied here; clamping is left
        to the caller.
    """
    if rank == 0:
        prev_train_losses.append(loss)
        prev_train_time.append(train_time)

    training_times[batch_size] = train_time

    if epoch == selection_epoch:
        # Pick the batch size whose recorded epoch time is lowest.
        return min(training_times, key=training_times.get)
    return batch_size * 2

def _train_one_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader``.

    Returns:
        A tuple ``(mean_loss, correct, seen, elapsed)``: the per-batch loss
        averaged over the epoch, the number of correctly classified samples,
        the number of samples processed, and the wall-clock seconds spent.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    seen = 0
    num_batches = 0

    started = time.time()
    for images, labels in loader:
        # Move data to the appropriate device
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        # NOTE(review): the criterion already averages over the batch (default
        # reduction), so this divides the gradients by the batch size a second
        # time. Kept as-is because removing it changes the effective step size
        # by a factor of the batch size -- confirm whether it is intended.
        for param in model.parameters():
            if param.grad is not None:
                param.grad.div_(loader.batch_size)
        optimizer.step()

        loss_sum += loss.item()
        num_batches += 1
        seen += labels.size(0)
        correct += outputs.argmax(dim=1).eq(labels).sum().item()
    elapsed = time.time() - started

    mean_loss = loss_sum / num_batches if num_batches else 0.0
    return mean_loss, correct, seen, elapsed


def train_model():
    """Train a ResNet-18 on the module-level ``dataset`` while searching for a batch size.

    For the first four epochs the batch size proposed by ``adjust_batch_size``
    is applied (clamped to ``[s_min, s_max]``); afterwards the last chosen batch
    size is kept. Whenever the batch size changes, the learning rate is scaled
    by the same ratio and written to the optimizer, and the data loader is
    rebuilt. Progress is printed once per epoch. Returns ``None``.
    """
    torch.manual_seed(0)

    # Instantiate the model and move it to the target device.
    model = models.resnet18()
    model.fc = torch.nn.Linear(model.fc.in_features, 10)  # Adjust for CIFAR-10's 10 classes
    model = model.to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    lr = 0.01
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    def make_loader(batch_size):
        # Shuffle so that every epoch visits the training samples in a new
        # order; torch.manual_seed above keeps the order reproducible.
        return torch.utils.data.DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=True,
            num_workers=0, pin_memory=True)

    trainloader = make_loader(s_min)

    print("training")

    search_epochs = 4  # epochs during which the batch size is still being adjusted
    new_batch_size = trainloader.batch_size
    for epoch in range(10):
        start_epoch_time = time.time()

        mean_loss, correct, total, train_time = _train_one_epoch(
            model, trainloader, criterion, optimizer)

        # Adjust batch size based on the recorded training times.
        # Arguments follow the callee's order: rank=0, loss, train_time, world_size=1.
        if epoch < search_epochs:
            new_batch_size = adjust_batch_size(
                epoch + 1, prev_combined_score, 0, mean_loss, train_time, 1,
                trainloader.batch_size)

        # Limit the batch size to the configured range.
        new_batch_size = min(max(new_batch_size, s_min), s_max)

        if new_batch_size != trainloader.batch_size:
            curr = trainloader.batch_size
            lr = lr * new_batch_size / curr
            # Actually apply the rescaled learning rate to the optimizer.
            for group in optimizer.param_groups:
                group["lr"] = lr
            print(f"Changing batch size from {curr} to {new_batch_size}")
            # Re-create the data loader with the new batch size
            trainloader = make_loader(new_batch_size)

        train_accuracy = correct / total if total else 0.0
        # "Loss" is the mean per-batch loss of the epoch, not the last batch's.
        print(f"Epoch: {epoch + 1}, Loss: {mean_loss}, Training Accuracy: {train_accuracy:.4f}, "
                  f"Time: {train_time:.2f}s")

        end_epoch_time = time.time()

        # The batch size printed here is the one the next epoch will use.
        print(f"Total Epoch Time: {end_epoch_time - start_epoch_time:.2f}, {trainloader.batch_size}")

    print("Training DONE!!!")
    print()
    # NOTE(review): no evaluation code follows this message in this function.
    print('Testing BEGINS!!')




def run_train_model(train_func, world_size):
    """Invoke ``train_func`` once, with no arguments, in the current process.

    Args:
        train_func: Zero-argument callable to run.
        world_size: Accepted but not used; no processes are spawned here.

    Returns:
        None. Whatever ``train_func`` returns is discarded.
    """
    _ = world_size  # kept only so existing two-argument call sites keep working
    train_func()


if __name__ == "__main__":
    # Count the CUDA devices visible to PyTorch and hand that number to
    # run_train_model() as `world_size`; run_train_model() then calls train_model().
    n_gpus = torch.cuda.device_count()
    run_train_model(train_model, n_gpus)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 48315384.52it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
training
Changing batch size from 16 to 32
Epoch: 1, Loss: 2.11828351020813, Training Accuracy: 0.3829, Time: 416.23s
Total Epoch Time: 416.23, 32
Changing batch size from 32 to 64
Epoch: 2, Loss: 1.4790825843811035, Training Accuracy: 0.5768, Time: 425.67s
Total Epoch Time: 425.68, 64
Changing batch size from 64 to 128
Epoch: 3, Loss: 1.3311432600021362, Training Accuracy: 0.6305, Time: 442.26s
Total Epoch Time: 442.26, 128
Changing batch size from 128 to 16
Epoch: 4, Loss: 1.0855016708374023, Training Accuracy: 0.6467, Time: 443.75s
Total Epoch Time: 443.75, 16
Epoch: 5, Loss: 1.6283986568450928, Training Accuracy: 0.6018, Time: 415.47s
Total Epoch Time: 415.47, 16
Epoch: 6, Loss: 1.1023520231246948, Training Accuracy: 0.6883, Time: 416.19s
Total Epoch Time: 416.19, 16
Epoch: 7, Loss: 0.8589321374893188, Training Accuracy: 0.7590, Time: 415.67s
Total Epoch Time: 415.68, 16
Epoch: 8, Loss: 0.698289692401886, Training Accuracy: 0.8318,

In [None]:
import os
import argparse
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset
from PIL import Image

# packages for distributed training
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
# Disable cuDNN globally for this process (a side effect of running this cell).
# NOTE(review): the reason is not recorded here -- confirm it is still needed.
torch.backends.cudnn.enabled = False
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
import random