In [1]:
import os
import sys
import logging
import argparse
import optuna
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from models.model_factory import ModelFactory
from data.dataset import CustomImageDataset

In [2]:
SEED = 0
BATCH_SIZE = 64
MODEL = 'resnet18'
IMG_DIR = '/home/dlukyan/fedhh/data/tuning/images'
LABELS_FILE = '/home/dlukyan/fedhh/data/tuning//labels.csv'
DATA_LOADER_WORKERS = 2

In [3]:
print(f'CUDA devices: {torch.cuda.device_count()}')

CUDA devices: 1


In [4]:
def objective(trial, model_name, img_dir, labels_file):
    batch_size = BATCH_SIZE * torch.cuda.device_count()

    dataset = CustomImageDataset(img_dir=img_dir, labels_file=labels_file)
    train_size = int(0.8 * len(dataset))
    valid_size = len(dataset) - train_size
    train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=DATA_LOADER_WORKERS*torch.cuda.device_count(), prefetch_factor=DATA_LOADER_WORKERS*4)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=DATA_LOADER_WORKERS*torch.cuda.device_count(), prefetch_factor=DATA_LOADER_WORKERS*4)

    model_instance = ModelFactory.create_model(model_name, trial)

    model = model_instance.get_model()
    optimizer = model_instance.get_tuning_optimizer(model)
    scheduler = model_instance.get_tuning_scheduler(optimizer)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Device: {device}')
    
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    
    model.to(device)

    criterion = nn.CrossEntropyLoss()

    n_epochs = 10
    for epoch in range(n_epochs):
        train_loss = model_instance.train_model(model, train_loader, criterion, optimizer, scheduler, device)
        valid_loss, accuracy = model_instance.validate_model(model, valid_loader, criterion, device)
        trial.report(accuracy, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    return accuracy

In [None]:
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
optuna.logging.set_verbosity(optuna.logging.DEBUG)
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED), pruner=optuna.pruners.HyperbandPruner())
study.optimize(lambda trial: objective(trial, MODEL, IMG_DIR, LABELS_FILE), n_trials=200)

print('Best trial:')
trial = study.best_trial
print('  Value: {}'.format(trial.value))
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

[I 2024-07-16 13:27:25,799] A new study created in memory with name: no-name-53b6b181-8ec8-4e26-b61d-448564ef2b63


A new study created in memory with name: no-name-53b6b181-8ec8-4e26-b61d-448564ef2b63
Device: cuda


[I 2024-07-16 15:17:44,114] Trial 0 finished with value: 70.45204520452045 and parameters: {'learning_rate': 0.00044303752452182633, 'beta1': 0.9358859796107597, 'beta2': 0.9596735742310928, 'eps': 1.229607110732571e-07, 'weight_decay': 0.0042365479933890475, 'scheduler_type': 'CosineAnnealingLR', 'T_max': 49, 'eta_min': 0.0038344151882577773}. Best is trial 0 with value: 70.45204520452045.


Trial 0 finished with value: 70.45204520452045 and parameters: {'learning_rate': 0.00044303752452182633, 'beta1': 0.9358859796107597, 'beta2': 0.9596735742310928, 'eps': 1.229607110732571e-07, 'weight_decay': 0.0042365479933890475, 'scheduler_type': 'CosineAnnealingLR', 'T_max': 49, 'eta_min': 0.0038344151882577773}. Best is trial 0 with value: 70.45204520452045.
Device: cuda


[D 2024-07-16 15:28:31,983] Hyperband has 3 brackets
