# Lab 2: Neural Architecture Search - Task 1
- Lab & tutorials written by [Aaron Zhao](https://aaron-zhao123.github.io/) and [Pedro Gimenes](https://www.pedrogimenes.co.uk/)
- Implementation task answer written by [Gwendal Casta (gc1724)](mailto:gwendal.casta24@imperial.ac.uk)


## Hyperparameter optimization with Optuna
### Import dependencies

In [1]:
import os
import optuna
import matplotlib.pyplot as plt
from pathlib import Path
from transformers import AutoConfig, AutoModelForSequenceClassification
from chop.tools import get_tokenized_dataset, get_trainer
from chop.nn.modules import Identity
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Set environment

In [3]:
import os
import torch

def get_least_used_gpu():
    """Return the index of the CUDA device with the lowest fraction of memory in use.

    Usage is read with ``torch.cuda.mem_get_info``, which reports the free and
    total memory of the whole device. ``torch.cuda.memory_allocated`` only
    counts tensors allocated by the current process, so on a fresh kernel it
    reports 0 for every device and cannot tell a busy GPU from an idle one.

    Returns:
        int: Index of the selected device. Falls back to 0 when no CUDA device
        is visible. Ties are resolved in favour of the lowest index.
    """
    best_gpu = 0
    best_free_fraction = -1.0

    for i in range(torch.cuda.device_count()):
        free_bytes, total_bytes = torch.cuda.mem_get_info(i)
        # Compare fractions rather than raw bytes so devices of different
        # sizes are ranked by how busy they are.
        free_fraction = free_bytes / total_bytes
        if free_fraction > best_free_fraction:
            best_free_fraction = free_fraction
            best_gpu = i

    return best_gpu

# Pick the least-used GPU and bind this process to it explicitly.
best_gpu = get_least_used_gpu()

# CUDA_VISIBLE_DEVICES is only read when CUDA is first initialised, and
# querying the devices has normally initialised it already. The variable is
# still exported so child processes inherit the choice, but the in-process
# selection is done with set_device / an explicit device index below.
os.environ["CUDA_VISIBLE_DEVICES"] = str(best_gpu)

if torch.cuda.is_available():
    torch.cuda.set_device(best_gpu)
    device = torch.device(f"cuda:{best_gpu}")
    print(f"Using GPU {best_gpu} for training.")
else:
    # No CUDA device: fall back to the CPU instead of pointing at a missing GPU.
    device = torch.device("cpu")
    print("CUDA is not available; using CPU for training.")

Using GPU 0 for training.


In [4]:
# Dataset details
checkpoint = "prajjwal1/bert-tiny"
tokenizer_checkpoint = "bert-base-uncased"
dataset_name = "imdb"

# Load dataset and tokenizer
dataset, tokenizer = get_tokenized_dataset(
    dataset=dataset_name,
    checkpoint=tokenizer_checkpoint,
    return_tokenizer=True,
)


[32mINFO    [0m [34mTokenizing dataset imdb with AutoTokenizer for bert-base-uncased.[0m


### Define the search space

In [5]:
import torch.nn as nn
from chop.nn.modules import Identity

# Candidate values for every hyperparameter explored by the samplers.
# The keys double as the Optuna parameter names used in construct_model.
search_space = dict(
    num_layers=[2, 4, 8],
    num_heads=[2, 4, 8, 16],
    hidden_size=[128, 192, 256, 384, 512],
    intermediate_size=[512, 768, 1024, 1536, 2048],
    # Whether square linear layers are kept or swapped for an identity module.
    linear_layer_choices=["linear", "identity"],
)

In [6]:
def construct_model(trial):
    """Build a sequence-classification model from the hyperparameters sampled for ``trial``.

    Args:
        trial: Optuna trial used to sample one value per key of ``search_space``.

    Returns:
        A freshly initialised model created with
        ``AutoModelForSequenceClassification.from_config``; when the trial
        selects ``"identity"``, every square ``nn.Linear`` layer is replaced
        by ``Identity``.
    """
    config = AutoConfig.from_pretrained(checkpoint)

    # The Optuna parameter names stay equal to the search_space keys (which is
    # what GridSampler is given), but the values must be written to the fields
    # BertConfig actually reads: num_hidden_layers / num_attention_heads.
    # Assigning config.num_layers / config.num_heads only creates unused attributes.
    config.num_hidden_layers = trial.suggest_categorical("num_layers", search_space["num_layers"])
    config.num_attention_heads = trial.suggest_categorical("num_heads", search_space["num_heads"])
    config.hidden_size = trial.suggest_categorical("hidden_size", search_space["hidden_size"])
    config.intermediate_size = trial.suggest_categorical("intermediate_size", search_space["intermediate_size"])

    # Create the model
    trial_model = AutoModelForSequenceClassification.from_config(config)

    # A single shared parameter decides the fate of all square linear layers,
    # so the parameter name matches the corresponding search_space key.
    layer_choice = trial.suggest_categorical("linear_layer_choices", search_space["linear_layer_choices"])

    # The sampled value is the string "identity", not the Identity class, so
    # compare against the string.
    if layer_choice == "identity":
        # Collect the names first so the module tree is not modified while
        # named_modules() is still iterating over it.
        square_linear_names = [
            name
            for name, layer in trial_model.named_modules()
            if isinstance(layer, nn.Linear) and layer.in_features == layer.out_features
        ]
        for name in square_linear_names:
            deepsetattr(trial_model, name, Identity())

    return trial_model

### Define the objective 
This function builds a model with trial-specific hyperparameters, trains the model for one epoch, evaluates accuracy and returns it as the optimization metric.

In [7]:
def objective(trial):
    """Optuna objective: build, train and evaluate the model for one trial.

    Args:
        trial: Optuna trial that supplies the hyperparameter suggestions.

    Returns:
        The ``eval_accuracy`` entry of the trainer's evaluation results.
    """
    global device  # the module-level device is rebound on every trial

    # Re-pick a GPU for this trial and publish the choice via the environment.
    # NOTE(review): CUDA_VISIBLE_DEVICES is presumably ignored once CUDA has
    # been initialised in this process, and torch.device("cuda") carries no
    # index -- confirm that this per-trial selection has any effect.
    best_gpu = get_least_used_gpu()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(best_gpu)
    device = torch.device("cuda")
    print(f"Trial {trial.number} running on GPU {best_gpu}")

    # Build the candidate architecture, then train and score it.
    candidate = construct_model(trial)
    trainer = get_trainer(candidate, dataset, tokenizer)
    trainer.train()

    return trainer.evaluate()["eval_accuracy"]

In [8]:

# Use CUDA when it is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using GPU: {device}")

Using GPU: cuda


In [9]:
from chop.tools.utils import deepsetattr

# Number of trials per sampler, and the seed shared by both samplers so the
# order in which configurations are tried is repeatable across kernel restarts.
num_trials = 10
sampler_seed = 42

# Run GridSampler: it walks the Cartesian product of search_space in an order
# controlled by the seed. Only the first num_trials grid points are evaluated.
grid_sampler = optuna.samplers.GridSampler(search_space, seed=sampler_seed)
grid_study = optuna.create_study(direction="maximize", sampler=grid_sampler)
grid_study.optimize(objective, n_trials=num_trials)

# Run TPESampler: seeded so its suggestions are repeatable as well.
tpe_sampler = optuna.samplers.TPESampler(seed=sampler_seed)
tpe_study = optuna.create_study(direction="maximize", sampler=tpe_sampler)
tpe_study.optimize(objective, n_trials=num_trials)

[I 2025-02-04 18:07:43,381] A new study created in memory with name: no-name-9a02931e-33d7-4b80-95e7-d75891e7aaf0


Trial 0 running on GPU 0


    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Step,Training Loss


[W 2025-02-04 18:08:36,861] Trial 0 failed with parameters: {'num_layers': 8, 'num_heads': 2, 'hidden_size': 384, 'intermediate_size': 1536, 'linear_layer_choices': 'linear'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/infres/casta-22/miniconda3/envs/mase/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_1684846/965990618.py", line 15, in objective
    trainer.train()
  File "/home/infres/casta-22/miniconda3/envs/mase/lib/python3.11/site-packages/transformers/trainer.py", line 2171, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/infres/casta-22/miniconda3/envs/mase/lib/python3.11/site-packages/transformers/trainer.py", line 2531, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^

KeyboardInterrupt: 

### Extract best accuracy per trial
This tracks the highest achieved accuracy at each trial to compare how quickly each method finds an optimal solution.

In [11]:
def best_accuracy_curve(study):
    """Return the running best objective value after each trial of ``study``.

    Args:
        study: Optuna study whose ``trials`` are read in order.

    Returns:
        list: One entry per trial holding the highest value seen so far
        (starting from 0). A trial without a value keeps the previous best,
        so the list stays aligned with the trial numbers.
    """
    curve = []
    best_so_far = 0
    for trial in study.trials:
        # Failed or interrupted trials have value None; max(number, None)
        # would raise TypeError, so they are skipped for the comparison.
        if trial.value is not None:
            best_so_far = max(best_so_far, trial.value)
        curve.append(best_so_far)
    return curve


# Get best accuracy per trial for Grid Search
grid_best_accuracies = best_accuracy_curve(grid_study)

# Get best accuracy per trial for TPE Search
tpe_best_accuracies = best_accuracy_curve(tpe_study)