# Hyperparameter Tuning

In [7]:
import numpy as np
from data_preperation.dataset import CityDataset
import os

from config import PATH, CITIES, MIN_LABELS, PATCH_SIZE, TEST_CITY


In [2]:
dataset = CityDataset(
    PATH,
    patch_size=PATCH_SIZE,
    data_name="openEO.tif",
    labels_name="building_mask_dense.tif",
    image_bands=[1, 2, 3, 4, 5, 6],
    min_labels=MIN_LABELS,
    cities=CITIES,
    train=True,
)

dataset_test = CityDataset(
    PATH,
    data_name="openEO.tif",
    labels_name="building_mask_dense.tif",
    image_bands=[1, 2, 3, 4, 5, 6],
    cities=["Berlin"],
    train=False,
)

Loading data from cities:
['Singapore', 'Johannesburg', 'Bogota', 'Wien', 'London', 'Montreal', 'Seoul', 'Aachen', 'CapeTown', 'Hamburg', 'Paris', 'Frankfurt', 'Muenchen', 'Sydney']


Loading Images:   0%|          | 0/14 [00:00<?, ?it/s]

Loading Labels:   0%|          | 0/14 [00:00<?, ?it/s]

Creating Patches from Images: 0it [00:00, ?it/s]

Loading data from cities:
['Berlin']


Loading Images:   0%|          | 0/1 [00:00<?, ?it/s]

Loading Labels:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch.utils.tensorboard import SummaryWriter
import lightning as L
from typing import Any
from torch.utils.data import DataLoader, Dataset
from lightning import seed_everything
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint
from models.lightning_utils import LitModule
from models.baseconvnet import ConvNetSimple

from lightning.pytorch.tuner.tuning import Tuner

# model
# convmodel = LitModule(ConvNetSimple())

# trainer
def get_trainer(dirname):
    trainer = L.Trainer(
        default_root_dir=f"model_experiments/hyperparam_tuning/{dirname}",
        callbacks=[
            EarlyStopping(
                monitor="val_loss",
                mode="min",
                patience=2,
            ),
            ModelCheckpoint(
                dirpath=f"model_experiments/hyperparam_tuning/{dirname}",
                monitor="val_loss",
                mode="min",
                save_top_k=1,
                filename="best_model"
            )
        ],
        # val_check_interval=1,
        fast_dev_run=False,
        num_sanity_val_steps=2,
        max_epochs=10,
        log_every_n_steps=20,
    )
    return trainer

In [9]:
from itertools import product
import random
b = {'B04': 1, 'B03': 2, 'B02': 3, 'B08': 4, 'B12': 5, 'B11': 6}
channels = list(b.values())
print(channels)
optimizers = ["sgd", "adamW","adam"]

grid = []
i = 0
while i < 5:
    num_channels = random.randint(3, 6)
    selected_channels = random.sample(channels, num_channels)
    selected_channels.sort()    
    if selected_channels not in grid:
        i += 1
        grid.append(selected_channels) 
    else:
        continue
print(grid)

[1, 2, 3, 4, 5, 6]
[[1, 3, 4, 5, 6], [2, 3, 4, 5, 6], [1, 2, 3, 4, 6], [2, 3, 4], [1, 4, 5]]


In [10]:
model_stats = {}
torch.set_float32_matmul_precision('high') # for tensor cores
test_dl = DataLoader(dataset_test, batch_size=32, shuffle=False, num_workers=20)
for chs in grid:
    print(chs)
    # load the data
    dataset = CityDataset(PATH,
                      patch_size=32,
                      data_name="openEO.tif",
                      labels_name="building_mask_dense.tif",
                      image_bands=chs,
                      min_labels=0.1,
                      cities=CITIES,
                      train=True,)
    train_ds, val_ds = dataset.train_val_split(val_size=0.1, show_summary=False)
    train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=20)
    val_dl = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=20)

    dataset_test = CityDataset(PATH,
                            data_name="openEO.tif",
                            labels_name="building_mask_dense.tif",
                            image_bands=chs,
                            cities=TEST_CITY,
                            train=False)
    test_dl = DataLoader(dataset_test, batch_size=32, shuffle=False, num_workers=20)


    for optim in optimizers:
        print(chs, optim)
        # create the model
        model = LitModule(ConvNetSimple(len(chs)), learning_rate=0.001, optimizer=optim)
        # create the trainer
        trainer = get_trainer(f'convSimple/{str(chs).replace(", ", "_").replace("[", "_").replace("]", "_")}{optim}')
        # tuner lr
        seed_everything(49)
        tuner = Tuner(trainer=trainer)
        tuner.lr_find(model, train_dl, val_dl, min_lr=1e-5, max_lr=0.03, num_training=5000)

        # train the model
        trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=val_dl)
        model_stats[f"{str(chs)}_{optim}"] = dict(
            best_model=trainer.checkpoint_callback.best_model_path,
        )
        # test the model
        best_model = LitModule.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
        trainer.test(model=model, dataloaders=test_dl)   


[1, 3, 4, 5, 6]
Loading data from cities:
['Singapore', 'Johannesburg', 'Bogota', 'Wien', 'London', 'Montreal', 'Seoul', 'Aachen', 'CapeTown', 'Hamburg', 'Paris', 'Frankfurt', 'Muenchen', 'Sydney']


Loading Images:   0%|          | 0/14 [00:00<?, ?it/s]

Loading Labels:   0%|          | 0/14 [00:00<?, ?it/s]

Creating Patches from Images: 0it [00:00, ?it/s]

Loading data from cities:
['Berlin']


Loading Images:   0%|          | 0/1 [00:00<?, ?it/s]

Loading Labels:   0%|          | 0/1 [00:00<?, ?it/s]

/home/jlb/Projects/architecture-of-ml-systems/.venv/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.
/home/jlb/Projects/architecture-of-ml-systems/.venv/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Seed set to 49


[1, 3, 4, 5, 6] sgd


Missing logger folder: model_experiments/hyperparam_tuning/convSimple/_1_3_4_5_6_sgd/lightning_logs
/home/jlb/Projects/architecture-of-ml-systems/.venv/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:652: Checkpoint directory /home/jlb/Projects/architecture-of-ml-systems/model_experiments/hyperparam_tuning/convSimple/_1_3_4_5_6_sgd exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/5000 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=5000` reached.
Learning rate set to 0.02270478715883327
Restoring states from the checkpoint path at model_experiments/hyperparam_tuning/convSimple/_1_3_4_5_6_sgd/.lr_find_cadb1e8f-2f66-4b1f-9e44-1cb9561fe7da.ckpt
Restored all states from the checkpoint at model_experiments/hyperparam_tuning/convSimple/_1_3_4_5_6_sgd/.lr_find_cadb1e8f-2f66-4b1f-9e44-1cb9561fe7da.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type          | Params | Mode 
------------------------------------------------
0 | model | ConvNetSimple | 94.0 K | train
1 | loss  | BCELoss       | 0      | train
------------------------------------------------
94.0 K    Trainable params
0         Non-trainable params
94.0 K    Total params
0.376     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/home/jlb/Projects/architecture-of-ml-systems/.venv/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Seed set to 49


────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_loss_epoch        0.49404653906822205
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
[1, 3, 4, 5, 6] adamW


Missing logger folder: model_experiments/hyperparam_tuning/convSimple/_1_3_4_5_6_adamW/lightning_logs
/home/jlb/Projects/architecture-of-ml-systems/.venv/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:652: Checkpoint directory /home/jlb/Projects/architecture-of-ml-systems/model_experiments/hyperparam_tuning/convSimple/_1_3_4_5_6_adamW exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/5000 [00:00<?, ?it/s]

# UNET Hyperparam Search

In [1]:
model_stats = {}
from models.unet import UNet

torch.set_float32_matmul_precision('high') # for tensor cores
test_dl = DataLoader(dataset_test, batch_size=32, shuffle=False, num_workers=20)
for chs in grid:
    print(chs)
    # load the data
    dataset = CityDataset(PATH,
                      patch_size=32,
                      data_name="openEO.tif",
                      labels_name="building_mask_dense.tif",
                      image_bands=chs,
                      min_labels=0.1,
                      cities=CITIES,
                      train=True,)
    train_ds, val_ds = dataset.train_val_split(val_size=0.1, show_summary=False)
    train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=20)
    val_dl = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=20)

    dataset_test = CityDataset(PATH,
                            data_name="openEO.tif",
                            labels_name="building_mask_dense.tif",
                            image_bands=chs,
                            cities=TEST_CITY,
                            train=False)
    test_dl = DataLoader(dataset_test, batch_size=32, shuffle=False, num_workers=20)


    for optim in optimizers:
        print(chs, optim)
        # create the model

        # model
        unet = UNet(n_channels=len(dataset.get_image_bands()), n_classes=1, bilinear=True)
        # UNet implementation uses the BCEWithLogitsLoss, lr of 1e-5 default
        model = LitModule(unet, learning_rate=1e-4, loss=nn.BCEWithLogitsLoss())


        # create the trainer
        trainer = get_trainer(f'unet/{str(chs).replace(", ", "_").replace("[", "_").replace("]", "_")}{optim}')
        
        
        # tuner lr
        seed_everything(49)
        tuner = Tuner(trainer=trainer)
        tuner.lr_find(model, train_dl, val_dl, min_lr=1e-5, max_lr=0.03, num_training=5000)

        # train the model
        trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=val_dl)
        model_stats[f"{str(chs)}_{optim}"] = dict(
            best_model=trainer.checkpoint_callback.best_model_path,
        )
        # test the model
        best_model = LitModule.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
        trainer.test(model=model, dataloaders=test_dl)   


NameError: name 'torch' is not defined