In [1]:
import pytorch_lightning as pl
from ray import train, tune
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig, CheckpointConfig
from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray.tune.schedulers import ASHAScheduler
from torchvision.models import get_model, ViT_B_32_Weights, ViT_B_16_Weights
from torch.optim import Adam, AdamW
from torchmetrics import LogCoshError, MeanAbsoluteError, MeanSquaredError
# import mlflow
# from ray.air.integrations.mlflow import setup_mlflow
import pytorch_lightning as pl
import torch.nn as nn
from torchmetrics import MetricCollection, MeanAbsoluteError, MeanSquaredError, ExplainedVariance
import numpy as np
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Define model_config
model_config = {
    "protected_namespaces": ()
}

# Update model_config
model_config['protected_namespaces'] = ()



* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
import import_ipynb
from dataset import PlotsDataset

importing Jupyter notebook from dataset.ipynb




In [3]:
import os

os.environ["RAY_DEDUP_LOGS"] = "0"

In [4]:
# Paths

DATASET_PATH = "../data/raw/Case_Study_1/Raw_Images"
GCP_PATH = "../data/GCP_Images"
GROUND_TRUTH_PATH = '../data/ground_truth/ground_truth.csv'
MODEL_PATH = "../data/models/"
CHECKPOINT_PATH = "../data/checkpoints/"
ORTHOMOSAIC_PATH = "../data/orthophoto/raster.tif"
PLOT_PATH = "../data/plots"
SAMPLES_PATH = "../data/raw_samples"
SHAPEFILE_PATH = "../data/shapefile/all_plots.shp"

In [5]:
# Hyperparameters

TEST_SIZE = 0.2 # % of dataset
VAL_SIZE = 0.2  # % of training set

MAX_EPOCS = -1
BATCH_SIZE = 8
WORKERS = 8
IMG_SIZE = (224,224)

BACKBONE_NAME = "vit_b_32"
BACKBONE_WEIGHTS = ViT_B_32_Weights.IMAGENET1K_V1
PATIENCE = 30

learning_rate = 1e-5
FROZEN_LAYERS = 6
WEIGHT_DECAY = 1e-3
DROPOUT = 0.3
ATTENTION_DROPOUT = 0.3

In [6]:
class TunableVit(pl.LightningModule):
    def __init__(self, backbone, criterion, optimizer, config):
        super().__init__()

        self.criterion = criterion
        self.optimizer = optimizer
        self.learning_rate = config["learning_rate"]
        self.weight_decay = config["weight_decay"]
        self.no_grad_layers_n = config["no_grad_layers_n"]
        self.batch_size = config["batch_size"]
        self.dropout = config["dropout"]
        self.attention_dropout = config["attention_dropout"]
        self.backbone = backbone

        self.test_output = []
        self.test_loss = []
        self.test_label_mean = []
        if(self.no_grad_layers_n > 0):
            for i, param in enumerate(self.backbone.encoder.parameters()):
                if i < self.no_grad_layers_n:
                    param.requires_grad = False
        self.set_dropouts()

        # Get the number of input features of the last layer of the backbone
        num_input_filters = backbone.heads[0].in_features
        num_output_values = 1

        # Replace the head of the model
        self.backbone.heads[0] = nn.Linear(in_features=num_input_filters, out_features=num_output_values).float()
        
        metric_collection = MetricCollection([
            MeanSquaredError(),
            MeanAbsoluteError(),
            ExplainedVariance()
        ])
        self.val_metrics = metric_collection.clone(prefix="val_")
        self.test_metrics = metric_collection.clone(prefix="test_")



    def set_dropouts(self):
        for m in self.modules():
            if isinstance(m, nn.Dropout):
                m.p = self.dropout
            elif isinstance(m, nn.MultiheadAttention):
                m.dropout = self.attention_dropout

    def forward(self, x):
        x = self.backbone(x)
        return x

    def configure_optimizers(self):
        return self.optimizer
    
    # Training

    def training_step(self, batch, batch_idx):
        _, _, loss = self.get_batch_data(batch)
        self.log("train_loss", loss, on_epoch=True, on_step=False)
        return loss
    
    # Validation

    def validation_step(self, batch, batch_idx):
        outputs, labels, loss = self.get_batch_data(batch)
        step_metrics = self.val_metrics.forward(outputs, labels)
        self.log_dict(step_metrics, on_epoch=True, on_step=False)
        self.log("val_loss", loss, on_epoch=True, on_step=False)
        return loss

    def on_validation_epoch_end(self):
        self.val_metrics.reset()

    # Testing

    def test_step(self, batch, batch_idx):
        outputs, labels, loss = self.get_batch_data(batch)
        step_metrics = self.test_metrics.forward(outputs, labels)

        outputs = outputs.squeeze().cpu().numpy()
        labels = labels.squeeze().cpu().numpy()

        self.test_output.extend(outputs)
        self.test_loss.append(loss.item())
        self.test_label_mean.append(np.mean(labels))

        self.log_dict(step_metrics, on_epoch=True, on_step=False)
        self.log("test_loss", loss, on_epoch=True, on_step=False)

    def on_test_epoch_end(self):
        self.test_metrics.reset()

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        outputs = self.forward(batch)
        predictions = [x.item() for x in outputs.detach().cpu().numpy()]
        return predictions
    
    def get_batch_data(self, batch):
        images, labels = batch
        labels = labels.unsqueeze(-1)
        outputs = self.backbone(images)
        loss = self.criterion(outputs, labels)
        return outputs, labels, loss

In [7]:
# Ray config

METRIC = "val_loss"
MODE = "min"
BACKBONE_NAME = "vit_b_32"
BACKBONE_WEIGHTS = ViT_B_32_Weights.IMAGENET1K_V1
NUM_EPOCHS = 5
NUM_SAMPLES = 10
SEARCH_SPACE = {
    "learning_rate": tune.loguniform(1e-6, 1e-2),
    "batch_size": tune.choice([8, 16, 32]),
    "no_grad_layers_n": tune.choice([0, 1, 2, 3, 4, 5]),
    "dropout": tune.choice([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]),
    "attention_dropout": tune.choice([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]),
    "experiment_name": "pl_tune_uav_vit",
    "weight_decay": tune.loguniform(1e-1, 1e-5),
}
use_gpu = False
num_workers = 4
resources_per_worker={"CPU": 1}

default_config = {
    "learning_rate": 1e-3,
    "batch_size": 8,
    "no_grad_layers_n": 0,
    "dropout": 0.0,
    "attention_dropout": 0.0,
    "experiment_name": "pl_tune_uav_vit",
    "weight_decay": 1e-4,
}

In [8]:
# Ground truth, dataset and dataloaders

ground_truth = pd.read_csv(GROUND_TRUTH_PATH)

# Elevation format conversion to float32
ground_truth["elev"] = ground_truth["elev"].astype("float32")
labels_norm = ground_truth["elev"]
ground_truth.head()

# Targets normalization

scaler = MinMaxScaler()
ground_truth["elev"] = scaler.fit_transform(ground_truth[["elev"]])

base_dataset = {
    "name": "base",
    "dataset": PlotsDataset(labels=ground_truth, img_dir=PLOT_PATH, img_size=IMG_SIZE, transforms=None),
}
dataset = base_dataset["dataset"]

# Dataset split

train_set, test_set = train_test_split(dataset, test_size=TEST_SIZE)
train_set, val_set = train_test_split(train_set, test_size=VAL_SIZE)

print(f"Training set size: {len(train_set)}")
print(f"Validation set size: {len(val_set)}")
print(f"Test set size: {len(test_set)}")

# Dataloaders

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, num_workers=WORKERS)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, num_workers=WORKERS)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, num_workers=WORKERS)

print(f"Train Dataloader size: {len(train_loader)}")
print(f"Validation Dataloader size: {len(val_loader)}")
print(f"Test Dataloader size: {len(test_loader)}")

Training set size: 451
Validation set size: 113
Test set size: 141
Train Dataloader size: 57
Validation Dataloader size: 15
Test Dataloader size: 18


In [9]:
def train_func(config):
    # setup_mlflow(
    #     config,
    #     experiment_name=config.get("experiment_name", None),
    #     tracking_uri=config.get("tracking_uri", None),
    # )
    criterion = LogCoshError()
    backbone = get_model(BACKBONE_NAME, weights=BACKBONE_WEIGHTS)
    optimizer = AdamW(backbone.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
    model = TunableVit(backbone, criterion, optimizer, config)
    
    # mlflow.pytorch.autolog()

    trainer = pl.Trainer(
        devices="auto",
        accelerator="auto",
        strategy=RayDDPStrategy(),
        max_epochs=NUM_SAMPLES,
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=False,
    )
    trainer = prepare_trainer(trainer)
    trainer.fit(model, train_loader)
    # trainer.test(model)

In [10]:
# Number of parallel workers
scaling_config = ScalingConfig(
    num_workers=num_workers, use_gpu=use_gpu, resources_per_worker=resources_per_worker
)

# Ray trainer setup
run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute=METRIC,
        checkpoint_score_order=MODE
    ),
)

ray_trainer = TorchTrainer(
    train_func,
    scaling_config=scaling_config,
    run_config=run_config,
)

In [11]:
def vit_tuner(metric=METRIC, mode=MODE, num_samples=NUM_SAMPLES, experiment_name="pl_tuning"):
    scheduler = ASHAScheduler(max_t=NUM_EPOCHS, grace_period=1, reduction_factor=2)

    # mlflow.set_experiment(experiment_name)
    tuner = tune.Tuner(
        ray_trainer,
        param_space={"train_loop_config": SEARCH_SPACE},
        tune_config=tune.TuneConfig(
            metric=metric,
            mode=mode,
            num_samples=num_samples,
            scheduler=scheduler,
        ),
    )
    return tuner.fit()

In [12]:
results = vit_tuner()

0,1
Current time:,2023-10-17 19:55:10
Running for:,00:03:33.65
Memory:,11.5/15.9 GiB

Trial name,status,loc,train_loop_config/at tention_dropout,train_loop_config/ba tch_size,train_loop_config/dr opout,train_loop_config/le arning_rate,train_loop_config/no _grad_layers_n,train_loop_config/we ight_decay
TorchTrainer_d0dde_00000,RUNNING,127.0.0.1:7884,0.5,32,0.5,1.67824e-05,0,0.00247118
TorchTrainer_d0dde_00001,PENDING,,0.3,8,0.4,1.68149e-05,4,0.000143579
TorchTrainer_d0dde_00002,PENDING,,0.2,8,0.1,0.000303171,5,0.0490588
TorchTrainer_d0dde_00003,PENDING,,0.2,16,0.5,1.40841e-06,5,0.0974765
TorchTrainer_d0dde_00004,PENDING,,0.3,32,0.4,3.87829e-06,1,0.0346637
TorchTrainer_d0dde_00005,PENDING,,0.4,32,0.3,0.000495205,4,0.00560083
TorchTrainer_d0dde_00006,PENDING,,0.2,32,0.0,0.000201238,0,0.0158583
TorchTrainer_d0dde_00007,PENDING,,0.0,32,0.4,0.00944795,4,0.00921454
TorchTrainer_d0dde_00008,PENDING,,0.2,16,0.4,0.000251012,0,0.00365252
TorchTrainer_d0dde_00009,PENDING,,0.4,32,0.4,0.00023094,3,0.0124522


2023-10-17 19:51:36,941	INFO data_parallel_trainer.py:407 -- GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
2023-10-17 19:51:36,946	INFO data_parallel_trainer.py:407 -- GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
2023-10-17 19:51:36,952	INFO data_parallel_trainer.py:407 -- GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
2023-10-17 19:51:36,958	INFO data_parallel_trainer.py:407 -- GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
2023-10-17 19:51:36,963	INFO data_parallel_trainer.py:407 -- GPUs ar

TuneError: The Ray Tune run failed. Please inspect the previous error messages for a cause. After fixing the issue, you can restart the run from scratch or continue this run. To continue this run, you can use `tuner = Tuner.restore("C:/Users/Thuls/ray_results/TorchTrainer_2023-10-17_19-51-28", trainable=...)`.

[2m[36m(RayTrainWorker pid=5628)[0m Traceback (most recent call last):
[2m[36m(RayTrainWorker pid=5628)[0m   File "<string>", line 1, in <module>
[2m[36m(RayTrainWorker pid=5628)[0m   File "C:\Program Files\Python310\lib\multiprocessing\spawn.py", line 116, in spawn_main
[2m[36m(RayTrainWorker pid=5628)[0m     exitcode = _main(fd, parent_sentinel)
[2m[36m(RayTrainWorker pid=5628)[0m   File "C:\Program Files\Python310\lib\multiprocessing\spawn.py", line 126, in _main
[2m[36m(RayTrainWorker pid=5628)[0m     self = reduction.pickle.load(from_parent)
[2m[36m(RayTrainWorker pid=5628)[0m EOFError: Ran out of input


In [None]:
results

In [None]:
results.get_best_result(metric="val_loss", mode="min")