# This notebook explores evaluates how a DeepLabV3 model performs on seismic segmentation task with few data


##  1. Initial setup

In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[1716473739.058885] [dac6c281a48b:949842:f]        vfs_fuse.c:281  UCX  ERROR inotify_add_watch(/tmp) failed: No space left on device


In [2]:
import os

# Run unity tests
run_unity_test = True

# Setup trainer
max_epochs=2

# Pretrained weights filename
pretrained_weights_filename = f'backbone_parameters-epochs-50.pth'
#pretrained_weights_filename = None

# Check if file exists
if pretrained_weights_filename and not os.path.exists(pretrained_weights_filename):
    raise Exception(f"Could not find file {pretrained_weights_filename}. "+
                    "Please set the pretrained_weights_filename variable with a valid filename.")


# 2. Auxiliary functions



Build the downstream model

In [3]:
import models.deeplabv3 as dlv3

# Build the downstream model
def build_downstream_model(backbone_weights_filepath=None):
    # Build and load the backbone weights
    backbone = dlv3.DeepLabV3Backbone()
    backbone.load_state_dict(torch.load(backbone_weights_filepath))

    # Build the downstream model
    downstream_model = dlv3.DeepLabV3Model(backbone = backbone, num_classes=6)
    downstream_model.to(device=device)

    return downstream_model

if run_unity_test:
    # Test the model
    downstream_model = build_downstream_model(pretrained_weights_filename)
    random_input = torch.rand(2,3,255,701).to(device=device)
    output = downstream_model(random_input)
    print("output_shape = ", output.shape)

output_shape =  torch.Size([2, 6, 255, 701])


Evaluation functions

In [4]:
from torchmetrics import JaccardIndex

def evaluate_model(model, dataset_dl):
    # Inicialize JaccardIndex metric
    jaccard = JaccardIndex(task="multiclass", num_classes=6)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # For each batch, compute the predictions and compare with the labels.
    for X, y in dataset_dl:
        # Move the model, data and metric to the GPU if available
        model.to(device)
        X = X.to(device)
        y = y.to(device)
        jaccard.to(device)

        logits = model(X.float())
        predictions = torch.argmax(logits, dim=1, keepdim=True)
        jaccard(predictions, y)
    # Return a tuple with the number of correct predictions and the total number of predictions
    return (float(jaccard.compute().to("cpu")))

def report_accuracy(model, dataset_dl, prefix=""):
    iou = evaluate_model(model, dataset_dl)
    print(prefix + " IoU = {:0.4f}".format(iou))

# 3. Train using multiple subsets



## 3.1 Define dataloaders

In [5]:
# Instanciate SeismicDataModule
from data_modules.seismic import F3SeismicDataModule

# Instantiating the SeismicDataModule with root dir at data/f3
data_module = F3SeismicDataModule("data/", batch_size=8)

In [6]:
# Create a dictionary that maps the cap ratio to each dataloader.
dataloaders = {}
for cap in [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]:
    print("Generating data loader with {0:.2f}% of data".format(cap*100) )
    dataloaders[cap] = {
        "train_dl": data_module.train_dataloader(cap=cap, drop_last=True),
        "val_dl": data_module.val_dataloader(cap=cap),
    }

if run_unity_test:
    print(f"{len(dataloaders)} dataloaders were defined:")
    for cap, d in dataloaders.items():
        train_dl = d["train_dl"]
        val_dl = d["val_dl"]
        print(f" - Train/Val dataloader with cap={cap} has {len(train_dl)}/{len(val_dl)} batch(es) and {len(train_dl.dataset)}/{len(val_dl.dataset)} sample(s)!")

Generating data loader with 1.00% of data
Generating data loader with 5.00% of data
Generating data loader with 10.00% of data
Generating data loader with 20.00% of data
Generating data loader with 50.00% of data
Generating data loader with 100.00% of data
6 dataloaders were defined:
 - Train/Val dataloader with cap=0.01 has 1/1 batch(es) and 9/1 sample(s)!
 - Train/Val dataloader with cap=0.05 has 6/1 batch(es) and 49/5 sample(s)!
 - Train/Val dataloader with cap=0.1 has 12/2 batch(es) and 99/11 sample(s)!
 - Train/Val dataloader with cap=0.2 has 24/3 batch(es) and 198/22 sample(s)!
 - Train/Val dataloader with cap=0.5 has 62/7 batch(es) and 496/55 sample(s)!
 - Train/Val dataloader with cap=1.0 has 124/14 batch(es) and 992/110 sample(s)!


## 3.2 Train models

In [7]:
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint

# Train all models
best_checkpoint_path = {}
for cap, d in dataloaders.items():
    train_dl = d["train_dl"]
    val_dl = d["val_dl"]
    print(f"Evaluating model with cap={cap}")
    downstream_model = build_downstream_model(pretrained_weights_filename)
    checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_loss")
    trainer = L.Trainer(max_epochs=max_epochs, log_every_n_steps=1, callbacks=[checkpoint_callback])
    trainer.fit(model=downstream_model, train_dataloaders=train_dl, val_dataloaders=val_dl)
    print("Best model was saved at:", checkpoint_callback.best_model_path)
    best_checkpoint_path[cap] = checkpoint_callback.best_model_path

Evaluating model with cap=0.01


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                    | Params
------------------------------------------------------
0 | backbone  | DeepLabV3Backbone       | 25.6 M
1 | pred_head | DeepLabV3PredictionHead | 16.1 M
2 | loss_fn   | CrossEntropyLoss        | 0     
------------------------------------------------------
41.7 M    Trainable params
0         Non-trainable params
41.7 M    Total params
166.736   Total estimated model params size (MB)


Epoch 1: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s, v_num=113]          

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 1/1 [00:01<00:00,  0.83it/s, v_num=113]
Best model was saved at: /workspaces/2024-mo436-course-work/lightning_logs/version_113/checkpoints/epoch=1-step=2.ckpt
Evaluating model with cap=0.05


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                    | Params
------------------------------------------------------
0 | backbone  | DeepLabV3Backbone       | 25.6 M
1 | pred_head | DeepLabV3PredictionHead | 16.1 M
2 | loss_fn   | CrossEntropyLoss        | 0     
------------------------------------------------------
41.7 M    Trainable params
0         Non-trainable params
41.7 M    Total params
166.736   Total estimated model params size (MB)


Epoch 1: 100%|██████████| 6/6 [00:01<00:00,  3.64it/s, v_num=114]          

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 6/6 [00:02<00:00,  2.63it/s, v_num=114]
Best model was saved at: /workspaces/2024-mo436-course-work/lightning_logs/version_114/checkpoints/epoch=1-step=12.ckpt
Evaluating model with cap=0.1


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                    | Params
------------------------------------------------------
0 | backbone  | DeepLabV3Backbone       | 25.6 M
1 | pred_head | DeepLabV3PredictionHead | 16.1 M
2 | loss_fn   | CrossEntropyLoss        | 0     
------------------------------------------------------
41.7 M    Trainable params
0         Non-trainable params
41.7 M    Total params
166.736   Total estimated model params size (MB)


Epoch 1: 100%|██████████| 12/12 [00:02<00:00,  4.30it/s, v_num=115]        

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 12/12 [00:02<00:00,  4.29it/s, v_num=115]
Best model was saved at: /workspaces/2024-mo436-course-work/lightning_logs/version_115/checkpoints/epoch=0-step=12.ckpt
Evaluating model with cap=0.2


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                    | Params
------------------------------------------------------
0 | backbone  | DeepLabV3Backbone       | 25.6 M
1 | pred_head | DeepLabV3PredictionHead | 16.1 M
2 | loss_fn   | CrossEntropyLoss        | 0     
------------------------------------------------------
41.7 M    Trainable params
0         Non-trainable params
41.7 M    Total params
166.736   Total estimated model params size (MB)


Epoch 1: 100%|██████████| 24/24 [00:04<00:00,  4.88it/s, v_num=116]        

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 24/24 [00:05<00:00,  4.33it/s, v_num=116]
Best model was saved at: /workspaces/2024-mo436-course-work/lightning_logs/version_116/checkpoints/epoch=1-step=48.ckpt
Evaluating model with cap=0.5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                    | Params
------------------------------------------------------
0 | backbone  | DeepLabV3Backbone       | 25.6 M
1 | pred_head | DeepLabV3PredictionHead | 16.1 M
2 | loss_fn   | CrossEntropyLoss        | 0     
------------------------------------------------------
41.7 M    Trainable params
0         Non-trainable params
41.7 M    Total params
166.736   Total estimated model params size (MB)


Epoch 1: 100%|██████████| 62/62 [00:11<00:00,  5.47it/s, v_num=117]        

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 62/62 [00:11<00:00,  5.19it/s, v_num=117]
Best model was saved at: /workspaces/2024-mo436-course-work/lightning_logs/version_117/checkpoints/epoch=1-step=124.ckpt
Evaluating model with cap=1.0


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                    | Params
------------------------------------------------------
0 | backbone  | DeepLabV3Backbone       | 25.6 M
1 | pred_head | DeepLabV3PredictionHead | 16.1 M
2 | loss_fn   | CrossEntropyLoss        | 0     
------------------------------------------------------
41.7 M    Trainable params
0         Non-trainable params
41.7 M    Total params
166.736   Total estimated model params size (MB)


Epoch 1: 100%|██████████| 124/124 [00:21<00:00,  5.70it/s, v_num=118]      

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 124/124 [00:22<00:00,  5.47it/s, v_num=118]
Best model was saved at: /workspaces/2024-mo436-course-work/lightning_logs/version_118/checkpoints/epoch=1-step=248.ckpt


Set output file names.
- `output_file_prefix`: prefix to be used on all output files
- `best_checkpoint_paths`: pickle file with dictionary mapping each `cap` value to the path to the file with the best model trained with the `cap` dataloader. 
- `results_filename`: text file with a summary of the results.

In [8]:
if pretrained_weights_filename:
    output_file_prefix = "pretrain-" + ".".join(pretrained_weights_filename.split(".")[:-1]) + "-"
else:
    output_file_prefix = "pretrain-no-"
best_checkpoint_paths = output_file_prefix+'best_checkpoint_filenames.pkl'
results_filename = output_file_prefix+f'results.txt'


Save the dictionary with paths to the best checkpoints.

In [9]:
import pickle 
with open(best_checkpoint_paths, 'wb') as f:
    pickle.dump(best_checkpoint_path, f)


## 3.3 Evaluate models

Test dataloader

In [10]:
test_dl = data_module.test_dataloader()

Evaluate model

In [11]:
import models.deeplabv3 as dlv3

with open(best_checkpoint_paths, 'rb') as f:
    best_checkpoint_path = pickle.load(f)

cap_vs_iou = {}
for cap, weights_filename in best_checkpoint_path.items():
    print(f"cap = {cap}: {weights_filename}")
    downstream_model = dlv3.DeepLabV3Model.load_from_checkpoint(weights_filename)
    iou = evaluate_model(downstream_model, test_dl)
    print(f" - iou = {iou}")
    cap_vs_iou[cap] = iou

cap = 0.01: /workspaces/2024-mo436-course-work/lightning_logs/version_113/checkpoints/epoch=1-step=2.ckpt
 - iou = 0.26956263184547424
cap = 0.05: /workspaces/2024-mo436-course-work/lightning_logs/version_114/checkpoints/epoch=1-step=12.ckpt
 - iou = 0.322662353515625
cap = 0.1: /workspaces/2024-mo436-course-work/lightning_logs/version_115/checkpoints/epoch=0-step=12.ckpt
 - iou = 0.2828516364097595
cap = 0.2: /workspaces/2024-mo436-course-work/lightning_logs/version_116/checkpoints/epoch=1-step=48.ckpt
 - iou = 0.3802063465118408
cap = 0.5: /workspaces/2024-mo436-course-work/lightning_logs/version_117/checkpoints/epoch=1-step=124.ckpt
 - iou = 0.4414859414100647
cap = 1.0: /workspaces/2024-mo436-course-work/lightning_logs/version_118/checkpoints/epoch=1-step=248.ckpt
 - iou = 0.4527760148048401


Write results summary to `results_filename`.

In [12]:
with open(results_filename, 'w') as f:
    for cap, iou in cap_vs_iou.items():
        print(f"cap ({cap}): iou = {iou}")
        f.write(f"cap ({cap}): iou = {iou}\n")

cap (0.01): iou = 0.26956263184547424
cap (0.05): iou = 0.322662353515625
cap (0.1): iou = 0.2828516364097595
cap (0.2): iou = 0.3802063465118408
cap (0.5): iou = 0.4414859414100647
cap (1.0): iou = 0.4527760148048401
