In [1]:
# Optional: uncomment to install the notebook's dependencies into the kernel's environment.
# %pip install h5py
# %pip install pandas matplotlib
# %pip install seaborn
# %pip install scipy
# %pip install scikit-learn

In [2]:
import torch
import pandas
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch
import copy
import h5py
import itertools
import os
import gc

from model.model_3D import *
from model.train_eval import *

from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import StepLR

from postprocessing.save_results import *
from postprocessing.plot_results import *
from postprocessing.metrics import single_roc_curve

In [3]:
### pick the compute device: use CUDA when available, otherwise the CPU

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using {device}.")

Using cuda.


In [4]:
### import NDVI data

def load_with_hdf5(name):
    """
    Read the "features" and "labels" datasets from data/<name>.h5 and
    return them wrapped as tensors in a TensorDataset.
    """
    path = f"data/{name}.h5"
    with h5py.File(path, "r") as h5_file:
        # Both datasets are fully read into memory while the file is still open.
        features, labels = (
            torch.from_numpy(h5_file[key][:]) for key in ("features", "labels")
        )
    return TensorDataset(features, labels)

# Load the train / validation / test splits that share the same dataset tag.
ndvi_tag = "ndvi_v7"
train_set = load_with_hdf5(f"train_set_{ndvi_tag}")
val_set = load_with_hdf5(f"val_set_{ndvi_tag}")
test_set = load_with_hdf5(f"test_set_{ndvi_tag}")

In [5]:
# Report the number of samples in each split.
for split_name, split in (("Train", train_set), ("Validation", val_set), ("Test", test_set)):
    print(f"{split_name} set: {len(split)} samples")

Train set: 518 samples
Validation set: 19 samples
Test set: 19 samples


In [6]:
# Show the shapes of the two tensors stored in the training set (features, then labels).
train_features, train_labels = train_set.tensors
print(train_features.shape)
print(train_labels.shape)


torch.Size([518, 2, 4, 1000, 500])
torch.Size([518, 1000, 500])


In [7]:
def memory_stats():
    """
    Print the CUDA memory currently allocated and reserved ("cached"), in MiB.

    `torch.cuda.memory_reserved()` is used because `torch.cuda.memory_cached()`
    is a deprecated alias of it that emits a FutureWarning. The printed labels
    are kept unchanged.
    """
    bytes_per_mib = 1024 ** 2
    print(torch.cuda.memory_allocated() / bytes_per_mib, "Memory allocated")
    print(torch.cuda.memory_reserved() / bytes_per_mib, "Memory cached")

In [8]:
def train_and_validate(batch_size, learning_rate, init_hid_dim, num_epochs=100, device='cuda'):
    """
    Train a UNet3D on the module-level `train_set`, validating on `val_set` after each epoch.

    The model is optimized with Adam under a StepLR schedule (step_size=15,
    gamma=0.75), using the 'BCE' loss option and a water threshold of 0.5.
    A CPU deep copy of the model is stored every time an epoch's validation
    loss is lower than the best one seen so far.

    Args:
        batch_size: Batch size used for both the training and validation loaders.
        learning_rate: Initial learning rate passed to Adam.
        init_hid_dim: Forwarded to UNet3D as `init_hid_dim`.
        num_epochs: Number of epochs to run; must be at least 1.
        device: Device forwarded to `training_unet` and `validation_unet`.

    Returns:
        Tuple `(best_loss, best_model, train_losses, val_losses, accuracies,
        precisions, recalls, f1_scores, csi_scores)`. `best_loss` is the lowest
        validation loss and `best_model` the matching CPU copy of the model
        (None if no epoch's validation loss ever compared below the initial
        infinity, e.g. when every loss is NaN). The remaining items are lists
        with one entry per epoch.

    Raises:
        ValueError: If `num_epochs` is smaller than 1.
    """
    # Fail fast: without at least one epoch the per-epoch losses are never
    # bound (the cleanup below would fail) and there is no best model to return.
    if num_epochs < 1:
        raise ValueError(f"num_epochs must be at least 1, got {num_epochs}.")

    # Initialize model; the channel count is the first dimension of one feature sample
    # NOTE(review): the model is not moved to `device` here; presumably
    # training_unet / validation_unet take care of that - confirm.
    model = UNet3D(
        n_channels=train_set[0][0].shape[0],
        n_classes=1,
        init_hid_dim=init_hid_dim,
        kernel_size=3,
        pooling='max',
        bilinear=False,
        drop_channels=False
    )

    # Clear CUDA cache and run garbage collection before training loop
    torch.cuda.empty_cache()
    gc.collect()

    num_parameters = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {num_parameters:.2e}.")

    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=15, gamma=0.75)

    # Data loaders
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

    best_loss = float('inf')
    best_model = None

    train_losses = []
    val_losses = []
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    csi_scores = []

    # Progress is printed every `log_every` epochs (currently every epoch)
    log_every = 1

    # Training loop
    for epoch in range(1, num_epochs + 1):
        # Training step
        train_loss = training_unet(model, train_loader, optimizer, device=device, loss_f='BCE', water_threshold=0.5)
        train_losses.append(train_loss)

        # Validation step
        with torch.no_grad():  # Prevent gradient computation in validation
            val_loss, val_accuracy, val_precision, val_recall, val_f1_score, val_csi_score = validation_unet(
                model, val_loader, device=device, loss_f='BCE', water_threshold=0.5
            )

        # Record this epoch's validation loss and metrics
        val_losses.append(val_loss)
        accuracies.append(val_accuracy)
        precisions.append(val_precision)
        recalls.append(val_recall)
        f1_scores.append(val_f1_score)
        csi_scores.append(val_csi_score)

        # Update learning rate
        scheduler.step()

        # Save best model
        if val_loss < best_loss:
            best_loss = val_loss
            best_model = copy.deepcopy(model).cpu()  # Move model to CPU for storage

        # Print progress
        if epoch % log_every == 0:
            print(f"Epoch: {epoch} | Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, "
                  f"Best Validation Loss: {best_loss:.4f}")
            print(f"Metrics | Accuracy: {val_accuracy:.3f}, Precision: {val_precision:.3f}, Recall: {val_recall:.3f}, "
                  f"F1-score: {val_f1_score:.3f}, CSI-score: {val_csi_score:.3f}")
            print(f"Learning Rate: {scheduler.get_last_lr()[0]:.6f}")

    # Cleanup: drop the references first so the cache can actually be released
    del optimizer, val_loss, model, train_loader, train_loss
    gc.collect()
    torch.cuda.empty_cache()

    return best_loss, best_model, train_losses, val_losses, accuracies, precisions, recalls, f1_scores, csi_scores

In [9]:
# Baseline run: start with Antonio's hyperparameters
batch_size, learning_rate, init_hid_dim = 16, 0.05, 8
num_epochs = 2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
loss_f = 'BCE'
machine = 'machine_1'

# Hyperparameters shared by the training call and both save helpers
hyperparams = dict(batch_size=batch_size, learning_rate=learning_rate, init_hid_dim=init_hid_dim)

# Start from a clean GPU cache
torch.cuda.empty_cache()
gc.collect()

# Train and validate
print(f"Running training for Batch Size={batch_size}, Learning Rate={learning_rate}, Init Hid Dim={init_hid_dim}")

(val_loss, best_model, train_losses, val_losses,
 accuracies, precisions, recalls, f1_scores, csi_scores) = train_and_validate(
    num_epochs=num_epochs, device=device, **hyperparams
)

# Release whatever the training run left in the GPU cache
torch.cuda.empty_cache()
gc.collect()

# Persist the model with the best validation loss
save_model_path(
    machine=machine,
    model=best_model,
    epochs=num_epochs,
    dir_output="model/models_trained",
    **hyperparams
)

# Persist the per-epoch losses and validation metrics
save_losses_metrics(
    machine=machine,
    train_losses=train_losses,
    val_losses=val_losses,
    metrics=[accuracies, precisions, recalls, f1_scores, csi_scores],
    epochs=num_epochs,
    dir_output="model/losses_metrics",
    **hyperparams
)

# Loader over the held-out test split
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

# Evaluate a copy of the best model on the test split, without tracking gradients
with torch.no_grad():
    model_loss = copy.deepcopy(best_model)
    test_metrics = validation_unet(model_loss, test_loader, device=device, loss_f=loss_f)
    test_loss, test_accuracy, test_precision, test_recall, test_f1_score, test_csi_score = test_metrics

test_report = (
    'Average metrics for test dataset using model with best validation loss:\n\n'
    f'BCE loss:          {test_loss:.3e}\n'
    f'Accuracy:          {test_accuracy:.3f}\n'
    f'Precision:         {test_precision:.3f}\n'
    f'Recall:            {test_recall:.3f}\n'
    f'F1 score:          {test_f1_score:.3f}\n'
    f'CSI score:         {test_csi_score:.3f}'
)
print(test_report)

# Drop the model references and the loader, then free cached GPU memory
del best_model, model_loss, test_loader
gc.collect()
torch.cuda.empty_cache()

Running training for Batch Size=16, Learning Rate=0.05, Init Hid Dim=8
Number of parameters: 4.87e+05.
Epoch: 1 | Training Loss: 0.2446, Validation Loss: 0.2392, Best Validation Loss: 0.2392
Metrics | Accuracy: 0.914, Precision: 0.565, Recall: 0.767, F1-score: 0.651, CSI-score: 0.483
Learning Rate: 0.050000
Epoch: 2 | Training Loss: 0.1781, Validation Loss: 0.1660, Best Validation Loss: 0.1660
Metrics | Accuracy: 0.926, Precision: 0.633, Recall: 0.695, F1-score: 0.662, CSI-score: 0.496
Learning Rate: 0.050000
Metrics saved at: model/losses_metrics/machine_1_losses_metrics_NDVI_bs16_lr0.05_hid8_epoch2.csv
Average metrics for test dataset using model with best validation loss:

BCE loss:          1.648e-01
Accuracy:          0.928
Precision:         0.669
Recall:            0.738
F1 score:          0.702
CSI score:         0.541


In [10]:
# Hyperparameter optimization: grid search over batch size, learning rate and
# initial hidden dimension, with the grid split in two halves across machines.
# `device` is the value set in an earlier cell.
batch_sizes = [8, 16, 32]
learning_rates = [0.01, 0.05, 0.1]
init_hid_dims = [8, 16, 32]
num_epochs = 1
loss_f = 'BCE'  # Define loss function for training and testing
machine = 'machine_1'  # Dynamically determine machine (e.g., passed as an argument or environment variable)

# Split combinations across machines
all_combinations = list(itertools.product(batch_sizes, learning_rates, init_hid_dims))
split_index = len(all_combinations) // 2
combinations_split = {
    "machine_1": all_combinations[:split_index],
    "machine_2": all_combinations[split_index:]
}

# Dynamically assign combinations based on the `machine` variable
# (an unknown machine name yields an empty grid, so nothing is trained)
combinations = combinations_split.get(machine, [])

# Results storage: one tuple per hyperparameter combination
results = []

# Run grid search
for batch_size, learning_rate, init_hid_dim in combinations:
    print(f"Testing: Batch size={batch_size}, Learning rate={learning_rate}, Init Hid Dim={init_hid_dim}")

    # Clear CUDA memory and garbage collect
    torch.cuda.empty_cache()
    gc.collect()

    # Train and validate
    val_loss, best_model, train_losses, val_losses, accuracies, precisions, recalls, f1_scores, csi_scores = train_and_validate(
        batch_size=batch_size,
        learning_rate=learning_rate,
        init_hid_dim=init_hid_dim,
        num_epochs=num_epochs,
        device=device
    )

    # Save the best model
    save_model_path(
        machine=machine,
        model=best_model,
        batch_size=batch_size,
        learning_rate=learning_rate,
        init_hid_dim=init_hid_dim,
        epochs=num_epochs,
        dir_output="model/models_trained"
    )

    # Save training and validation metrics
    save_losses_metrics(
        machine=machine,
        train_losses=train_losses,
        val_losses=val_losses,
        metrics=[accuracies, precisions, recalls, f1_scores, csi_scores],
        batch_size=batch_size,
        learning_rate=learning_rate,
        init_hid_dim=init_hid_dim,
        epochs=num_epochs,
        dir_output="model/losses_metrics"
    )

    # Define the test loader
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

    # Test the best model on the test dataset
    with torch.no_grad():
        test_loss, test_accuracy, test_precision, test_recall, test_f1_score, test_csi_score = validation_unet(
            best_model, test_loader, device=device, loss_f=loss_f
        )

    # Built from adjacent literals so the source indentation does not end up
    # inside the printed report (a backslash-continued string would embed it).
    print(
        'Average metrics for test dataset using model with best validation loss:\n\n'
        f'{loss_f} loss:          {test_loss:.3e}\n'
        f'Accuracy:          {test_accuracy:.3f}\n'
        f'Precision:         {test_precision:.3f}\n'
        f'Recall:            {test_recall:.3f}\n'
        f'F1 score:          {test_f1_score:.3f}\n'
        f'CSI score:         {test_csi_score:.3f}'
    )

    # Append results
    results.append((
        batch_size, learning_rate, init_hid_dim,
        float(val_loss), float(test_loss),
        float(test_accuracy), float(test_precision), float(test_recall),
        float(test_f1_score), float(test_csi_score)
    ))

    # Cleanup
    del best_model, test_loader
    gc.collect()
    torch.cuda.empty_cache()

# Save results to a CSV file.
# The notebook imports `pandas` without the `pd` alias, so use the module name
# directly instead of relying on a star-import to provide `pd`.
df_results = pandas.DataFrame(results, columns=[
    'Batch Size', 'Learning Rate', 'Init Hid Dim',
    'Validation Loss', 'Test Loss', 'Test Accuracy',
    'Test Precision', 'Test Recall', 'Test F1 Score', 'Test CSI Score'
])
df_results.to_csv(f"results_{machine}_{num_epochs}.csv", index=False)
print("Results saved.")


Testing: Batch size=8, Learning rate=0.01, Init Hid Dim=8
Number of parameters: 4.87e+05.
Epoch: 1 | Training Loss: 0.3359, Validation Loss: 0.1782, Best Validation Loss: 0.1782
Metrics | Accuracy: 0.925, Precision: 0.644, Recall: 0.662, F1-score: 0.653, CSI-score: 0.485
Learning Rate: 0.010000
Metrics saved at: model/losses_metrics/machine_1_losses_metrics_NDVI_bs8_lr0.01_hid8_epoch1.csv
Average metrics for test dataset using model with best validation loss:

            BCE loss:          1.742e-01
            Accuracy:          0.928
            Precision:         0.676
            Recall:            0.710
            F1 score:          0.692
            CSI score:         0.530
Testing: Batch size=8, Learning rate=0.01, Init Hid Dim=16
Number of parameters: 1.95e+06.
Epoch: 1 | Training Loss: 0.2299, Validation Loss: 0.1614, Best Validation Loss: 0.1614
Metrics | Accuracy: 0.925, Precision: 0.650, Recall: 0.648, F1-score: 0.649, CSI-score: 0.481
Learning Rate: 0.010000
Metrics save