# Use HuggingFace Accelerate to Train Model on Temporal (monthly) Sentinel Data

## Imports

In [1]:
import torch.multiprocessing as mp
from time import time

In [2]:
from accelerate import Accelerator, init_empty_weights, notebook_launcher
from accelerate.utils import set_seed
from biomasstry.datasets import TemporalSentinel2Dataset, TemporalSentinel1Dataset
from biomasstry.models import TemporalSentinelModel, UTAE
from biomasstry.models.unet_tae import ConvBlock
# from biomasstry.models.utils import run_training
import numpy as np
import pandas as pd
from pynvml import *
import torch
import torch.nn as nn
from torch.utils.data import random_split, DataLoader
from transformers import TrainingArguments, Trainer, logging
from tqdm.auto import tqdm

In [3]:
logging.set_verbosity_error()

In [4]:
mp.set_start_method("forkserver")
mp.set_forkserver_preload(["torch"])

## Utility Functions

In [5]:
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())

1.12.0+cu116
11.6
8302


In [6]:
# Utility functions for printing GPU utilization
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [7]:
print_gpu_utilization()

GPU memory occupied: 261 MB.


## Dataset and DataLoaders

In [8]:
def get_dataloaders(dataset: str, batch_size: int=8, num_workers: int=6):
    """Return train and eval DataLoaders with specified batch size.
    
    dataset: str
        Dataset identifier. Must be one of "Sentinel-1A", "Sentinel-1D" or "Sentinel-2All"
    batch_size: int
        batch size for each batch.
    """
    # If True, access directly S3.
    # If False, assume data is mounted and available under '/datasets/biomassters'
    S3_DIRECT = False
    if S3_DIRECT:
        data_url="s3://drivendata-competition-biomassters-public-us"
    else:
        data_url = ""

    if dataset == "Sentinel-1A": # Sentinel-1 Ascending only
        ds = TemporalSentinel1Dataset(data_url=data_url, bands=["VVA", "VHA"])
    elif dataset == "Sentinel-1D": # Sentinel-1 Descending only
        ds = TemporalSentinel1Dataset(data_url=data_url, bands=["VVD", "VHD"])
    elif dataset == "Sentinel-2all":
        ds = TemporalSentinel2Dataset(data_url=data_url)
    else:
        print("Unrecognized dataset identifier. Must be one of 'Sentinel-1A', 'Sentinel-1D' or 'Sentinel-2all'")
        return None, None

    train_size = int(0.8*len(ds))
    valid_size = len(ds) - train_size
    train_set, eval_set = random_split(ds, [train_size, valid_size])

    print(f"Train samples: {len(train_set)} "
        f"Val. samples: {len(eval_set)}")

    # DataLoaders
    pin_memory = True
    train_dataloader = DataLoader(train_set,
                        batch_size=batch_size,
                        shuffle=True,
                        pin_memory=pin_memory,
                        num_workers=num_workers)
    eval_dataloader = DataLoader(eval_set,
                        batch_size=batch_size * 2,
                        shuffle=False,
                        pin_memory=pin_memory,
                        num_workers=num_workers)
    
    return train_dataloader, eval_dataloader

## Training Loop

In [9]:
def training_loop(dataset: str,
                  mixed_precision: str="fp16",
                  seed: int=123,
                  batch_size: int=8,
                  gradient_accumulation_steps: int=4,
                  nb_epochs=2,
                  train_mode: str=""
    ):
    """Main Training and Evaluation Loop to be called by accelerator.notebook_launcher()."""
    print(f"Args: {mixed_precision}, {seed}, {batch_size}, "
          f"{gradient_accumulation_steps}, {nb_epochs}, {train_mode}")

    # Set random seed
    set_seed(seed)

    # Initialize Accelerator
    accelerator = Accelerator(mixed_precision=mixed_precision,
        gradient_accumulation_steps=gradient_accumulation_steps)

    # Build DataLoaders
    train_dataloader, eval_dataloader = get_dataloaders(dataset, batch_size=batch_size)

    # Assign model inputs based on dataset
    if dataset == "Sentinel-1A":
        input_nc = 2
        n_tsamples = 6
    elif dataset == "Sentinel-1D":
        input_nc = 2
        n_tsamples = 6
    else:
        input_nc = 10
        n_tsamples = 5

    # Create model
    if train_mode == "tune":
        print(f"Fine tuning pre-trained weights")
        print(f"Loading weights from {pretrained_weights_path}")
        saved_dict = torch.load(pretrained_weights_path)
        model = UTAE(10, out_conv=[32, 20])  # Initialize the original model & load pre-trained weights
        model.load_state_dict(saved_dict["state_dict"], )
        model.out_conv = ConvBlock([32, 32, 1], padding_mode="reflect")  # Modify the last layer
        lr = 0.001
        print("Pre-trained weights loaded successfully.")
    else:
        model = UTAE(input_nc)  # modify output layer to predict AGBM
        lr = 0.01
        if train_mode == "resume":
            print("Resuming training...")
            print(f"Loading model from {saved_state_path}")
            state_dict = torch.load(saved_state_path)  # , map_location=accelerator.device)
            model.load_state_dict(state_dict)
            print(f"Model loaded successfully.")
    
    # model = UTAE(input_nc)

    loss_function = nn.MSELoss(reduction='mean')  # Loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # Optimizer
    
    # Prepare everything to use accelerator
    # Maintain order while unpacking
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model,
                                                                optimizer,
                                                                train_dataloader,
                                                                eval_dataloader)
    min_valid_metric = np.inf
    # Training loop
    for i in tqdm(range(nb_epochs), disable=not accelerator.is_local_main_process):
        accelerator.print(f"Epoch {i+1}")
        epoch_start = time()
        for b, batch in enumerate(tqdm(train_dataloader, disable=not accelerator.is_local_main_process)):
            inputs, targets, chip_id = batch
            with accelerator.accumulate(model):
                outputs = model(inputs)
                loss = loss_function(outputs, targets)
                accelerator.backward(loss)
                optimizer.step()
                optimizer.zero_grad()
    
        epoch_end = time()
        accelerator.print(f"  Training time: {epoch_end - epoch_start}")
        
        # Save Model State Dict after each epoch in order to continue training later
        unwrap_model = accelerator.unwrap_model(model)  # Unwrap the Accelerator model
        train_model_path = save_path[:-3] + f"_E{i+6}.pt"
        accelerator.save(unwrap_model.state_dict(), train_model_path)
        accelerator.print(f"  Model file path: {train_model_path}")

        # Validation Loop
        val_loss = 0.0
        num_elements = 0
        for batch in tqdm(eval_dataloader, disable=not accelerator.is_local_main_process):
            inputs, targets, _ = batch
            with torch.no_grad():
                predictions = model(inputs)
            # Gather all predictions and targets
            all_predictions, all_targets = accelerator.gather_for_metrics((predictions, targets))
            num_elements += all_predictions.shape[0]
            val_loss += loss_function(all_predictions, all_targets).item()

        val_loss /= num_elements
        val_rmse = np.round(np.sqrt(val_loss), 5)
        accelerator.print(f"  Validation RMSE: {val_rmse:>8f}")
        # check validation score, if improved then save model
        if min_valid_metric > val_rmse:
            accelerator.print(f"  Validation RMSE Decreased({min_valid_metric:.6f}--->{val_rmse:.6f})")
            min_valid_metric = val_rmse

            # Saving Model State Dict
            unwrap_model = accelerator.unwrap_model(model)  # Unwrap the Accelerator model
            accelerator.save(unwrap_model.state_dict(), best_model_path)
            accelerator.print(f"  Best Model file path: {best_model_path}")

In [None]:
dataset = "Sentinel-2all"
mixed_precision = "fp16"
seed = 123
batch_size = 8
gradient_accumulation_steps = 4
nb_epochs = 15
train_mode = "resume"

artifacts_dir = "/notebooks/artifacts"
model_name = "UTAE"
date = "20230118"
pretrained_weights_path = artifacts_dir + "/pretrained_utae/f1model.pth.tar"  # for fine tuning
saved_state_path = artifacts_dir + "/20230118_UTAE_Sentinel-2all_B32_tune_E5.pt"  # for resuming training

save_path = artifacts_dir + (f"/{date}_{model_name}_{dataset}_B"
        f"{batch_size * gradient_accumulation_steps}_{train_mode}.pt")
best_model_path = save_path[:-3] + "_BEST.pt"

# Notebook Launcher for distributed training
train_args = (dataset, mixed_precision, seed, batch_size, gradient_accumulation_steps, nb_epochs, train_mode)
notebook_launcher(training_loop, train_args, num_processes=1)

Launching training on one GPU.
Args: fp16, 123, 8, 4, 15, resume
Train samples: 6951 Val. samples: 1738
Resuming training...
Loading model from /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_tune_E5.pt
Model loaded successfully.


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch 1


  0%|          | 0/869 [00:00<?, ?it/s]

  Training time: 970.5138721466064
  Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_E6.pt


  0%|          | 0/109 [00:00<?, ?it/s]

  Validation RMSE: 22.675860
  Validation RMSE Decreased(inf--->22.675860)
  Best Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_BEST.pt
Epoch 2


  0%|          | 0/869 [00:00<?, ?it/s]

  Training time: 850.9564437866211
  Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_E7.pt


  0%|          | 0/109 [00:00<?, ?it/s]

  Validation RMSE: 21.989560
  Validation RMSE Decreased(22.675860--->21.989560)
  Best Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_BEST.pt
Epoch 3


  0%|          | 0/869 [00:00<?, ?it/s]

  Training time: 871.3295848369598
  Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_E8.pt


  0%|          | 0/109 [00:00<?, ?it/s]

  Validation RMSE: 21.342640
  Validation RMSE Decreased(21.989560--->21.342640)
  Best Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_BEST.pt
Epoch 4


  0%|          | 0/869 [00:00<?, ?it/s]

  Training time: 868.7869493961334
  Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_E9.pt


  0%|          | 0/109 [00:00<?, ?it/s]

  Validation RMSE: 20.668860
  Validation RMSE Decreased(21.342640--->20.668860)
  Best Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_BEST.pt
Epoch 5


  0%|          | 0/869 [00:00<?, ?it/s]

  Training time: 846.2696478366852
  Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_E10.pt


  0%|          | 0/109 [00:00<?, ?it/s]

  Validation RMSE: 20.031980
  Validation RMSE Decreased(20.668860--->20.031980)
  Best Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_BEST.pt
Epoch 6


  0%|          | 0/869 [00:00<?, ?it/s]

  Training time: 875.7249212265015
  Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_E11.pt


  0%|          | 0/109 [00:00<?, ?it/s]

  Validation RMSE: 19.420370
  Validation RMSE Decreased(20.031980--->19.420370)
  Best Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_BEST.pt
Epoch 7


  0%|          | 0/869 [00:00<?, ?it/s]

  Training time: 873.7554521560669
  Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_E12.pt


  0%|          | 0/109 [00:00<?, ?it/s]

  Validation RMSE: 18.784620
  Validation RMSE Decreased(19.420370--->18.784620)
  Best Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_BEST.pt
Epoch 8


  0%|          | 0/869 [00:00<?, ?it/s]

  Training time: 850.6407158374786
  Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_E13.pt


  0%|          | 0/109 [00:00<?, ?it/s]

  Validation RMSE: 18.220880
  Validation RMSE Decreased(18.784620--->18.220880)
  Best Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_BEST.pt
Epoch 9


  0%|          | 0/869 [00:00<?, ?it/s]

  Training time: 849.4932560920715
  Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_E14.pt


  0%|          | 0/109 [00:00<?, ?it/s]

  Validation RMSE: 17.630090
  Validation RMSE Decreased(18.220880--->17.630090)
  Best Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_BEST.pt
Epoch 10


  0%|          | 0/869 [00:00<?, ?it/s]

  Training time: 900.840512752533
  Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_E15.pt


  0%|          | 0/109 [00:00<?, ?it/s]

  Validation RMSE: 17.053880
  Validation RMSE Decreased(17.630090--->17.053880)
  Best Model file path: /notebooks/artifacts/20230118_UTAE_Sentinel-2all_B32_resume_BEST.pt
Epoch 11


  0%|          | 0/869 [00:00<?, ?it/s]