# Use Temporal Sentinel Data and HuggingFace to Train a Stacked ResNet Model

In [1]:
from time import time

In [2]:
from accelerate import Accelerator
from biomasstry.datasets import TemporalSentinel2Dataset, TemporalSentinel1Dataset
from biomasstry.models import TemporalSentinelModel
# from biomasstry.models.utils import run_training
import numpy as np
from pynvml import *
import torch
import torch.nn as nn
from torch.utils.data import random_split, DataLoader
from transformers import TrainingArguments, Trainer, logging
from tqdm.notebook import tqdm

In [3]:
logging.set_verbosity_error()

In [4]:
# Utility functions for printing GPU utilization
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
print_gpu_utilization()

GPU memory occupied: 261 MB.


## Dataset

In [6]:
S3_DIRECT = False  # Access S3 directly or as a mounted data source
USE_SENTINEL1 = True
if S3_DIRECT:
    data_url="s3://drivendata-competition-biomassters-public-us"
else:
    data_url = ""
if USE_SENTINEL1:
    ds = TemporalSentinel1Dataset(data_url=data_url)
    input_nc = 4
    n_tsamples = 12
else:
    ds = TemporalSentinel2Dataset(data_url=data_url)
    input_nc = 10
    n_tsamples = 5

In [7]:
torch.manual_seed(0)
train_size = int(0.8*len(ds))
valid_size = len(ds) - train_size
train_set, val_set = random_split(ds, [train_size, valid_size])
print(f"Train samples: {len(train_set)} "
      f"Val. samples: {len(val_set)}")

Train samples: 800 Val. samples: 200


## Model

In [8]:
model = TemporalSentinelModel(
    n_tsamples=n_tsamples, 
    input_nc=input_nc,
    output_nc=1,
)  # .to(device)

In [9]:
loss_module = nn.MSELoss(reduction='mean')  # .to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.02)

## Model Training with HuggingFace Transformers and Accelerate

In [10]:
default_args = {
    "output_dir": "/notebooks/artifacts",
    "overwrite_output_dir": "True",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

In [11]:
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    **default_args,
)

## DataLoaders

In [12]:
# DataLoaders
num_workers = 0
train_dl = DataLoader(train_set,
                      batch_size=training_args.per_device_train_batch_size,
                      shuffle=True,
                      pin_memory=True,
                      num_workers=num_workers)
val_dl = DataLoader(val_set,
                    batch_size=training_args.per_device_train_batch_size,
                    shuffle=False,
                    pin_memory=True,
                    num_workers=num_workers)

In [18]:
accel_model = Trainer(model, args=training_args, train_dataset=ds)

# HuggingFace Accelerator with Gradient Accumulation
accelerator = Accelerator(gradient_accumulation_steps=4, mixed_precision='fp16')

accel_model, optimizer, train_dl = accelerator.prepare(accel_model, optimizer, train_dl)

In [19]:
accel_model.train()
for step, batch in enumerate(train_dl, start=1):
    with accelerator.accumulate(model):
    loss = accel_model(**batch).loss
    loss = loss / training_args.gradient_accumulation_steps
    accelerator.backward(loss)
    if step % training_args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()



AttributeError: 'TemporalSentinelModel' object has no attribute 'gradient_checkpointing_enable'

In [None]:
train_dl = DataLoader(train_set, batch_size=

In [10]:
batch_size = 16
num_workers = 6

train_dataloader = DataLoader(train_set,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=num_workers,
                            pin_memory=True
                            )

val_dataloader = DataLoader(val_set,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=num_workers,
                            pin_memory=True
                        )

## Model Training

In [11]:
# Train and Validation Loops
def train_loop(dataloader, model, loss_fn, optimizer):
    train_metrics = []
    
    print('Training')
    for ix, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        X = [img_data.to(device) for img_data in batch['image']]
        y = batch['target'].to(device)
        
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_metrics.append(np.round(np.sqrt(loss.item()), 5))
            
    return train_metrics

In [12]:
def valid_loop(dataloader, model, loss_fn):
    num_batches = len(dataloader)
    valid_loss = 0
    valid_metrics = {}

    print('Validation')
    with torch.no_grad():
        for batch in tqdm(dataloader, total=num_batches):
            X = [img_data.to(device) for img_data in batch['image']]
            y = batch['target'].to(device)
            
            pred = model(X)
            valid_loss += loss_fn(pred, y).item()
            
    valid_loss /= num_batches
    valid_rmse = np.round(np.sqrt(valid_loss), 5)
    print(f"Validation Error: \n RMSE: {valid_rmse:>8f} \n")
    return valid_rmse

In [13]:
def run_training(model, loss_module, optimizer, train_dataloader, val_dataloader, save_path, n_epochs=10):
    min_valid_metric = np.inf
    train_metrics = []
    valid_metrics = []
    
    total_train_time = 0
    total_val_time = 0

    for ix in range(n_epochs):
        print(f"\n-------------------------------\nEpoch {ix+1}")
        start = time()
        train_metrics_epoch = train_loop(train_dataloader, model, loss_module, optimizer)
        end = time()
        train_time = end - start
        total_train_time += train_time
        train_metrics.extend(train_metrics_epoch)
        
        start = time()
        valid_metrics_epoch = valid_loop(val_dataloader, model, loss_module)
        end = time()
        val_time = end - start
        total_val_time += val_time
        valid_metrics.append((len(train_metrics), valid_metrics_epoch))

        # check validation score, if improved then save model
        if min_valid_metric > valid_metrics_epoch:
            print(f'Validation RMSE Decreased({min_valid_metric:.6f}--->{valid_metrics_epoch:.6f}) \t Saving The Model')
            min_valid_metric = valid_metrics_epoch

            # Saving State Dict
            torch.save(model.state_dict(), save_path)
        print(f"Train time: {train_time}. Validation time: {val_time}")
    print("Done!")
    print(f"Total train time: {total_train_time} s. Avg. time per epoch: {total_train_time / n_epochs}")
    print(f"Total val time: {total_val_time} s. Avg. time per epoch: {total_val_time / n_epochs}")
    train_metrics_zipped = list(zip(np.arange(0, len(train_metrics)), train_metrics))
    
    return {'training': train_metrics_zipped, 'validation': valid_metrics}

In [14]:
artifacts_dir = "/notebooks/artifacts"
if USE_SENTINEL1:
    model_name = "TemporalS1"
else:
    model_name = "TemporalS2"
n_epochs = 2
date = "20230105"
save_path = artifacts_dir + f"/{date}_{model_name}_B{batch_size}_E{n_epochs}.pt"

In [15]:
metrics = run_training(model=model,
                    loss_module=loss_module,
                    optimizer=optimizer,
                    train_dataloader=train_dataloader,
                    val_dataloader=val_dataloader,
                    save_path=save_path,
                    n_epochs=n_epochs)


-------------------------------
Epoch 1
Training


  0%|          | 0/800 [00:00<?, ?it/s]

Validation


  0%|          | 0/200 [00:00<?, ?it/s]

Validation Error: 
 RMSE: 155599275856489.500000 

Validation RMSE Decreased(inf--->155599275856489.500000) 	 Saving The Model
Train time: 1280.251543521881. Validation time: 96.95875835418701

-------------------------------
Epoch 2
Training


  0%|          | 0/800 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
##### Save the metrics to a file
import pandas as pd
train_metrics_df = pd.DataFrame(metrics['training'], columns=["step", "score"])
val_metrics_df = pd.DataFrame(metrics["validation"], columns=["step", "score"])
train_metrics_df.to_csv(artifacts_dir + "/train_metrics.csv")
val_metrics_df.to_csv(artifacts_dir + "/val_metrics.csv")