# Summary

Notebook for distributed training.

# Imports/Setup

In [1]:
from accelerate import Accelerator, notebook_launcher
import torch
import numpy as np
import matplotlib.pyplot as plt
from wandb_helper import init_wandb, save_model_architecture, finish_run
from torch import nn
import torch.optim as optim
from safetensors.torch import load_file
from diffusers import UNet2DModel
import data
import dataset
import model
import training
import math
import utility
import op_train

2025-01-24 17:33:46.299468: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-24 17:33:46.314532: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-24 17:33:46.333056: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-24 17:33:46.338714: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-24 17:33:46.352928: I tensorflow/core/platform/cpu_feature_guar

In [3]:
class Config:
    """Central settings for the distributed operator-training run.

    All values are plain class attributes read by the later cells. Defining the
    class has a side effect: it calls `utility.validate_and_create_save_path`
    on `save_path`.

    The experiment being trained (`experiment_name`) and the experiment whose
    checkpoint is used as a warm start (`checkpoint_experiment_name`) are kept
    in separate attributes so that the run name reported to the tracker matches
    the directory the run is saved to.
    """
    # dataset
    path = '/data/users/jupyter-dam724/colliding_solutions'
    solver = 'ros2'
    fixed_seq_len = 216
    ahead = 3
    tail = 1
    aug = False
    upsample_size = 96

    # device (not used but needed for dataset)
    device_pref = 'cuda'
    device_ind = None

    # distributed training
    num_processes = 2
    per_gpu_batch_size = 32
    # NOTE(review): the original comment here read "(temporarily removed)", yet
    # this value is still what the training function passes as `batch_size`.
    total_batch_size = per_gpu_batch_size * num_processes
    workers_per_gpu = 6
    tworkers = workers_per_gpu * num_processes
    vworkers = workers_per_gpu * num_processes
    grad_accumulate = 1

    # optimization
    base_lr = 1e-5
    max_lr = 1e-4
    # sqrt scaling: total_batch_size / per_gpu_batch_size == num_processes
    lr = base_lr * math.sqrt(total_batch_size / (per_gpu_batch_size))

    # training
    epoches = 40
    timesteps = 4000
    loss_type = "simple"
    val_delay = 1
    patience = 50

    # experimentations
    project_name = "Operator Guided Diffusion"
    experiment_name = 'operator-training-big-multistep-nodrop'
    save_path = f'/data/users/jupyter-dam724/time-invariant-operator/checkpoint/{experiment_name}/'
    utility.validate_and_create_save_path(save_path, experiment_name)

    # Warm-start source. Stored under its own name so it does not overwrite
    # `experiment_name`, which is also used as the tracker run name.
    checkpoint_experiment_name = 'operator-training-big-multistep-lowerdropout'
    tset = 'valid'
    from_checkpoint = f'/data/users/jupyter-dam724/time-invariant-operator/checkpoint/{checkpoint_experiment_name}/{tset}/model.safetensors'

In [4]:
# Hand the whole Config class plus the project/run names to the wandb helper
# before training starts (presumably this opens the tracking run).
init_wandb(
    config_class=Config,
    project_name=Config.project_name,
    run_name=Config.experiment_name,
    save_path=Config.save_path,
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdavid724[0m ([33mdavid724-lehigh-university[0m). Use [1m`wandb login --relogin`[0m to force relogin




# Training

In [5]:
def acelerate_ddp():
    """Per-process training entry point executed by `notebook_launcher`.

    Steps, all driven by `Config`:
      1. Load the train/validation arrays via `data.main`.
      2. Build the dataloaders via `dataset.main_operator`.
      3. Construct the `UNet2DModel` and, if `Config.from_checkpoint` is set,
         load weights from that safetensors file.
      4. Create AdamW and a OneCycleLR schedule.
      5. Wrap dataloaders, model, optimizer and scheduler with
         `accelerator.prepare`.
      6. Run `op_train.accelerator_train_operator`.

    Takes no arguments and returns nothing.
    """
    accelerator = Accelerator(gradient_accumulation_steps=Config.grad_accumulate)

    data_params = {
        'path': Config.path,
        'device_pref': Config.device_pref,
        'solver': Config.solver,
        'fixed_seq_len': Config.fixed_seq_len,
        'ahead': Config.ahead,
        'tail': Config.tail,
        'device_ind': Config.device_ind
    }

    # First element of the returned tuple is not needed here.
    _, (x_train_data, y_train_data), (x_valid_data, y_valid_data) = data.main(**data_params)

    dataset_params = {
        'x_train_data': x_train_data,
        'y_train_data': y_train_data,
        'x_valid_data': x_valid_data,
        'y_valid_data': y_valid_data,
        # NOTE(review): Accelerate treats a dataloader's batch size as
        # per-process by default; confirm whether `dataset.main_operator`
        # expects the global or the per-GPU batch size here.
        'batch_size': Config.total_batch_size,
        # NOTE(review): worker counts are workers_per_gpu * num_processes and
        # this function runs once per process; confirm that is intended.
        'tworkers': Config.tworkers,
        'vworkers': Config.vworkers,
        'upsample_size': Config.upsample_size,
        'aug': Config.aug
    }

    train_dl, valid_dl = dataset.main_operator(**dataset_params)

    unet = UNet2DModel(
        sample_size=(Config.upsample_size, Config.upsample_size),
        in_channels=3,
        out_channels=1,
        layers_per_block=2,
        block_out_channels=(64, 128, 256, 256),
        # Channel counts below follow block_out_channels, one entry per block.
        down_block_types=(
            "DownBlock2D",      # 64 channels at 96x96
            "DownBlock2D",      # 128 channels at 48x48
            "AttnDownBlock2D",  # 256 channels at 24x24
            "AttnDownBlock2D"   # 256 channels at 12x12
        ),
        up_block_types=(
            "AttnUpBlock2D",
            "AttnUpBlock2D",
            "UpBlock2D",
            "UpBlock2D"
        ),
        dropout=0.0
    )

    # This function runs once per launched process; record the architecture
    # from the main process only so the ranks do not all write the same
    # artifact into Config.save_path.
    if accelerator.is_main_process:
        save_model_architecture(unet, Config.save_path)

    # Optional warm start from a previously saved safetensors checkpoint.
    if Config.from_checkpoint is not None:
        state_dict = load_file(Config.from_checkpoint)
        model.load_model_weights(unet, state_dict)

    optimizer = optim.AdamW(unet.parameters(), lr=Config.lr)

    # NOTE(review): per the PyTorch docs OneCycleLR derives its own starting
    # rate (max_lr / div_factor) and final rate (initial / final_div_factor),
    # so the `lr` given to AdamW above is likely superseded once the schedule
    # is attached. `steps_per_epoch` is measured before `accelerator.prepare`
    # wraps the dataloader; confirm it matches how often the training loop
    # steps the scheduler.
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=Config.max_lr,
        epochs=Config.epoches,
        steps_per_epoch=len(train_dl),
        pct_start=0.25,
        div_factor=1e4,
        final_div_factor=1e4
    )

    # Send everything through `accelerator.prepare`
    train_dl, valid_dl, unet, optimizer, scheduler = accelerator.prepare(
        train_dl, valid_dl, unet, optimizer, scheduler
    )

    # Per-process containers passed to the training loop for its loss history.
    train_log, valid_log = [], []

    training_params = {
        'accelerator': accelerator,
        'train': train_dl,
        'valid': valid_dl,
        'model': unet,
        'epochs': Config.epoches,
        'patience': Config.patience,
        'criterion': model.OperatorLoss(0.5, 0.5),
        'save_path': Config.save_path,
        'train_log': train_log,
        'valid_log': valid_log,
        'optimizer': optimizer,
        'scheduler': scheduler,
        'loading_bar': False,
        'val_delay': Config.val_delay
    }

    op_train.accelerator_train_operator(**training_params)

In [6]:
# Run `acelerate_ddp` in Config.num_processes processes from inside the notebook.
notebook_launcher(
    acelerate_ddp,
    args=(),
    num_processes=Config.num_processes,
)

Launching training on 2 GPUs.
Now using GPU.
Now using GPU.
Train size: 143697, Percent of toal: 74.68%, Unique instances: 700
Train size: 48714, Percent of toal: 25.32%, Unique instances: 240
Train size: 143697, Percent of toal: 74.68%, Unique instances: 700
Train size: 48714, Percent of toal: 25.32%, Unique instances: 240


grad.sizes() = [256, 512, 1, 1], strides() = [512, 1, 512, 512]
bucket_view.sizes() = [256, 512, 1, 1], strides() = [512, 1, 1, 1] (Triggered internally at ../torch/csrc/distributed/c10d/reducer.cpp:327.)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
grad.sizes() = [256, 512, 1, 1], strides() = [512, 1, 512, 512]
bucket_view.sizes() = [256, 512, 1, 1], strides() = [512, 1, 1, 1] (Triggered internally at ../torch/csrc/distributed/c10d/reducer.cpp:327.)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 1/40, Train Loss: 0.909004807472229, Validation Loss: 1.0518529415130615
Epoch 2/40, Train Loss: 0.8876733779907227, Validation Loss: 1.0355528593063354
Epoch 3/40, Train Loss: 0.8724039793014526, Validation Loss: 1.022350549697876
Epoch 4/40, Train Loss: 0.8645174503326416, Validation Loss: 1.015981912612915
Epoch 5/40, Train Loss: 0.8661057353019714, Validation Loss: 1.0313947200775146
Epoch 6/40, Train Loss: 0.8744868040084839, Validation Loss: 1.0546786785125732
Epoch 7/40, Train Loss: 0.8850003480911255, Validation Loss: 1.0320265293121338
Epoch 8/40, Train Loss: 0.8937674760818481, Validation Loss: 1.0396535396575928
Epoch 9/40, Train Loss: 0.8928471803665161, Validation Loss: 1.0770690441131592
Epoch 10/40, Train Loss: 0.8860424160957336, Validation Loss: 1.0214364528656006
Epoch 11/40, Train Loss: 0.867950975894928, Validation Loss: 1.0171575546264648
Epoch 12/40, Train Loss: 0.8480733633041382, Validation Loss: 0.9687395691871643
Epoch 13/40, Train Loss: 0.83027553558349

In [7]:
# Call the wandb helper's finish hook once training has returned
# (presumably this closes the tracking run and uploads the final summary).
finish_run()

VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,██▇▇▇▇████▇▇▇▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁
valid_loss,▇▆▆▆▆▇▆▇█▆▆▄▅▄▄▄▆▃▂▂▂▂▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,39
model_architecture,UNet2DModel(  (conv...
train_loss,0.50578
valid_loss,0.89245
