# Summary

Notebook for distributed (multi-GPU) training of an unconditional diffusion UNet with Hugging Face Accelerate.

# Imports/Setup

In [1]:
from accelerate import Accelerator, notebook_launcher
import torch
import numpy as np
import matplotlib.pyplot as plt
from wandb_helper import init_wandb, save_model_architecture, finish_run
from torch import nn
import torch.optim as optim
from safetensors.torch import load_file
from diffusers import UNet2DModel
import data
import dataset
import model
import training
import math
import utility

2025-01-16 22:10:27.576195: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-16 22:10:27.590555: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-16 22:10:27.608898: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-16 22:10:27.614548: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-16 22:10:27.627916: I tensorflow/core/platform/cpu_feature_guar

In [2]:
class Config:
    """Static settings for this run, read as plain class attributes.

    The class body is executed once when the cell runs; as part of that,
    ``utility.validate_and_create_save_path`` is called on ``save_path``.
    """

    # --- dataset ---
    path = "/data/users/jupyter-dam724/colliding_solutions"
    solver = "ros2"
    fixed_seq_len = 216
    ahead = 1
    tail = 1
    aug = False

    # --- device (not used for training itself, but required by the data loader args) ---
    device_pref = "cuda"
    device_ind = None

    # --- distributed training ---
    num_processes = 2
    per_gpu_batch_size = 4
    # Batch size summed over all processes (gradient accumulation not included).
    total_batch_size = num_processes * per_gpu_batch_size
    workers_per_gpu = 6
    # Train / validation worker counts summed over all processes.
    tworkers = vworkers = num_processes * workers_per_gpu
    grad_accumulate = 8

    # --- optimization ---
    base_lr = 1e-5
    # Square-root LR scaling; the ratio below equals num_processes.
    lr = math.sqrt(total_batch_size / per_gpu_batch_size) * base_lr

    # --- training ---
    epoches = 50
    timesteps = 4000
    loss_type = "simple"

    # --- experiment bookkeeping ---
    project_name = "Operator Guided Diffusion"
    experiment_name = "unconditional-init-run"
    save_path = f'/data/users/jupyter-dam724/time-invariant-operator/checkpoint/{experiment_name}/'
    # Runs at class-definition time, before any training code touches save_path.
    utility.validate_and_create_save_path(save_path, experiment_name)
    # Path to a safetensors checkpoint to resume from, or None to start fresh.
    from_checkpoint = None

In [3]:
# Open the W&B run: the project/run names, the checkpoint directory and the
# Config class itself are handed to the wandb_helper wrapper.
init_wandb(
    save_path=Config.save_path,
    config_class=Config,
    run_name=Config.experiment_name,
    project_name=Config.project_name,
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdavid724[0m ([33mdavid724-lehigh-university[0m). Use [1m`wandb login --relogin`[0m to force relogin




# Training

In [4]:
def acelerate_ddp():
    """Per-process training entry point, intended to be run via ``notebook_launcher``.

    Every launched process executes this function: it loads the data, builds
    the training dataloader and the UNet, optionally restores weights from
    ``Config.from_checkpoint``, wraps dataloader/model/optimizer/scheduler with
    ``accelerator.prepare`` and then calls ``training.accelerator_train``.

    Takes no arguments and returns nothing; all settings come from ``Config``.
    """
    accelerator = Accelerator(gradient_accumulation_steps=Config.grad_accumulate)

    data_params = {
        'path': Config.path,
        'device_pref': Config.device_pref,
        'solver': Config.solver,
        'fixed_seq_len': Config.fixed_seq_len,
        'ahead': Config.ahead,
        'tail': Config.tail,
        'device_ind': Config.device_ind
    }

    # Only the training inputs are consumed below; the targets and the
    # validation split returned by data.main are not used in this run.
    _, (x_train_data, _), _ = data.main(**data_params)

    # This function runs once per process, so each process builds its own
    # dataloader, and `accelerator.prepare` treats a dataloader's batch size as
    # the per-process batch size (unless split_batches=True). Passing the
    # all-process totals here would therefore give every GPU
    # `num_processes` times the batch size and worker count Config declares,
    # so the per-GPU values are used instead.
    # NOTE(review): assumes dataset.main forwards these straight to a
    # torch DataLoader - confirm.
    dataset_params = {
        'x_train_data': x_train_data,
        'batch_size': Config.per_gpu_batch_size,
        't_timesteps': Config.timesteps,
        'tworkers': Config.workers_per_gpu,
        'vworkers': Config.workers_per_gpu,
        'aug': Config.aug
    }

    train_dl = dataset.main(**dataset_params)

    # Positional arguments: sample size (224, 224), 1 input channel, 1 output channel.
    unet = UNet2DModel(
        (224, 224), 1, 1,
        layers_per_block=4,
        block_out_channels=(128, 128, 256, 128),
        down_block_types=(
            "DownBlock2D",
            "DownBlock2D",
            "DownBlock2D",
            "DownBlock2D"
        ),
        up_block_types=(
            "UpBlock2D",
            "UpBlock2D",
            "UpBlock2D",
            "UpBlock2D"
        ),
    )

    # All processes share Config.save_path; let only the main one write the
    # architecture so they do not race on the same output.
    if accelerator.is_main_process:
        save_model_architecture(unet, Config.save_path)

    if Config.from_checkpoint is not None:
        state_dict = load_file(Config.from_checkpoint)
        model.load_model_weights(unet, state_dict)

    optimizer = optim.AdamW(unet.parameters(), lr=Config.lr)
    # NOTE(review): T_max is expressed in epochs, which presumes the training
    # loop steps the scheduler once per epoch - confirm in
    # training.accelerator_train, and check how the prepared scheduler
    # advances in multi-process runs.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=Config.epoches)

    # Send everything through `accelerator.prepare`
    train_dl, unet, optimizer, scheduler = accelerator.prepare(
        train_dl, unet, optimizer, scheduler
    )

    # Passed to the training loop as its 'train_log' argument.
    train_log = []

    training_params = {
        'accelerator': accelerator,
        'train': train_dl,
        'model': unet,
        'epochs': Config.epoches,
        'criterion': nn.MSELoss(),
        'save_path': Config.save_path,
        'loss_type': Config.loss_type,
        'train_log': train_log,
        'optimizer': optimizer,
        'scheduler': scheduler,
        't_timesteps': Config.timesteps,
        'loading_bar': False
    }

    training.accelerator_train(**training_params)

In [None]:
# Launch Config.num_processes processes, each calling acelerate_ddp() with no arguments.
notebook_launcher(
    acelerate_ddp,
    args=(),
    num_processes=Config.num_processes,
)

Launching training on 2 GPUs.
Now using GPU.
Now using GPU.
Train size: 145097, Percent of toal: 74.68%, Unique instances: 700
Train size: 49194, Percent of toal: 25.32%, Unique instances: 240
Train size: 145097, Percent of toal: 74.68%, Unique instances: 700
Train size: 49194, Percent of toal: 25.32%, Unique instances: 240


In [None]:
# Close out the run through the wandb_helper module after the training cell.
finish_run()