In [None]:
# Development convenience: pick up edits to imported modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp hpc

# hpc

> routines for running on clusters

This part isn't strictly for audio i/o, but...sue me. The point of this package is to reduce code-copying between Harmonai projects. 

In [None]:
#|hide
from nbdev.showdoc import *

In [None]:
#|export 
import yaml
import accelerate
from pathlib import Path
import torch
import torchaudio
from torchaudio import transforms as T
import os

In [None]:
#|export 
def get_accel_config(filename='~/.cache/huggingface/accelerate/default_config.yaml'):
    """Get huggingface accelerate config info.

    The config is read from the YAML file `filename` (a leading `~` is expanded
    to the user's home directory). If the file cannot be opened, or does not
    contain a mapping (e.g. it is empty), an empty dict is used instead.

    The environment variables MAIN_PROCESS_IP, MACHINE_RANK, NUM_MACHINES and
    NUM_PROCESSES, when set, override the corresponding lower-case keys.
    The three count/rank variables are converted to `int` when they parse as
    integers; otherwise the raw string is kept.

    Returns the resulting config dict.
    """
    try:  # first try to use the default file
        # expanduser only expands a *leading* '~' (a blanket str.replace would
        # also rewrite tildes in the middle of a path) and accepts Path objects.
        path = os.path.expanduser(filename)
        with open(path, 'r') as file:
            ac = yaml.safe_load(file)
    except OSError:
        ac = {}

    # An empty YAML file loads as None; anything that is not a mapping cannot
    # take the key assignments below, so fall back to an empty config.
    if not isinstance(ac, dict):
        ac = {}

    # then update using any environment variables
    env_overrides = (
        # (environment variable, config key, convert to int?)
        ('MAIN_PROCESS_IP', 'main_process_ip', False),
        ('MACHINE_RANK',    'machine_rank',    True),
        ('NUM_MACHINES',    'num_machines',    True),
        ('NUM_PROCESSES',   'num_processes',   True),
    )
    for var, key, as_int in env_overrides:
        value = os.getenv(var)
        if value is None:
            continue
        if as_int:
            try:
                value = int(value)
            except ValueError:
                pass  # not an integer: keep the raw string rather than failing
        ac[key] = value

    return ac

Let's test that:

In [None]:
# Read the config from a relative example path instead of the default location.
# Any of MAIN_PROCESS_IP / MACHINE_RANK / NUM_MACHINES / NUM_PROCESSES set in the
# environment will override the values from the file.
ac = get_accel_config('examples/accel_config.yaml')
ac  # bare last expression so the notebook displays the resulting dict

{'compute_environment': 'LOCAL_MACHINE',
 'deepspeed_config': {},
 'distributed_type': 'MULTI_GPU',
 'fsdp_config': {},
 'machine_rank': 0,
 'main_process_ip': '',
 'main_process_port': 12332,
 'main_training_function': 'main',
 'mixed_precision': 'no',
 'num_machines': 2,
 'num_processes': 8,
 'use_cpu': False}

Next is a little utility to replace `print` that only prints on the cluster head node. Note that `hprint` accepts a single string, so use f-strings to build the message.

In [None]:
#|export        
class HostPrinter():
    "small accelerate helper: a print replacement that is silent everywhere except the main process"
    def __init__(self, accelerator, tag='\033[96m', untag='\033[0m'): # tag/untag wrap the message (defaults are ANSI color codes)
        self.accelerator = accelerator
        self.tag = tag
        self.untag = untag

    def __call__(self, s:str):
        # Guard clause: anything that is not the main process prints nothing.
        if not self.accelerator.is_main_process:
            return
        message = self.tag + s + self.untag
        print(message, flush=True)

Here's a test:

In [None]:
# Test HostPrinter: build an accelerator, wrap it, and print one message through it.
accelerator = accelerate.Accelerator()
device = accelerator.device
hprint = HostPrinter(accelerator)  # prints only when accelerator.is_main_process is true
hprint(f'Using device: {device}')  # single string argument, so format with an f-string

[96mUsing device: cuda[0m


## PyTorch+Accelerate Model routines
For when the model is wrapped in an `accelerate` accelerator

In [None]:
#|export 
def save(accelerator, args, model, opt=None, epoch=None, step=None):
    """For checkpointing & model saves.

    Builds a dict with the unwrapped model's `state_dict()` under 'model', plus
    'opt' (optimizer `state_dict()`), 'epoch' and 'step' for whichever of those
    arguments are not None, and hands it to `accelerator.save`.

    The output file is `{args.name}_{step:08}.pth` when `step` is given,
    otherwise `{args.name}.pth`. `epoch` is stored but not used in the filename.
    """
    accelerator.wait_for_everyone()  # presumably a cross-process barrier before saving
    filename = f'{args.name}_{step:08}.pth' if (step is not None) else f'{args.name}.pth'
    if accelerator.is_main_process:
        # tqdm is not imported at module level in this file, so referencing it
        # directly raised NameError. Import it here; if it is unavailable fall
        # back to plain print so that saving still proceeds.
        try:
            from tqdm import tqdm
            announce = tqdm.write
        except ImportError:
            announce = print
        announce(f'Saving to {filename}...')
    obj = {'model': accelerator.unwrap_model(model).state_dict() }
    if opt is not None:   obj['opt'] = opt.state_dict()
    if epoch is not None: obj['epoch'] = epoch
    if step is not None:  obj['step'] = step
    accelerator.save(obj, filename)

## Utils for Accelerate or Lightning
Be sure to unwrap any accelerate model (e.g. with `accelerator.unwrap_model`) before calling these

In [None]:
#|export 
def n_params(
    module, # raw PyTorch model/module, e.g. returned by accelerator.unwrap_model()
    trainable_only=False, # if True, count only parameters whose requires_grad is set
    ):
    """Returns the number of parameters in a module.
    By default every parameter is counted, whether or not it requires gradients;
    pass trainable_only=True to count only the trainable ones.
    Be sure to use accelerator.unwrap_model when calling this.  """
    return sum(
        p.numel()
        for p in module.parameters()
        if not trainable_only or p.requires_grad
    )

In [None]:
#|export 
def freeze(
    model  # raw PyTorch model, e.g. returned by accelerator.unwrap_model()
    ):
    """Freezes model weights in place by setting requires_grad to False on every parameter.
    If using accelerate, pass the result of accelerator.unwrap_model when calling this.  """
    for weight in model.parameters():
        weight.requires_grad = False

In [None]:
#| hide
# Final cell: run nbdev's export step (presumably writes the `#|export` cells
# above to the module named by `default_exp` — confirm against the nbdev docs).
from nbdev import nbdev_export
nbdev_export()