In [1]:
import multiprocessing
import nvidia_smi
from os import path as osp, getcwd, environ
import torch

In [2]:
# Check GPU(s): `-L` prints one line per GPU the driver can see (index, model, UUID)
!nvidia-smi -L

GPU 0: Tesla V100-SXM3-32GB (UUID: GPU-872c2e61-392f-1887-5433-dd78e7f6bc11)


In [3]:
# Number of CUDA devices reported by PyTorch. In this notebook it is only read
# by the `distributed` setup (WORLD_SIZE) and the commented-out --ngpus flag.
n_gpus = torch.cuda.device_count()

In [4]:
# Number of data-loading workers.
# The value is pinned to 64; the CPU-derived alternative is kept for reference:
#   n_workers = multiprocessing.cpu_count() - 2
n_workers = 64
n_workers

64

In [5]:
# Choose batch size based on amount of available video memory
# Tailored for the shape of each image/mask is (256,256):
# the estimate is 4 samples per 1e9 bytes of free memory on GPU 0, and the
# result is the largest power of two (1..512) strictly below that estimate.
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
avail_gpu_memory = info.free  # free device memory as reported by NVML (bytes)
batch_size_choices = [2**n for n in range(0,10)]
batch_size_pre_alignment = (avail_gpu_memory / 1e9)*4  # 1e9 is the same value as the former 10e8
# When very little memory is free, no candidate is below the estimate. Fall
# back to the smallest choice instead of failing with IndexError on an empty list.
batch_size = max(
    (x for x in batch_size_choices if x < batch_size_pre_alignment),
    default=batch_size_choices[0],
)
batch_size
# Multiply final result by 4 if image/mask shape is (128,128)
# Or divide final result by 4 if image/mask shape is (512,512)

128

In [6]:
# For 512x512 images and segmentation masks: use a quarter of the 256x256
# batch size, but never less than 1 (plain floor division maps 1 or 2 to 0).
# NOTE: batch_size is reassigned from itself, so re-running this cell divides again.
batch_size = max(1, batch_size // 4)

In [7]:
# Display the batch size after the 512x512 adjustment made in the previous cell
batch_size

32

In [8]:
# Switch for the torch.distributed environment setup below (off by default).
# see https://pytorch.org/docs/stable/distributed.html and related documentation for more info
distributed = False
if distributed:
    # One process per visible GPU, this process as rank 0, rendezvous on the
    # local loopback address at port 5000.
    environ.update(
        WORLD_SIZE=str(n_gpus),
        RANK="0",
        MASTER_ADDR="127.0.0.1",
        MASTER_PORT="5000",
    )

In [10]:
# Training config variables
def _beside_notebook(relative):
    """Join `relative` onto the current working directory, without normalising it."""
    return osp.join(getcwd(), relative)


exp_name = "segformerB3_mix"
train_script = _beside_notebook("../train.py")
model_path = _beside_notebook("../models/segformerB3_mix.py")
dataset_path = _beside_notebook("../config/mix_datasets_config.yml")
pretrained_weights = _beside_notebook("../pretrained/segformer_b3/mit_b3.pth")

In [12]:
# Display the assembled training command.
# NOTE(review): execute_training_cmd is assigned in the "Run training" cell,
# which appears *after* this one in the notebook. On a fresh kernel run top to
# bottom, this cell raises NameError unless that cell has been executed first.
execute_training_cmd

'python /nfs/hpc/share/wigginno/branching/ClickSEG/notebooks/../train.py /nfs/hpc/share/wigginno/branching/ClickSEG/notebooks/../models/segformerB3_mix.py --pretrained_weights=/nfs/hpc/share/wigginno/branching/ClickSEG/notebooks/../pretrained/segformer_b3/mit_b3.pth --dataset_path=/nfs/hpc/share/wigginno/branching/ClickSEG/notebooks/../config/mix_datasets_config.yml --gpus=0 --workers=64 --batch-size=16 --exp-name=segformerB3_mix'

In [11]:
# Run training
train_args = [
    model_path,
    f"--pretrained_weights={pretrained_weights}",
    f"--dataset_path={dataset_path}",
    "--gpus=0",
    #f"--ngpus={n_gpus}",
    f"--workers=64",#{n_workers}",
    f"--batch-size=16",#{batch_size}",
    f"--exp-name={exp_name}",
    #"--resume-exp=000",
    #"--resume-prefix=42",
    #"--start-epoch=43"
]

execute_training_cmd = f"python {train_script} {' '.join(train_args)}"
!{execute_training_cmd}

Number of GPUs: 1
Run experiment with config:
{   'CHECKPOINTS_PATH': PosixPath('/nfs/hpc/share/wigginno/branching/ClickSEG/experiments/segformerB3_tubes/036_segformerB3_mix/checkpoints'),
    'EXP_PATH': PosixPath('/nfs/hpc/share/wigginno/branching/ClickSEG/experiments/segformerB3_tubes/036_segformerB3_mix'),
    'LOGS_PATH': PosixPath('/nfs/hpc/share/wigginno/branching/ClickSEG/experiments/segformerB3_tubes/036_segformerB3_mix/logs'),
    'VIS_PATH': PosixPath('/nfs/hpc/share/wigginno/branching/ClickSEG/experiments/segformerB3_tubes/036_segformerB3_mix/vis'),
    'batch_size': 16,
    'dataset_path': '/nfs/hpc/share/wigginno/branching/ClickSEG/notebooks/../config/mix_datasets_config.yml',
    'device': device(type='cuda', index=0),
    'distributed': False,
    'exp_name': 'segformerB3_mix',
    'gpu_ids': [0],
    'gpus': '0',
    'local_rank': 0,
    'model_path': '/nfs/hpc/share/wigginno/branching/ClickSEG/notebooks/../models/segformerB3_mix.py',
    'multi_gpu': False,
    'ngpus