# WML-A Job Submission via WML-A CLI

Official examples can be found here: https://wmla-console-cpd-wmla.apps.cpd.mskcc.org/ui/#/cliTools

In [1]:
import os
import wmla_utils as wmla_util

In [2]:
# WML-A framework for this job; the same value is reused later in the notebook
# as the 'exec-start' option and when preparing the submission folder.
wmla_framework = 'distPyTorch'

# Bare last expression so the notebook displays the result: the helper prints a
# description of each training option and returns an example options dict.
wmla_util.get_options_training(wmla_framework)

exec-start:        the wmla framework to use, such as PyTorch, distPyTorch, etc.
model-dir:         a local folder to be submitted, with model training scripts
model-main:        the main file to execute for training
cs-datastore-meta: location of data, this usually refers to the folder in WMLA's data volume; you can check the content or interact with it from WMLA's notebook server, under the 'data' folder
workerDeviceNum:   number of GPU devices in one worker pod
workerMemory:      memory of worker pod, increase the value if needed
msd-env:           custom environment variables & values; when used in cli, the command looks like this: 
                   python dlicmd.py --msd-env varA=1 --msd-env varB=mytoken ...
numWorker:         number of worker pods to use; in the context of distributed deep learning, this is similar to the number of processes to open
* PyTorch's doc recommends to use 1 GPU per process in DistributedDataParallel. In WMLA, this is equivalent to workerDeviceNum = 1

{'exec-start': 'distPyTorch',
 'model-dir': '/userfs/job_submission',
 'model-main': 'train_command.py',
 'cs-datastore-meta': 'type=fs,data_path=My_Datasets/',
 'workerDeviceNum': 1,
 'workerMemory': '8g',
 'msd-env': ['varA=1', 'varB=mytoken'],
 'numWorker': 2}

In [3]:
# Display name of the storage volume; forwarded to the job via 'msd-env' below.
volume_display_name = 'DeepLIIFData'
# File handed to the submission-folder preparation step as `file_training`;
# use None if your main training file and execution file are the same one.
file_training = 'cli.py'

# Options for the WML-A CLI submission. Keys mirror the CLI flags described by
# wmla_util.get_options_training().
options = {'exec-start': wmla_framework,
           'numWorker': 2,
           'workerDeviceNum': 1,
           'workerMemory': '8g',
           'model-dir': '/userfs/job_submission',
           'model-main': 'train_command.py',
           'cs-datastore-meta': 'type=fs,data_path=DeepLIIF_Datasets/',
           # Custom environment variables for the job, one 'NAME=value' string each.
           # The access token is read from this notebook's environment (raises
           # KeyError if unset) so it is never hard-coded in the notebook source.
           'msd-env': [f"USER_ACCESS_TOKEN={os.environ['USER_ACCESS_TOKEN']}",
                       # Plain string: no placeholders, so no f-prefix is needed.
                       "BASE_URL=https://cpd-cpd.apps.cpd.mskcc.org",
                       f"VOLUME_DISPLAY_NAME={volume_display_name}"]}

# Files to copy into the submission folder. The '*' pattern also matches
# sub-directories of the repo; those are skipped by the copy step with a
# "cp: -r not specified" warning, so any directory that is actually needed
# must be listed in `paths_folder` instead.
paths_file = ["/userfs/deepliif-repo/*",
              "/userfs/train_command.py",
              "/userfs/monitor_gpu.sh",
              "/userfs/storage_volume_utils.py",
              "/userfs/cpd_utils.py"]

# Directories to copy into the submission folder.
paths_folder = ["/userfs/deepliif-repo/deepliif"]

In [4]:
# Export the job settings as environment variables of the notebook process.
# NOTE(review): presumably these are read by wmla_utils and/or the CLI script;
# confirm against those modules.
os.environ.update({
    'DIR_job_submission': options['model-dir'],   # local submission folder
    'file_exec': options['model-main'],           # main file executed by the job
    'VOLUME_DISPLAY_NAME': volume_display_name,
    'HOST': 'wmla-console-cpd-wmla.apps.cpd.mskcc.org',
})

In [5]:
# Stage the job: copy the listed files and folders into the submission
# directory, then patch the training file for the selected WML-A framework.
wmla_util.prepare_submission_folder_training(
    paths_file,
    paths_folder,
    wmla_framework=wmla_framework,
    file_training=file_training,
)

Copying files and/or folders to /userfs/job_submission...
cp: -r not specified; omitting directory '/userfs/deepliif-repo/checkpoints'
cp: -r not specified; omitting directory '/userfs/deepliif-repo/Datasets'
cp: -r not specified; omitting directory '/userfs/deepliif-repo/deepliif'
cp: -r not specified; omitting directory '/userfs/deepliif-repo/DeepLIIF_Statistics'
cp: -r not specified; omitting directory '/userfs/deepliif-repo/docs'
cp: -r not specified; omitting directory '/userfs/deepliif-repo/images'
cp: -r not specified; omitting directory '/userfs/deepliif-repo/model-server'
cp: -r not specified; omitting directory '/userfs/deepliif-repo/Registration'
cp: -r not specified; omitting directory '/userfs/deepliif-repo/Sample_Large_Tissues'
cp: -r not specified; omitting directory '/userfs/deepliif-repo/Scripts'

Patching /userfs/job_submission/cli.py for wmla framework distPyTorch...
Done


In [6]:
# Submit the training job with the options built earlier, using the WML-A CLI
# script at the given path; the output reports the assigned execId / appId.
wmla_util.submit_training(
    options,
    path_cli='wmla-utils/dlicmd.py',
)

Copying files and directories ...
Content size: 110.8K
{
  "execId": "cpd-wmla-14",
  "appId": "cpd-wmla-14"
}

