In [1]:
# Print the GPU model, driver/CUDA versions and current GPU memory usage.
!nvidia-smi

Mon Jul  3 09:14:27 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:06:00.0 Off |                  N/A |
| 23%   31C    P8     8W / 250W |      0MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Setup environment

In [2]:
# Load IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Setup config

In [9]:
import os

# --- Experiment identity (edit these to switch model / dataset / run) ---
workspace_dir = '/nfs/Workspace/CardiacSeg'
model_name = 'unetcnx_a1'
data_name = 'chgh'
sub_data_dir_name = 'dataset_2'
exp_name = 't_5'
data_dict_file_name = 'exp_b7_9.json'

# Value forwarded to tune.py through --tune_mode.
tune_mode = 'train'

# --- Derived locations ---
# Tuning results: <workspace>/exps/exps/<model>/<data>/tune_results
root_exp_dir = os.path.join(
    workspace_dir, 'exps', 'exps', model_name, data_name, 'tune_results'
)

# Dataset root, and the sub-dataset used by this run.
root_data_dir = os.path.join(workspace_dir, 'dataset', data_name)
data_dir = os.path.join(root_data_dir, sub_data_dir_name)

# JSON file forwarded to tune.py through --data_dicts_json.
data_dicts_json = os.path.join(
    workspace_dir, 'exps', 'data_dicts', data_name, data_dict_file_name
)

# Output folders for models, logs and evaluations, kept as './'-relative paths.
model_dir, log_dir, eval_dir = (
    os.path.join('./', leaf) for leaf in ('models', 'logs', 'evals')
)

# Checkpoint files inside model_dir.
best_checkpoint = os.path.join(model_dir, 'best_model.pth')
final_checkpoint = os.path.join(model_dir, 'final_model.pth')

# Make sure the tuning-results directory exists (no error if it already does).
os.makedirs(root_exp_dir, exist_ok=True)

# Pre-training experiment:
#   <workspace>/exps/exps/<model>/<pretrain data>/pretrain/<pretrain exp>/models
pretrain_exp_name, pretrain_data_name = 'exp_50', 'image_cas'
pretrain_model_dir = os.path.join(
    workspace_dir, 'exps', 'exps', model_name,
    pretrain_data_name, 'pretrain', pretrain_exp_name, 'models',
)
pretrain_checkpoint = os.path.join(pretrain_model_dir, 'model_bestValRMSE.pt')

# Switch the kernel's working directory to the parent of root_exp_dir
# (the <model>/<data> experiment folder).
# NOTE(review): presumably so the './'-relative model/log/eval dirs resolve
# there — confirm against how tune.py combines them.
%cd {root_exp_dir}/../

/nfs/Workspace/CardiacSeg/exps/exps/unet3d/chgh


## Train UNETCNX

In [4]:
# training
!PYTHONPATH={workspace_dir} /opt/conda/bin/python {workspace_dir}/expers/tune.py \
--tune_mode={tune_mode} \
--exp_name={exp_name} \
--data_name={data_name} \
--data_dir={data_dir} \
--root_exp_dir={root_exp_dir} \
--model_name={model_name}\
--model_dir={model_dir} \
--log_dir={log_dir} \
--eval_dir={eval_dir} \
--start_epoch=0 \
--val_every=20 \
--max_early_stop_count=20 \
--max_epoch=8000  \
--data_dicts_json={data_dicts_json} \
--pin_memory \
--out_channels=2 \
--patch_size=4 \
--feature_size=48 \
--drop_rate=0.1 \
--depths 3 3 9 3 \
--kernel_size 7 \
--exp_rate 4 \
--norm_name='layer' \
--a_min=-42 \
--a_max=423 \
--space_x=0.7 \
--space_y=0.7 \
--space_z=1.0 \
--roi_x=128 \
--roi_y=128 \
--roi_z=128 \
--optim='AdamW' \
--lr=7e-4 \
--weight_decay=5e-4 \
--checkpoint={final_checkpoint} \
--use_init_weights \
--infer_post_process \
--deep_sup \
# --resume_tuner \
# --test_mode \
# --save_eval_csv \

resume tuner form /nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/chgh/tune_results
2023-07-03 09:14:48,667	INFO worker.py:1625 -- Started a local Ray instance.
2023-07-03 09:14:49,720	INFO tune.py:218 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.
The `local_dir` argument of `Experiment is deprecated. Use `storage_path` or set the `TUNE_RESULT_DIR` environment variable instead.
2023-07-03 09:14:49,738	INFO tensorboardx.py:172 -- pip install "ray[tune]" to see TensorBoard files.
2023-07-03 09:14:49,743	INFO experiment_state.py:392 -- A local experiment checkpoint was found and will be used to restore the previous experiment state.
2023-07-03 09:14:49,744	INFO trial_runner.py:386 -- Using the newest experiment state file found within the experiment directory: experiment_state-2023-05-30_15-33-09.json
== Status ==
Current time: 2023-07-03 09:14:49 (running for 00:00:00.06)
Using FIFO scheduling algorithm.
Logica

## Train other models

In [15]:
# training
!PYTHONPATH={workspace_dir} /opt/conda/bin/python {workspace_dir}/expers/tune.py \
--tune_mode={tune_mode} \
--exp_name={exp_name} \
--data_name={data_name} \
--data_dir={data_dir} \
--root_exp_dir={root_exp_dir} \
--model_name={model_name}\
--model_dir={model_dir} \
--log_dir={log_dir} \
--eval_dir={eval_dir} \
--start_epoch=0 \
--val_every=20 \
--max_early_stop_count=20 \
--max_epoch=8000  \
--data_dicts_json={data_dicts_json} \
--pin_memory \
--out_channels=2 \
--patch_size=4 \
--feature_size=48 \
--drop_rate=0.1 \
--depths 3 3 9 3 \
--kernel_size 7 \
--exp_rate 4 \
--norm_name='layer' \
--a_min=-42 \
--a_max=423 \
--space_x=0.7 \
--space_y=0.7 \
--space_z=1.0 \
--roi_x=128 \
--roi_y=128 \
--roi_z=128 \
--optim='AdamW' \
--lr=5e-4 \
--weight_decay=5e-4 \
--checkpoint={final_checkpoint} \
--use_init_weights \
--infer_post_process \
# --resume_tuner \
# --save_eval_csv \
# --test_mode \

resume tuner form /nfs/Workspace/CardiacSeg/exps/exps/swinunetr/chgh/tune_results
2023-07-02 08:04:28,844	INFO worker.py:1625 -- Started a local Ray instance.
2023-07-02 08:04:29,645	INFO tune.py:218 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.
The `local_dir` argument of `Experiment is deprecated. Use `storage_path` or set the `TUNE_RESULT_DIR` environment variable instead.
2023-07-02 08:04:29,660	INFO tensorboardx.py:172 -- pip install "ray[tune]" to see TensorBoard files.
2023-07-02 08:04:29,665	INFO experiment_state.py:392 -- A local experiment checkpoint was found and will be used to restore the previous experiment state.
2023-07-02 08:04:29,666	INFO trial_runner.py:386 -- Using the newest experiment state file found within the experiment directory: experiment_state-2023-04-10_06-53-58.json
== Status ==
Current time: 2023-07-02 08:04:29 (running for 00:00:00.07)
Using FIFO scheduling algorithm.
Logical

## Analysis

In [42]:
!PYTHONPATH=/nfs/Workspace/CardiacSeg /opt/conda/bin/python /nfs/Workspace/CardiacSeg/expers/tune_anal.py \
--exp_name={exp_name} \
--local_dir={root_exp_dir}

Loading results from /nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/chgh/tune_results/exp_b7_9_ds1_fh_tn1...
2023-05-28 09:29:00,909 - No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
Trial 6fc08_00000:  {'optim': {'lr': 0.0005, 'weight_decay': 0.0003}} 0.8928633 0.88013774
Trial 6fc08_00001:  {'optim': {'lr': 0.0005, 'weight_decay': 0.0007}} 0.8774688 0.8682186

Best trial 6fc08_00000: 
config: {'optim': {'lr': 0.0005, 'weight_decay': 0.0003}}
tt_dice: 0.8928633
tt_hd95: 4.149112902449147
inf_dice: 0.88013774
inf_hd95: 8.449605779201455
best log dir: /nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/chgh/tune_results/exp_b7_9_ds1_fh_tn1/main_6fc08_00000_0_optim=lr_0_0005_weight_decay_0_0003_2023-05-27_14-19-05
final early stop count: 20
final epoch: 1080
best val dice: 0.8920990824699402
[0m