In [23]:
# Show the GPU model, driver/CUDA versions and current memory usage before
# launching any training job from this notebook.
!nvidia-smi

Sun Jul  2 08:58:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:0B:00.0 Off |                    0 |
| N/A   31C    P0    47W / 163W |   7720MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Setup environment

In [24]:
# Keep imported project modules in sync with their source files: with
# autoreload mode 2, edits to .py modules are picked up without restarting
# the kernel.
# Re-running this cell in a live kernel only prints an "already loaded"
# notice (see the recorded output); that notice is harmless.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Setup config

In [27]:
import os

workspace_dir = '/nfs/Workspace/CardiacSeg'
model_name = 'unetcnx_a1'
data_name = 'chgh'
sub_data_dir_name = 'dataset_2'
exp_name = 't_5'
data_dict_file_name = 'exp_b7_9.json'

tune_mode = 'train'

# set exp dir
root_exp_dir = os.path.join(
    workspace_dir, 
    'exps',
    'exps',
    model_name,
    data_name,
    'tune_results'
)

# set data dir
root_data_dir = os.path.join(
    workspace_dir, 
    'dataset',
    data_name
)
data_dir = os.path.join(root_data_dir, sub_data_dir_name)

# data dict json path
data_dicts_json = os.path.join(workspace_dir, 'exps', 'data_dicts', data_name, data_dict_file_name)

# set model, log, eval dir
model_dir = os.path.join('./', 'models')
log_dir = os.path.join('./', 'logs')
eval_dir = os.path.join('./', 'evals')

# model path
best_checkpoint = os.path.join(model_dir, 'best_model.pth')
final_checkpoint = os.path.join(model_dir, 'final_model.pth')

# mkdir root exp dir
os.makedirs(root_exp_dir, exist_ok=True)

# for pretrain
pretrain_exp_name = 'exp_50'
pretrain_data_name = 'image_cas'
pretrain_model_dir = os.path.join(
    workspace_dir,
    'exps',
    'exps',
    model_name,
    pretrain_data_name,
    'pretrain',
    pretrain_exp_name,
    'models'
)
pretrain_checkpoint = os.path.join(pretrain_model_dir, 'model_bestValRMSE.pt')

%cd {root_exp_dir}/../

/nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/chgh


## Train UNETCNX

In [28]:
# training
!PYTHONPATH={workspace_dir} /opt/conda/bin/python {workspace_dir}/expers/tune.py \
--tune_mode={tune_mode} \
--exp_name={exp_name} \
--data_name={data_name} \
--data_dir={data_dir} \
--root_exp_dir={root_exp_dir} \
--model_name={model_name}\
--model_dir={model_dir} \
--log_dir={log_dir} \
--eval_dir={eval_dir} \
--start_epoch=0 \
--val_every=20 \
--max_early_stop_count=20 \
--max_epoch=8000  \
--data_dicts_json={data_dicts_json} \
--pin_memory \
--out_channels=2 \
--patch_size=4 \
--feature_size=48 \
--drop_rate=0.1 \
--depths 3 3 9 3 \
--kernel_size 7 \
--exp_rate 4 \
--norm_name='layer' \
--a_min=-42 \
--a_max=423 \
--space_x=0.7 \
--space_y=0.7 \
--space_z=1.0 \
--roi_x=128 \
--roi_y=128 \
--roi_z=128 \
--optim='AdamW' \
--lr=7e-4 \
--weight_decay=5e-4 \
--checkpoint={final_checkpoint} \
--use_init_weights \
--infer_post_process \
--deep_sup \
# --resume_tuner \
# --test_mode \
# --save_eval_csv \

test mode
resume tuner form /nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/chgh/tune_results
run test mode ...
cuda is available
model: unetcnx_a1
patch size: 4
ker size: 7
exp rate: 4
feature sizes: [48, 96, 192, 384]
depths: [3, 3, 9, 3]
drop rate: 0.1
use init weights: True
is conv stem: False
use init weights
use deep sup
loss: dice ce loss
optimzer: AdamW
{'lr': 0.0007, 'weight_decay': 0.0005}
=> loaded checkpoint '/nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/chgh/tune_results/t_5/main_d92fb_00000_0_exp=exp_t_5_2023-05-30_02-36-58/models/best_model.pth' (epoch 521) (bestacc 0.8913398385047913) (early stop count 0)
load json from /nfs/Workspace/CardiacSeg/exps/data_dicts/chgh/exp_b7_9.json
train files (9): ['pid_08', 'pid_27', 'pid_30', 'pid_52', 'pid_56', 'pid_57', 'pid_110', 'pid_1002', 'pid_1003']
val files (3): ['pid_08_1', 'pid_107', 'pid_108']
test files (3): ['pid_02', 'pid_106', 'pid_1000']
infer data: {'image': '/nfs/Workspace/CardiacSeg/dataset/chgh/dataset_2/pid_02/pid

## Train other models

In [15]:
# training
!PYTHONPATH={workspace_dir} /opt/conda/bin/python {workspace_dir}/expers/tune.py \
--tune_mode={tune_mode} \
--exp_name={exp_name} \
--data_name={data_name} \
--data_dir={data_dir} \
--root_exp_dir={root_exp_dir} \
--model_name={model_name}\
--model_dir={model_dir} \
--log_dir={log_dir} \
--eval_dir={eval_dir} \
--start_epoch=0 \
--val_every=20 \
--max_early_stop_count=20 \
--max_epoch=8000  \
--data_dicts_json={data_dicts_json} \
--pin_memory \
--out_channels=2 \
--patch_size=4 \
--feature_size=48 \
--drop_rate=0.1 \
--depths 3 3 9 3 \
--kernel_size 7 \
--exp_rate 4 \
--norm_name='layer' \
--a_min=-42 \
--a_max=423 \
--space_x=0.7 \
--space_y=0.7 \
--space_z=1.0 \
--roi_x=128 \
--roi_y=128 \
--roi_z=128 \
--optim='AdamW' \
--lr=5e-4 \
--weight_decay=5e-4 \
--checkpoint={final_checkpoint} \
--use_init_weights \
--infer_post_process \
# --resume_tuner \
# --save_eval_csv \
# --test_mode \

resume tuner form /nfs/Workspace/CardiacSeg/exps/exps/swinunetr/chgh/tune_results
2023-07-02 08:04:28,844	INFO worker.py:1625 -- Started a local Ray instance.
2023-07-02 08:04:29,645	INFO tune.py:218 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.
The `local_dir` argument of `Experiment is deprecated. Use `storage_path` or set the `TUNE_RESULT_DIR` environment variable instead.
2023-07-02 08:04:29,660	INFO tensorboardx.py:172 -- pip install "ray[tune]" to see TensorBoard files.
2023-07-02 08:04:29,665	INFO experiment_state.py:392 -- A local experiment checkpoint was found and will be used to restore the previous experiment state.
2023-07-02 08:04:29,666	INFO trial_runner.py:386 -- Using the newest experiment state file found within the experiment directory: experiment_state-2023-04-10_06-53-58.json
== Status ==
Current time: 2023-07-02 08:04:29 (running for 00:00:00.07)
Using FIFO scheduling algorithm.
Logical

## Analysis

In [42]:
!PYTHONPATH=/nfs/Workspace/CardiacSeg /opt/conda/bin/python /nfs/Workspace/CardiacSeg/expers/tune_anal.py \
--exp_name={exp_name} \
--local_dir={root_exp_dir}

Loading results from /nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/chgh/tune_results/exp_b7_9_ds1_fh_tn1...
2023-05-28 09:29:00,909 - No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
Trial 6fc08_00000:  {'optim': {'lr': 0.0005, 'weight_decay': 0.0003}} 0.8928633 0.88013774
Trial 6fc08_00001:  {'optim': {'lr': 0.0005, 'weight_decay': 0.0007}} 0.8774688 0.8682186

Best trial 6fc08_00000: 
config: {'optim': {'lr': 0.0005, 'weight_decay': 0.0003}}
tt_dice: 0.8928633
tt_hd95: 4.149112902449147
inf_dice: 0.88013774
inf_hd95: 8.449605779201455
best log dir: /nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/chgh/tune_results/exp_b7_9_ds1_fh_tn1/main_6fc08_00000_0_optim=lr_0_0005_weight_decay_0_0003_2023-05-27_14-19-05
final early stop count: 20
final epoch: 1080
best val dice: 0.8920990824699402
[0m