In [1]:
import sys, time
import ray
from ray.tune import experiment
from ray.air import CheckpointConfig, RunConfig, session

from fran.managers.raytune_manager import *
from fran.utils.helpers import *

from skimage._shared.utils import check_nD
# Make the parent directory importable for the rest of the notebook.
sys.path.append("..")
import os
# Short alias for Ray's pdb entry point (ray.util.pdb.set_trace).
tr2 = ray.util.pdb.set_trace



Please cite the following paper when using nnUNet:

Isensee, F., Jaeger, P.F., Kohl, S.A.A. et al. "nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation." Nat Methods (2020). https://doi.org/10.1038/s41592-020-01008-z


If you have questions or suggestions, feel free to open an issue at https://github.com/MIC-DKFZ/nnUNet

https://app.neptune.ai/drusmanbashir/KiTS19/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [2]:

# Author convenience; delete this cell if you do not need it.
# When is_notebook() reports False, switch the working directory to "nbs".
running_outside_notebook = is_notebook() == False
if running_outside_notebook:
    os.chdir("nbs")

In [3]:
# Load the defaults of the "kits21" project from the common paths file, then read
# the configuration workbook those defaults point to (with raytune=False).
common_paths_filename = "config.yaml"
proj_defaults = load_proj_defaults(common_paths_filename, "kits21")
workbook_path = proj_defaults.configuration_filename
config_dict = load_config_from_workbook(workbook_path, raytune=False)

# If you are new to ray-tune, please see their [Getting started](https://docs.ray.io/en/latest/tune/getting-started.html) page.
# Ray-tune options are set below. Alter them to your preference.

In [4]:
# --- Ray Tune run options: alter these to your preference --------------------
# Stop Tune from raising when a reported result lacks the scheduler metric.
os.environ['TUNE_DISABLE_STRICT_METRIC_CHECKING']="1"
n_trials = 5
raytune = True
profiler = False
n_gpu_per_trial = 1
n_cpu_per_trial = 32/n_gpu_per_trial    # alter this based on your number of cpu cores
# Note: Make sure you have set the CUDA_VISIBLE_DEVICES env variable otherwise ray will not recognize your GPU.
neptune = True  # enables neptune logging.
max_epochs = 2
grace_period = 1
if profiler:
    os.environ["PERFTOOLS_PATH"]="/usr/lib/x86_64-linux-gnu/libprofiler.so"
    os.environ["PERFTOOLS_LOGFILE"]="/tmp/pprof.out"

# --- Search space and monitored metric ---------------------------------------
config = load_config_from_workbook(proj_defaults.configuration_filename, raytune=raytune)
metadata = config['metadata']
# Number of output channels derived from the destination labels; computed once
# and reused for both the monitored metric and the reporter columns.
loss_channels = out_channels_from_dict_or_cell(metadata['dest_labels'])
dice_channel = 1 if loss_channels==2 else 2
# Metric watched by the ASHA scheduler. The per-label dice losses are reported as
# "loss_dice_label<N>" (the same keys used for the reporter columns below). A
# metric name that is never reported gives the scheduler nothing to compare,
# and with strict metric checking disabled no error is raised for it.
monitor = "loss_dice_label"+str(dice_channel)
# most_recent_experiment= metadata['most_recent_experiment'].item()

# --- Experiment name: reuse the stored one when resuming, else draw a new one --
resume = False
most_recent_experiment = "kits21_337"
if resume:
    experiment_name = most_recent_experiment
else:
    # randint requires integer bounds; a float bound (1e+3) is rejected by
    # recent Python versions.
    experiment_name = proj_defaults.project_title +"_"+ str(random.randint(1, 1000))

# --- Debug mode: local Ray instance and a short, fixed schedule --------------
debug_mode = False
if debug_mode:
    ray.init(local_mode=True)
num_samples = 4 if debug_mode else n_trials
max_num_epochs = 3 if debug_mode else max_epochs
# In debug mode the grace period equals the epoch budget.
grace_period = max_num_epochs if debug_mode else grace_period
scheduler = ASHAScheduler(
    metric=monitor,
    mode="min",
    max_t=max_num_epochs,
    grace_period=grace_period,
    reduction_factor=2)

# --- Console reporter: iteration, focal/CE loss and one dice loss per label ---
loss_cols = ["loss_dice_label"+str(indx) for indx in range(1,loss_channels)]
# loss_cols = ["loss_dice_label1"]
loss_cols = ["training_iteration", "loss_ce_focal"] + loss_cols
reporter = CLIReporter(
    metric_columns=loss_cols ,
    parameter_columns = {'dataset_params/stride_random': 'random stride'},
)

In [5]:

# Pre-bind the leading positional arguments of train_with_tune for each mode.
# NOTE(review): the single-GPU variant binds four arguments while the multi-GPU
# variant additionally binds `raytune` in third position — confirm against the
# signature of train_with_tune that both argument lists line up.
single_gpu_args = (False, neptune, max_num_epochs, proj_defaults)
multi_gpu_args = (True, neptune, raytune, max_num_epochs, proj_defaults)
train_single_gpu = partial(train_with_tune, *single_gpu_args)
train_multi_gpu = partial(train_with_tune, *multi_gpu_args)
# Pick the multi-GPU trainable only when more than one GPU is requested per trial.
if n_gpu_per_trial > 1:
    train = train_multi_gpu
else:
    train = train_single_gpu

# Setting up the Ray Tuner: a new tuner for a fresh experiment, or a restored tuner to resume a previous experiment (its name is stored manually in `most_recent_experiment`).

In [6]:


# Directory of this experiment under the checkpoints parent folder.
trial_path = proj_defaults.checkpoints_parent_folder/experiment_name
# Resources requested per trial. The GPU count follows n_gpu_per_trial so it
# stays consistent with the single/multi-GPU trainable chosen above. 24 CPUs
# per trial let two trials run side by side on the 48-CPU node of the recorded run.
trainable_with_resources = tune.with_resources(train, {"cpu": 24, "gpu": n_gpu_per_trial})

# Build only the tuner that will actually be run; the other name stays defined
# (as None) so later cells can still reference both. The `resume == True` test
# mirrors the one used when the experiment is launched.
tuner_new = None
tuner_resume = None
if resume == True:
    # Restoring needs the directory of a previously started experiment.
    tuner_resume = tune.Tuner.restore(
        path=str(trial_path)
    )
else:
    tuner_new = tune.Tuner(
        trainable_with_resources,
        tune_config=tune.TuneConfig(
            num_samples = num_samples,
            scheduler = scheduler,
            # The `trial` argument is ignored; names are derived from the experiment name.
            trial_name_creator=lambda trial: trial_str_creator(experiment_name),
            trial_dirname_creator = trial_dirname_creator,
        ),
        param_space = config,
        run_config = RunConfig(
            name=experiment_name,
            local_dir = proj_defaults.checkpoints_parent_folder,
            checkpoint_config= CheckpointConfig(
                num_to_keep=5,
                checkpoint_at_end=False,
            ),
            progress_reporter=reporter,
        ),
    )

# Running the Ray Tune experiment. Note down the experiment name for reference. It is also stored under the corresponding online Neptune run as metadata/run_name

In [7]:


# Select the tuner that matches the resume flag, then launch it.
if resume == True:
    print("Resuming existing experiment, name: {}".format(experiment_name))
    active_tuner = tuner_resume
else:
    active_tuner = tuner_new
results = active_tuner.fit()

# 

2023-01-17 18:22:47,790	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


[2m[36m(pid=563285)[0m 
[2m[36m(pid=563285)[0m 
[2m[36m(pid=563285)[0m Please cite the following paper when using nnUNet:
[2m[36m(pid=563285)[0m 
[2m[36m(pid=563285)[0m Isensee, F., Jaeger, P.F., Kohl, S.A.A. et al. "nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation." Nat Methods (2020). https://doi.org/10.1038/s41592-020-01008-z
[2m[36m(pid=563285)[0m 
[2m[36m(pid=563285)[0m 
[2m[36m(pid=563285)[0m If you have questions or suggestions, feel free to open an issue at https://github.com/MIC-DKFZ/nnUNet
[2m[36m(pid=563285)[0m 
== Status ==
Current time: 2023-01-17 18:22:49 (running for 00:00:00.17)
Memory usage on this node: 13.9/62.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Resources requested: 24.0/48 CPUs, 1.0/2 GPUs, 0.0/31.51 GiB heap, 0.0/15.76 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /s/checkpoints/kits21/kits21_162
Number of trials: 5/5 (4 PENDING, 1 RUNNIN

[2m[36m(func pid=563285)[0m   nep_run = neptune.init_run(run=None,mode="async",**self.neptune_run_settings)


[2m[36m(pid=563461)[0m https://app.neptune.ai/drusmanbashir/KiTS19/
[2m[36m(func pid=563461)[0m Loading dataset from BBox file /home/ub/datasets/preprocessed/kits21/stage2_patches/64_160_160/bboxes_info
[2m[36m(func pid=563461)[0m Loading dataset from BBox file /home/ub/datasets/preprocessed/kits21/stage2_patches/64_160_160/bboxes_info
[2m[36m(func pid=563461)[0m https://app.neptune.ai/drusmanbashir/KiTS19/
[2m[36m(func pid=563285)[0m https://app.neptune.ai/drusmanbashir/KiTS19/e/KITS-2387


[2m[36m(func pid=563285)[0m   return self.assign(value, wait)


[2m[36m(func pid=563461)[0m No checkpoints on Tune. Initializing..
[2m[36m(func pid=563285)[0m Initializing new run. Run name kits21_162_327 
== Status ==
Current time: 2023-01-17 18:22:58 (running for 00:00:09.24)
Memory usage on this node: 15.4/62.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Resources requested: 48.0/48 CPUs, 2.0/2 GPUs, 0.0/31.51 GiB heap, 0.0/15.76 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /s/checkpoints/kits21/kits21_162
Number of trials: 5/5 (3 PENDING, 2 RUNNING)
+----------------+----------+---------------------+-----------------+
| Trial name     | status   | loc                 | random stride   |
|----------------+----------+---------------------+-----------------|
| kits21_162_327 | RUNNING  | 192.168.0.63:563285 | False           |
| kits21_162_329 | RUNNING  | 192.168.0.63:563461 | False           |
| kits21_162_331 | PENDING  |                     | True            |
| kits21_162_333 | PENDIN

[2m[36m(func pid=563461)[0m   nep_run = neptune.init_run(run=None,mode="async",**self.neptune_run_settings)


[2m[36m(func pid=563461)[0m https://app.neptune.ai/drusmanbashir/KiTS19/e/KITS-2388
Epoch 1/2 : |----------------------------------------| 1.85% [1/54 00:00<00:28]


[2m[36m(func pid=563461)[0m   return self.assign(value, wait)


Epoch 1/2 : |█---------------------------------------| 3.70% [2/54 00:00<00:24 0.7411]
[2m[36m(func pid=563461)[0m Initializing new run. Run name kits21_162_329 
Epoch 1/2 : |██--------------------------------------| 5.56% [3/54 00:01<00:22 0.7399]
Epoch 1/2 : |██--------------------------------------| 7.41% [4/54 00:01<00:21 0.6732]
Epoch 1/2 : |███-------------------------------------| 9.26% [5/54 00:02<00:22 0.6218]
== Status ==
Current time: 2023-01-17 18:23:03 (running for 00:00:14.24)
Memory usage on this node: 20.2/62.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Resources requested: 48.0/48 CPUs, 2.0/2 GPUs, 0.0/31.51 GiB heap, 0.0/15.76 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /s/checkpoints/kits21/kits21_162
Number of trials: 5/5 (3 PENDING, 2 RUNNING)
+----------------+----------+---------------------+-----------------+
| Trial name     | status   | loc                 | random stride   |
|----------------+---------

Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,loss,loss_ce_focal,loss_dice,loss_dice_label1,loss_dice_label2,loss_dice_label3,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
kits21_162_327,2023-01-17_18-24-10,True,,636c1f52c9f248a0b1a1eb74c8b368bb,"0_0=0.8208,1=1.6127,1=0.1623,0=0.4023,1=1.5693,1=0.3327,0=-0.7508,1=0.7600,1=0.4159,stride_random=False,base_ch_opts=16,deep_supervision=False,lr=0.0035",zen,2,0.117164,0.520817,-0.286489,-0.596494,-0.224746,-0.0382262,192.168.0.63,563285,77.1088,37.2422,77.1088,1673979850,0,,2,f29df_00000,0.00214434
kits21_162_329,2023-01-17_18-24-14,True,,8517274fabd048a6915f99bd06a89fb9,"1_0=0.9547,1=1.7595,1=0.2284,0=0.2711,1=1.7794,1=0.1492,0=-0.9756,1=0.7258,1=0.9663,stride_random=False,base_ch_opts=32,deep_supervision=False,lr=0.0100",zen,2,0.127463,0.547977,-0.293052,-0.499109,-0.346449,-0.0335971,192.168.0.63,563461,77.3805,35.307,77.3805,1673979854,0,,2,f29df_00001,0.00210381
kits21_162_331,2023-01-17_18-25-27,True,,636c1f52c9f248a0b1a1eb74c8b368bb,"2_0=0.6264,1=1.6404,1=0.8658,0=0.2935,1=1.6498,1=0.9770,0=-0.6378,1=0.6059,1=0.2484,stride_random=True,base_ch_opts=32,deep_supervision=True,lr=0.0025",zen,2,0.11596,0.529504,-0.297584,-0.60311,-0.281443,-0.00819869,192.168.0.63,563285,71.2425,35.6964,71.2425,1673979927,0,,2,f29df_00002,0.00214434
kits21_162_333,2023-01-17_18-25-26,True,,8517274fabd048a6915f99bd06a89fb9,"3_0=0.6329,1=1.6175,1=0.9620,0=0.3386,1=1.5884,1=0.6512,0=-0.6391,1=0.7441,1=0.5638,stride_random=True,base_ch_opts=32,deep_supervision=True,lr=0.0055",zen,2,0.0890995,0.479782,-0.301583,-0.489953,-0.377812,-0.0369825,192.168.0.63,563461,66.4962,32.9996,66.4962,1673979926,0,,2,f29df_00003,0.00210381
kits21_162_335,2023-01-17_18-26-45,True,,8517274fabd048a6915f99bd06a89fb9,"4_0=0.9754,1=1.5360,1=0.7411,0=0.4758,1=1.6444,1=0.9664,0=-0.5670,1=0.7777,1=0.1080,stride_random=True,base_ch_opts=16,deep_supervision=True,lr=0.0025",zen,2,0.0722236,0.474749,-0.330302,-0.644837,-0.34016,-0.00590859,192.168.0.63,563461,73.4169,37.5028,73.4169,1673980005,0,,2,f29df_00004,0.00210381


[2m[36m(func pid=563285)[0m Better model found at epoch 0 with valid_loss value: 0.07176317274570465.
Epoch 1/2 : |██████████████████████████████████------| 87.04% [47/54 00:28<00:04 0.1470]


[2m[36m(func pid=563285)[0m   value = StringVal(value)


Epoch 1/2 : |███████████████████████████████████-----| 88.89% [48/54 00:29<00:03 0.1511]
Epoch 1/2 : |████████████████████████████████████----| 90.74% [49/54 00:29<00:03 0.1491]
Epoch 1/2 : |█████████████████████████████████████---| 92.59% [50/54 00:30<00:02 0.1457]
Epoch 2/2 : |----------------------------------------| 0.00% [0/54 00:00<?]
Epoch 1/2 : |█████████████████████████████████████---| 94.44% [51/54 00:31<00:01 0.1435]
Epoch 1/2 : |██████████████████████████████████████--| 96.30% [52/54 00:31<00:01 0.1395]
Epoch 2/2 : |----------------------------------------| 1.85% [1/54 00:01<00:53]
Epoch 1/2 : |███████████████████████████████████████-| 98.15% [53/54 00:31<00:00 0.1359]
Epoch 1/2 : |----------------------------------------| 0.00% [0/14 00:00<?]              
Epoch 1/2 : |██--------------------------------------| 7.14% [1/14 00:00<00:01]
Epoch 1/2 : |█████-----------------------------------| 14.29% [2/14 00:00<00:01 0.1320]
Epoch 2/2 : |█--------------------------------------

[2m[36m(func pid=563461)[0m   value = StringVal(value)


Epoch 2/2 : |█████-----------------------------------| 12.96% [7/54 00:04<00:31 0.1376]
Epoch 2/2 : |█████-----------------------------------| 14.81% [8/54 00:05<00:32 0.1340]
Epoch 2/2 : |██████----------------------------------| 16.67% [9/54 00:06<00:30 0.1311]
Epoch 2/2 : |----------------------------------------| 0.00% [0/54 00:00<?]
Epoch 2/2 : |███████---------------------------------| 18.52% [10/54 00:06<00:28 0.1259]
Epoch 2/2 : |----------------------------------------| 1.85% [1/54 00:00<00:20]
Epoch 2/2 : |████████--------------------------------| 20.37% [11/54 00:07<00:28 0.1214]
Epoch 2/2 : |█---------------------------------------| 3.70% [2/54 00:01<00:34 0.1288]
Epoch 2/2 : |████████--------------------------------| 22.22% [12/54 00:07<00:26 0.1188]
Epoch 2/2 : |█████████-------------------------------| 24.07% [13/54 00:08<00:25 0.1144]
Epoch 2/2 : |██--------------------------------------| 5.56% [3/54 00:02<00:35 0.1316]
Epoch 2/2 : |██████████---------------------------

2023-01-17 18:26:51,445	INFO tune.py:762 -- Total run time: 242.13 seconds (241.91 seconds for the tuning loop).


# Examining the results: 
### Check out https://docs.ray.io/en/latest/tune/examples/tune_analyze_results.html for more info.

In [8]:
# Display the object returned by the tuner's fit() call.
results

<ray.tune.result_grid.ResultGrid at 0x7f00ed38cd00>

In [10]:
# Count the entries in `results` (one per trial) and print the total.
num_results = len(results)
print("Number of results:", num_results)

Number of results: 5


In [13]:
# Walk over every trial result: print its error if it has one, otherwise print
# the last reported dice loss for label 2 (tumour).
for i, result in enumerate(results):
    if result.error:
        print(f"Trial #{i} had an error:", result.error)
        continue

    # The value is a dice loss, not an accuracy, so the message says so.
    print(
        f"Trial #{i} finished successfully with a tumour dice loss (loss_dice_label2) of:",
        result.metrics["loss_dice_label2"]   # tumour dice
    )

Trial #0 finished successfully with a mean accuracy metric of: tensor(-0.2247)
Trial #1 finished successfully with a mean accuracy metric of: tensor(-0.3464)
Trial #2 finished successfully with a mean accuracy metric of: tensor(-0.2814)
Trial #3 finished successfully with a mean accuracy metric of: tensor(-0.3778)
Trial #4 finished successfully with a mean accuracy metric of: tensor(-0.3402)
