다음을 리뷰 :
https://github.com/Project-MONAI/tutorials/blob/master/3d_segmentation/ignite/unet_training_array.py

In [1]:
import os
import sys
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
import nibabel as nib    # nifti 포맷 파일 생성때만 이용

import torch
from torch.utils.data import DataLoader
import monai
## decollate_batch : 배치 텐서를 리스트의 텐서로 변환
from monai.data import ImageDataset, create_test_image_3d, decollate_batch
from monai.transforms import (
    Activations, 
    AddChannel, 
    AsDiscrete, 
    Compose, 
    RandRotate90,
    RandSpatialCrop, 
    Resize,
    ScaleIntensity, 
    EnsureType
)

# 삭제
# from monai.metrics import DiceMetric
# from monai.inferers import sliding_window_inference
# from monai.visualize import plot_2d_or_3d_image
# from torch.utils.tensorboard import SummaryWriter


## 새롭게 추가

# Events: names the points in the run loop where handlers fire, e.g. Events.EPOCH_COMPLETED
from ignite.engine import Events, create_supervised_evaluator, create_supervised_trainer
# ModelCheckpoint : training 동안 모델 계속 저장
from ignite.handlers import EarlyStopping, ModelCheckpoint
from monai.handlers import (
    MeanDice,          # val_metric {}를 정의시 넣음
    StatsHandler,      # 각 epoch마다 loss, metric 출력
    TensorBoardImageHandler,
    TensorBoardStatsHandler,    # 각 epoch마다 loss, metric plot
    stopping_fn_from_metric,    # ignite EarlyStopping 과 연결, metric기준 stopping
)



import torch.nn as nn

import logging

In [2]:
# Route log records at INFO level and above to stdout so they show up in the cell output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [3]:
# Restrict this process to GPUs 5 and 6. The variables are set before any other
# library call in this cell so they are in place before CUDA can be initialised.
# PCI_BUS_ID makes the indices follow the PCI bus ordering (the one nvidia-smi shows),
# matching what the later training cell in this notebook does.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '5,6'

tempdir = './dataset'
# The loop below writes into tempdir without creating it, so make sure it exists.
os.makedirs(tempdir, exist_ok=True)
monai.config.print_config()

## Generate 40 random image / mask pairs in the directory
print(f"generating synthetic data to {tempdir} (this may take a while)")
for i in range(40):
    # Synthetic numpy volume and its mask; the author's earlier check printed
    # shape (128, 128, 128) for both arrays.
    im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1)

    # Identity affine: the synthetic data has no real-world orientation.
    n = nib.Nifti1Image(im, np.eye(4))
    nib.save(n, os.path.join(tempdir, f"img{i:d}.nii.gz"))

    n = nib.Nifti1Image(seg, np.eye(4))
    nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz"))

## Collect the file names
# Match exactly the "img<N>" files written above. A looser "im*" pattern would also
# pick up the "im<N>" files that the later cell writes into the same directory, which
# would leave `images` and `segs` with different lengths.
images = sorted(glob(os.path.join(tempdir, "img*.nii.gz")))    # list of the 40 NIfTI images
segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz")))

## Transforms
# Training image pipeline: intensity scaling, channel axis, fixed-size 96^3 random crop.
# (A RandRotate90(prob=0.5, spatial_axes=(0, 2)) augmentation is currently disabled.)
train_imtrans = Compose([
    ScaleIntensity(),
    AddChannel(),
    RandSpatialCrop(roi_size=(96, 96, 96), random_size=False),
    EnsureType(),
])
# Training mask pipeline: same steps minus intensity scaling (author's note: probably
# not needed because the mask is 0/1).
# NOTE(review): the author observed that the mask receives the same random crop as
# the image; presumably ImageDataset seeds both pipelines identically - confirm.
train_segtrans = Compose([
    AddChannel(),
    RandSpatialCrop(roi_size=(96, 96, 96), random_size=False),
    EnsureType(),
])
# Validation pipelines: deterministic resize to 96^3 instead of a random crop.
val_imtrans = Compose(
    [
        ScaleIntensity(),
        AddChannel(),
        Resize(spatial_size=(96, 96, 96)),
        EnsureType(),
    ]
)
val_segtrans = Compose(
    [
        AddChannel(),
        Resize(spatial_size=(96, 96, 96)),
        EnsureType(),
    ]
)

## Sanity check: load one batch from the full dataset and print its shapes
check_ds = ImageDataset(images, segs, transform=train_imtrans, seg_transform=train_segtrans)
check_loader = DataLoader(check_ds, batch_size=2, num_workers=2)
im, seg = monai.utils.misc.first(check_loader)
print(im.shape, seg.shape)

## Training loader: first 20 pairs, shuffled
train_ds = ImageDataset(
    images[:20],
    segs[:20],
    transform=train_imtrans,
    seg_transform=train_segtrans,
)
train_loader = DataLoader(
    train_ds,
    batch_size=4,
    shuffle=True,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

## Validation loader: last 20 pairs, in order
val_ds = ImageDataset(
    images[-20:],
    segs[-20:],
    transform=val_imtrans,
    seg_transform=val_segtrans,
)
val_loader = DataLoader(
    val_ds,
    batch_size=2,
    num_workers=2,
    pin_memory=torch.cuda.is_available(),
)

MONAI version: 0.9.dev2152
Numpy version: 1.21.2
Pytorch version: 1.8.1+cu102
MONAI flags: HAS_EXT = False, USE_COMPILED = False
MONAI rev id: c5bd8aff8ba461d7b349eb92427d452481a7eb72

Optional dependencies:
Pytorch Ignite version: 0.4.6
Nibabel version: 3.2.1
scikit-image version: 0.18.3
Pillow version: 8.4.0
Tensorboard version: 2.6.0
gdown version: 4.2.0
TorchVision version: 0.11.0a0
tqdm version: 4.62.3
lmdb version: 1.2.1
psutil version: 5.8.0
pandas version: 1.3.4
einops version: 0.3.2
transformers version: 4.12.5
mlflow version: 1.21.0

For details about installing the optional dependencies, please visit:
    https://docs.monai.io/en/latest/installation.html#installing-the-recommended-dependencies

generating synthetic data to ./dataset (this may take a while)
torch.Size([2, 1, 96, 96, 96]) torch.Size([2, 1, 96, 96, 96])


### ~~(post-proc)~~, ~~(metric)~~, model 정의

차이
* validation용 metric 정의와 post-processing이 아래의 Ignite validation 설정 단계로 옮겨짐
* post-processing 과정이 하나 더 생김 (label에 적용하는 것인데, 없어도 될 듯)

In [4]:
# Use the default CUDA device when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 3D U-Net with one input channel and one output channel, moved to `device`.
net = monai.networks.nets.UNet(
    spatial_dims=3,
    in_channels=1,
    out_channels=1,
    channels=(16, 32, 64, 128, 256),
    strides=(2, 2, 2, 2),
    num_res_units=2,
)
net = net.to(device)

# NOTE: a multi-GPU variant (wrapping the same network in nn.DataParallel with
# device_ids=[4, 5, 6, 7]) was tried by the author and is left disabled.

# Dice loss with sigmoid=True, optimised with Adam at lr=1e-3.
loss = monai.losses.DiceLoss(sigmoid=True)
opt = torch.optim.Adam(net.parameters(), lr=1e-3)


A100-SXM4-40GB with CUDA capability sm_80 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the A100-SXM4-40GB GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



### Ignite - training 관련 정의

In [5]:
# The Ignite trainer expects batch=(img, seg) and returns output=loss at every iteration;
# an output_transform could be supplied to return other values (y_pred, y, ...).
trainer = create_supervised_trainer(
    model=net,
    optimizer=opt,
    loss_fn=loss,
    device=device,
    non_blocking=False,
)

###### Checkpointing: save the model during training
# n_saved=5: at most 5 checkpoints are kept; per the author's note, newer ones replace older ones.
# require_empty=False: per the author's note, True would raise an error if the
# directory already holds saved models instead of overwriting them.
checkpoint_handler = ModelCheckpoint(
    "./runs_array/",
    "net",
    n_saved=5,
    require_empty=False,
)
# Both the network and the optimizer (`opt`) are checkpointed at the end of every epoch.
trainer.add_event_handler(
    Events.EPOCH_COMPLETED,
    checkpoint_handler,
    to_save={"net": net, "opt": opt},
)

###### StatsHandler: prints the loss at every iteration and the metrics at every epoch.
# No metrics are attached to the trainer here, so only the loss is printed. The print
# functions can be customised, and output_transform can convert engine.state.output
# when it is not a plain loss value.
train_stats_handler = StatsHandler(
    name="trainer",
    output_transform=lambda loss_value: loss_value,
)
train_stats_handler.attach(trainer)

###### TensorBoardStatsHandler: plots the same values that StatsHandler prints.
train_tensorboard_stats_handler = TensorBoardStatsHandler(
    output_transform=lambda loss_value: loss_value,
)
train_tensorboard_stats_handler.attach(trainer)


### Ignite-valid 관련 정의

In [6]:
validation_every_n_epochs = 1
metric_name = "Mean_Dice"
val_metrics = {metric_name: MeanDice()}

# Post-processing applied per sample before the metric is computed:
# predictions get sigmoid + 0.5 threshold, labels only the 0.5 threshold.
post_pred = Compose([EnsureType(), Activations(sigmoid=True), AsDiscrete(threshold=0.5)])
post_label = Compose([EnsureType(), AsDiscrete(threshold=0.5)])   # post-processing for the ground truth


def _to_metric_inputs(x, y, y_pred):
    """Split the batch into per-sample tensors and post-process them.

    Returns ``(predictions, labels)`` as two lists.
    NOTE(review): the original author wondered whether the order could be
    swapped; it is kept as (y_pred, y).
    """
    preds = [post_pred(sample) for sample in decollate_batch(y_pred)]
    labels = [post_label(sample) for sample in decollate_batch(y)]
    return preds, labels


## Evaluator: runs the network over a batch and feeds the post-processed output to the metrics
evaluator = create_supervised_evaluator(
    model=net,
    metrics=val_metrics,
    device=device,
    non_blocking=True,
    output_transform=_to_metric_inputs,
)

###### !! evaluator proc 정의 !!
def run_validation(engine):
    """Run the evaluator once over ``val_loader``.

    ``engine`` is the engine that fired the event; it is not used here.
    """
    evaluator.run(val_loader)


# Fire run_validation on the trainer every `validation_every_n_epochs` completed epochs.
trainer.add_event_handler(
    Events.EPOCH_COMPLETED(every=validation_every_n_epochs),
    run_validation,
)

###### !! EarlyStopping !!
def _trainer_epoch(_):
    """Return the trainer's current epoch, used as the global step for validation output."""
    return trainer.state.epoch


###### EarlyStopping: scored on the validation metric, with patience=4;
# it is attached to the evaluator but stops `trainer`.
early_stopper = EarlyStopping(
    patience=4,
    score_function=stopping_fn_from_metric(metric_name),
    trainer=trainer,
)
evaluator.add_event_handler(Events.EPOCH_COMPLETED, early_stopper)

###### StatsHandler for validation: prints the metrics at every validation epoch.
val_stats_handler = StatsHandler(
    name="evaluator",
    # No loss value to print, so the per-iteration output is disabled.
    output_transform=lambda _output: None,
    # Fetch the global epoch number from the trainer.
    global_epoch_transform=_trainer_epoch,
)
val_stats_handler.attach(evaluator)

###### TensorBoardStatsHandler for validation: plots what the StatsHandler above prints.
val_tensorboard_stats_handler = TensorBoardStatsHandler(
    # No loss value to plot, so the per-iteration output is disabled.
    output_transform=lambda _output: None,
    global_epoch_transform=_trainer_epoch,
)
val_tensorboard_stats_handler.attach(evaluator)

###### TensorBoardImageHandler: draws the first image of the last batch together with
# its label and the model output, at every validation epoch. The 3D output is drawn
# as a GIF along the depth axis.
#   batch_transform  : picks the image and label out of ignite.engine.state.batch
#   output_transform : picks the prediction out of ignite.engine.state.output
val_tensorboard_image_handler = TensorBoardImageHandler(
    batch_transform=lambda batch: (batch[0], batch[1]),
    output_transform=lambda output: output[0],
    global_iter_transform=_trainer_epoch,
)
evaluator.add_event_handler(Events.EPOCH_COMPLETED, val_tensorboard_image_handler)

# Train for up to 30 epochs and show the final engine state.
train_epochs = 30
state = trainer.run(train_loader, max_epochs=train_epochs)
print(state)


INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=30.
ERROR:ignite.engine.engine.Engine:Current run is terminating due to exception: Unable to find a valid cuDNN algorithm to run convolution
ERROR:trainer:Exception: Unable to find a valid cuDNN algorithm to run convolution
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/ignite/engine/engine.py", line 834, in _run_once_on_dataset
    self.state.output = self._process_function(self, self.state.batch)
  File "/opt/conda/lib/python3.8/site-packages/ignite/engine/__init__.py", line 92, in update
    y_pred = model(x)
  File "/home/ducke/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/monai/networks/nets/unet.py", line 281, in forward
    x = self.model(x)
  File "/home/ducke/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_imp

RuntimeError: Unable to find a valid cuDNN algorithm to run convolution

logging을 설정하면 학습 로그를 표준 출력(stdout)으로 확인할 수 있음.

In [17]:
# Check what can be inspected on the state object returned by trainer.run().
type(state), dir(state)

(ignite.engine.events.State,
 ['__class__',
  '__delattr__',
  '__dict__',
  '__dir__',
  '__doc__',
  '__eq__',
  '__format__',
  '__ge__',
  '__getattribute__',
  '__gt__',
  '__hash__',
  '__init__',
  '__init_subclass__',
  '__le__',
  '__lt__',
  '__module__',
  '__ne__',
  '__new__',
  '__reduce__',
  '__reduce_ex__',
  '__repr__',
  '__setattr__',
  '__sizeof__',
  '__str__',
  '__subclasshook__',
  '__weakref__',
  '_update_attrs',
  'batch',
  'dataloader',
  'epoch',
  'epoch_length',
  'event_to_attr',
  'get_event_attrib_value',
  'iteration',
  'max_epochs',
  'metrics',
  'output',
  'seed',
  'times'])

In [22]:
# state.batch holds the last batch; element 0 is the image tensor.
state.batch[0].shape

torch.Size([4, 1, 96, 96, 96])

In [23]:
# Metrics stored on the trainer state; no metrics were attached to the trainer
# in this notebook, so this is expected to be empty.
state.metrics

{}

--------------------------

In [9]:
# Restrict this process to GPUs 5 and 6, indexed in PCI bus order
# (the ordering nvidia-smi reports).
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '5,6'

monai.config.print_config()
tempdir = './dataset'
# The loop below writes into tempdir without creating it, so make sure it exists.
os.makedirs(tempdir, exist_ok=True)

# create 40 random image, mask pairs in the dataset directory
print(f"generating synthetic data to {tempdir} (this may take a while)")
for i in range(40):
    im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1)

    # Identity affine: the synthetic data has no real-world orientation.
    n = nib.Nifti1Image(im, np.eye(4))
    nib.save(n, os.path.join(tempdir, f"im{i:d}.nii.gz"))

    n = nib.Nifti1Image(seg, np.eye(4))
    nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz"))

# Match only the "im<N>" files written above. The earlier cell of this notebook
# writes "img<N>" files into the same directory; a plain "im*" pattern would match
# those as well and leave `images` with twice as many entries as `segs`.
images = sorted(glob(os.path.join(tempdir, "im[0-9]*.nii.gz")))
segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz")))

# Image / segmentation transforms.
# Training: 96^3 random crop plus a random 90-degree rotation (prob=0.5) in the (0, 2) plane.
train_imtrans = Compose([
    ScaleIntensity(),
    AddChannel(),
    RandSpatialCrop(roi_size=(96, 96, 96), random_size=False),
    RandRotate90(prob=0.5, spatial_axes=(0, 2)),
    EnsureType(),
])
# Same steps as the image pipeline, without the intensity scaling.
train_segtrans = Compose([
    AddChannel(),
    RandSpatialCrop(roi_size=(96, 96, 96), random_size=False),
    RandRotate90(prob=0.5, spatial_axes=(0, 2)),
    EnsureType(),
])
# Validation: deterministic resize to 96^3, no random augmentation.
val_imtrans = Compose([
    ScaleIntensity(),
    AddChannel(),
    Resize(spatial_size=(96, 96, 96)),
    EnsureType(),
])
val_segtrans = Compose([
    AddChannel(),
    Resize(spatial_size=(96, 96, 96)),
    EnsureType(),
])

# NOTE: the one-batch shape check on the full dataset (ImageDataset + DataLoader +
# monai.utils.misc.first) that the earlier cell performs is disabled in this cell.

# Training data: the first 20 image / segmentation pairs, shuffled each epoch.
train_ds = ImageDataset(
    images[:20],
    segs[:20],
    transform=train_imtrans,
    seg_transform=train_segtrans,
)
train_loader = DataLoader(
    train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=torch.cuda.is_available()
)

# Validation data: the last 20 pairs.
val_ds = ImageDataset(
    images[-20:],
    segs[-20:],
    transform=val_imtrans,
    seg_transform=val_segtrans,
)
val_loader = DataLoader(
    val_ds,
    batch_size=2,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Device selection: the default CUDA device when available, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 3D U-Net with one input channel and one output channel.
net = monai.networks.nets.UNet(
    spatial_dims=3,
    in_channels=1,
    out_channels=1,
    channels=(16, 32, 64, 128, 256),
    strides=(2, 2, 2, 2),
    num_res_units=2,
)
net = net.to(device)

# Dice loss with sigmoid=True, and an Adam optimizer.
loss = monai.losses.DiceLoss(sigmoid=True)
lr = 1e-3
opt = torch.optim.Adam(net.parameters(), lr=lr)

# Supervised trainer: consumes batch=(img, seg) and yields the loss as its output at
# each iteration. Pass an output_transform to expose other values such as y_pred or y.
trainer = create_supervised_trainer(net, opt, loss, device=device, non_blocking=False)

# Checkpoint the network parameters and optimizer state after each epoch,
# keeping at most 10 checkpoints under ./runs_array/.
checkpoint_handler = ModelCheckpoint(
    dirname="./runs_array/",
    filename_prefix="net",
    n_saved=10,
    require_empty=False,
)
trainer.add_event_handler(
    Events.EPOCH_COMPLETED,
    checkpoint_handler,
    to_save={"net": net, "opt": opt},
)

# Console reporting: loss per iteration, metrics per epoch. The trainer has no metrics
# attached, so only the loss is printed; output_transform is the hook for converting
# engine.state.output when it is not already a loss value.
train_stats_handler = StatsHandler(
    output_transform=lambda out: out,
    name="trainer",
)
train_stats_handler.attach(trainer)

# TensorBoard reporting of the same values.
train_tensorboard_stats_handler = TensorBoardStatsHandler(
    output_transform=lambda out: out,
)
train_tensorboard_stats_handler.attach(trainer)

# Validation settings.
validation_every_n_epochs = 1
metric_name = "Mean_Dice"
# Metric registered on the evaluator engine under `metric_name`.
val_metrics = {metric_name: MeanDice()}

# Per-sample post-processing: sigmoid + 0.5 threshold for predictions,
# 0.5 threshold only for labels.
post_pred = Compose(
    [
        EnsureType(),
        Activations(sigmoid=True),
        AsDiscrete(threshold=0.5),
    ]
)
post_label = Compose(
    [
        EnsureType(),
        AsDiscrete(threshold=0.5),
    ]
)

# The evaluator consumes batch=(img, seg) and emits output=(y_pred, y) per iteration.
# Here the output_transform decollates both into per-sample lists and post-processes them.
evaluator = create_supervised_evaluator(
    model=net,
    metrics=val_metrics,
    device=device,
    non_blocking=True,
    output_transform=lambda x, y, y_pred: (
        [post_pred(pred) for pred in decollate_batch(y_pred)],
        [post_label(label) for label in decollate_batch(y)],
    ),
)

def run_validation(engine):
    """Evaluate the network on ``val_loader``; ``engine`` is not used."""
    evaluator.run(val_loader)


# Trigger validation each time the trainer completes `validation_every_n_epochs` epochs.
trainer.add_event_handler(
    Events.EPOCH_COMPLETED(every=validation_every_n_epochs),
    run_validation,
)

def _current_train_epoch(_):
    """Return the trainer's epoch counter so validation output is indexed by it."""
    return trainer.state.epoch


# Early stopping scored on the validation metric (patience=20);
# it is attached to the evaluator but stops `trainer`.
early_stopper = EarlyStopping(
    patience=20,
    score_function=stopping_fn_from_metric(metric_name),
    trainer=trainer,
)
evaluator.add_event_handler(Events.EPOCH_COMPLETED, early_stopper)

# Print validation stats through the evaluator.
val_stats_handler = StatsHandler(
    name="evaluator",
    output_transform=lambda _out: None,  # no loss value to print; disables per-iteration output
    global_epoch_transform=_current_train_epoch,  # global epoch number comes from the trainer
)
val_stats_handler.attach(evaluator)

# Record the validation metrics to TensorBoard at every validation epoch.
val_tensorboard_stats_handler = TensorBoardStatsHandler(
    output_transform=lambda _out: None,  # no loss value to plot; disables per-iteration output
    global_epoch_transform=_current_train_epoch,
)
val_tensorboard_stats_handler.attach(evaluator)

# Draw the first image of the last batch with its label and the model output at every
# validation epoch; the 3D output is drawn as a GIF along the depth axis.
val_tensorboard_image_handler = TensorBoardImageHandler(
    batch_transform=lambda batch: (batch[0], batch[1]),
    output_transform=lambda output: output[0],
    global_iter_transform=_current_train_epoch,
)
evaluator.add_event_handler(Events.EPOCH_COMPLETED, val_tensorboard_image_handler)

# Train for up to 50 epochs (early stopping may end the run sooner) and show the final state.
train_epochs = 50
state = trainer.run(train_loader, max_epochs=train_epochs)
print(state)


MONAI version: 0.9.dev2152
Numpy version: 1.21.2
Pytorch version: 1.10.0a0+0aef44c
MONAI flags: HAS_EXT = False, USE_COMPILED = False
MONAI rev id: c5bd8aff8ba461d7b349eb92427d452481a7eb72

Optional dependencies:
Pytorch Ignite version: 0.4.6
Nibabel version: 3.2.1
scikit-image version: 0.18.3
Pillow version: 8.4.0
Tensorboard version: 2.6.0
gdown version: 4.2.0
TorchVision version: 0.11.0a0
tqdm version: 4.62.3
lmdb version: 1.2.1
psutil version: 5.8.0
pandas version: 1.3.4
einops version: 0.3.2
transformers version: 4.12.5
mlflow version: 1.21.0

For details about installing the optional dependencies, please visit:
    https://docs.monai.io/en/latest/installation.html#installing-the-recommended-dependencies

generating synthetic data to ./dataset (this may take a while)


2022-01-20 08:23:07,428 ignite.handlers.early_stopping.EarlyStopping INFO: EarlyStopping: Stop training


State:
	iteration: 160
	epoch: 32
	epoch_length: 5
	max_epochs: 50
	output: 0.4166615605354309
	batch: <class 'list'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: <class 'NoneType'>
	times: <class 'dict'>

