In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import argparse
import configparser
import logging
import os
import sys

import numpy as np
import torch
import wandb
from architectures import TempRedUNet
from custom_losses import FocalLoss, LovaszSoftmax3d, SumFocalLovasz
from datasets import SparkDataset
from new_unet import UNet
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from training_inference_tools import (
    compute_class_weights,
    random_flip,
    random_flip_noise,
    sampler,
    test_function,
    training_step,
    weights_init,
)

import unet

logger = logging.getLogger(__name__)

In [5]:
############################# fixed parameters #############################

# --- general settings ---
# key into the logging level map used when the logger is configured
# (3 selects DEBUG, 0 selects ERROR)
verbosity = 3
# change this when publishing finished project on github
logfile = None
wandb_project_name = "sparks"
# directory where output, saved params and testing results are saved
output_relative_path = "runs/"

# --- dataset settings ---
# using 3D data
ndims = 3
# i.e., BG, sparks, waves, puffs
num_classes = 4
# label ignored during training
ignore_index = 4

In [6]:
############################# load config file #############################

# Read the run configuration from config_files/config.ini. A missing file is
# not fatal at this point: the parser simply stays empty, and only the
# lookups that define a fallback value will succeed afterwards.
CONFIG_FILE = os.path.join("config_files", "config.ini")
c = configparser.ConfigParser()
if not os.path.isfile(CONFIG_FILE):
    logger.warning(
        f"No config file found at {CONFIG_FILE}, trying to use fallback values."
    )
else:
    logger.info(f"Loading {CONFIG_FILE}")
    c.read(CONFIG_FILE)

In [7]:
############################## set parameters ##############################

# Gather every run setting from the config parser `c` into one flat dict.
# Options read without a `fallback` are mandatory: configparser raises when
# the section or the option is missing.
params = {}

# --- [training] section ---
params["run_name"] = c.get("training", "run_name", fallback="TEST")
params["load_run_name"] = c.get("training", "load_run_name", fallback=None)
params["load_epoch"] = c.getint("training", "load_epoch", fallback=0)
params["train_epochs"] = c.getint("training", "train_epochs", fallback=5000)
params["criterion"] = c.get("training", "criterion", fallback="nll_loss")
params["lr_start"] = c.getfloat("training", "lr_start", fallback=1e-4)
params["ignore_frames_loss"] = c.getint("training", "ignore_frames_loss")
# loss hyperparameters are stored only for the criteria that read them
if params["criterion"] in ("focal_loss", "sum_losses"):
    params["gamma"] = c.getfloat("training", "gamma", fallback=2.0)
    if params["criterion"] == "sum_losses":
        params["w"] = c.getfloat("training", "w", fallback=0.5)
params["cuda"] = c.getboolean("training", "cuda")

# --- [dataset] section ---
params["relative_path"] = c.get("dataset", "relative_path")
params["dataset_size"] = c.get("dataset", "dataset_size", fallback="full")
params["batch_size"] = c.getint("dataset", "batch_size", fallback=1)
params["num_workers"] = c.getint("dataset", "num_workers", fallback=1)
params["data_duration"] = c.getint("dataset", "data_duration")
params["data_step"] = c.getint("dataset", "data_step")
params["data_smoothing"] = c.get("dataset", "data_smoothing", fallback="2d")
params["norm_video"] = c.get("dataset", "norm_video", fallback="chunk")
params["remove_background"] = c.get("dataset", "remove_background", fallback="average")
params["only_sparks"] = c.getboolean("dataset", "only_sparks", fallback=False)
params["noise_data_augmentation"] = c.getboolean("dataset", "noise_data_augmentation", fallback=False)
params["sparks_type"] = c.get("dataset", "sparks_type", fallback="peaks")
params["inference"] = c.get("dataset", "inference", fallback="overlap")

# --- [network] section ---
params["nn_architecture"] = c.get("network", "nn_architecture", fallback="pablos_unet")
params["unet_steps"] = c.getint("network", "unet_steps")
params["first_layer_channels"] = c.getint("network", "first_layer_channels")
params["num_channels"] = c.getint("network", "num_channels", fallback=1)
# NOTE(review): read as a boolean but the fallback is the integer 1, so the
# stored type depends on whether the option is present -- confirm whether the
# network config expects a flag or an integer dilation here.
params["dilation"] = c.getboolean("network", "dilation", fallback=1)
params["border_mode"] = c.get("network", "border_mode")
params["batch_normalization"] = c.get("network", "batch_normalization", fallback="none")
params["temporal_reduction"] = c.getboolean("network", "temporal_reduction", fallback=False)
params["initialize_weights"] = c.getboolean("network", "initialize_weights", fallback=False)
# these two options are only read (and are then mandatory) for "github_unet"
if params["nn_architecture"] == "github_unet":
    params["attention"] = c.getboolean("network", "attention")
    params["up_mode"] = c.get("network", "up_mode")

In [8]:
############################# configure logger #############################

# verbosity -> logging level (a higher verbosity gives more detailed output)
level_map = {3: logging.DEBUG, 2: logging.INFO, 1: logging.WARNING, 0: logging.ERROR}
log_level = level_map[verbosity]
# every record is written to stdout
log_handlers = (logging.StreamHandler(sys.stdout),)

# "{"-style format: time, centred level name, centred logger name, line number
logging.basicConfig(
    handlers=log_handlers,
    level=log_level,
    style="{",
    format="[{asctime}] [{levelname:^8s}] [{name:^12s}] <{lineno:^4d}> -- {message:s}",
    datefmt="%H:%M:%S",
)

In [9]:
############################# configure wandb ##############################

# Weights & Biases tracking is opt-in through the [general] config section;
# when enabled, the run is registered under the configured run name.
if c.getboolean("general", "wandb_enable", fallback=False):
    wandb.init(
        name=params["run_name"],
        project=wandb_project_name,
        notes=c.get("general", "wandb_notes", fallback=None),
    )
    # let the "wandb" logger emit DEBUG records as well
    logging.getLogger("wandb").setLevel(logging.DEBUG)
    # wandb.save(CONFIG_FILE)

In [10]:
############################# print parameters #############################

# Log one right-aligned "name: value" line per collected parameter.
logger.info("Command parameters:")
for param_name, param_value in params.items():
    logger.info(f"{param_name:>18s}: {param_value}")
    # TODO: add all the necessary params that should be printed

[14:37:59] [  INFO  ] [  __main__  ] < 3  > -- Command parameters:
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --           run_name: TEST
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --      load_run_name: None
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --         load_epoch: 0
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --       train_epochs: 100000
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --          criterion: lovasz_softmax
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --           lr_start: 0.0001
[14:37:59] [  INFO  ] [  __main__  ] < 5  > -- ignore_frames_loss: 6
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --               cuda: True
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --      relative_path: ../data/sparks_dataset
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --       dataset_size: minimal
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --         batch_size: 4
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --        num_workers: 0
[14:37:59] [  INFO  ] [  __main__  ] < 5  > --

In [11]:
############################ configure datasets ############################

# select samples that are used for training and testing
if params["dataset_size"] == "full":
    train_sample_ids = [
        "01", "02", "03", "04", "06", "07", "08", "09",
        "11", "12", "13", "14", "16", "17", "18", "19",
        "21", "22", "23", "24", "27", "28", "29", "30",
        "33", "35", "36", "38", "39",
        "41", "42", "43", "44", "46",
    ]
    test_sample_ids = ["05", "10", "15", "20", "25", "32", "34", "40", "45"]
elif params["dataset_size"] == "minimal":
    # smallest possible split: one training movie and one testing movie
    train_sample_ids = ["01"]
    test_sample_ids = ["34"]
else:
    logger.error(f"{params['dataset_size']} is not a valid dataset size.")
    # Raise instead of calling exit(): exit() is a helper meant for the
    # interactive shell and, inside a notebook, it ends the session instead of
    # reporting a failed cell. An exception stops the run and keeps the
    # kernel state available for inspection.
    raise ValueError(f"{params['dataset_size']} is not a valid dataset size.")

# detect CUDA devices
# CUDA is used only when it is both requested in the config and available.
if params["cuda"]:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type != "cuda":
        logger.warning("CUDA was requested but is not available, using CPU instead")
    # Pinned host memory only speeds up host-to-GPU copies, so enable it only
    # when a CUDA device was actually selected (previously it was enabled
    # whenever CUDA was requested, even after falling back to the CPU).
    pin_memory = device.type == "cuda"
else:
    device = "cpu"
    pin_memory = False
n_gpus = torch.cuda.device_count()
logger.info(f"Using torch device {device}, with {n_gpus} GPUs")

# report whether temporal reduction is used
if params["temporal_reduction"]:
    logger.info(f"Using temporal reduction with {params['num_channels']} channels")

# report how videos are normalized (whole videos or chunks individually);
# values without an entry in the table are not reported
norm_video_messages = {
    "chunk": "Normalizing each chunk using min and max",
    "movie": "Normalizing whole video using min and max",
    "abs_max": "Normalizing whole video using 16-bit absolute max",
}
if params["norm_video"] in norm_video_messages:
    logger.info(norm_video_messages[params["norm_video"]])

# resolve and check the dataset root directory
dataset_path = os.path.realpath(params["relative_path"])
assert os.path.isdir(dataset_path), f'"{dataset_path}" is not a directory'
logger.info(f"Using {dataset_path} as dataset root path")

# options passed identically to the training and to the testing datasets
shared_dataset_kwargs = dict(
    base_path=dataset_path,
    smoothing=params["data_smoothing"],
    step=params["data_step"],
    duration=params["data_duration"],
    remove_background=params["remove_background"],
    temporal_reduction=params["temporal_reduction"],
    num_channels=params["num_channels"],
    normalize_video=params["norm_video"],
    only_sparks=params["only_sparks"],
    sparks_type=params["sparks_type"],
    ignore_index=ignore_index,
)

# initialize training dataset (all training samples in a single dataset)
dataset = SparkDataset(
    sample_ids=train_sample_ids,
    testing=False,
    inference=None,
    **shared_dataset_kwargs,
)

# apply transforms: random flips, optionally combined with noise
train_transform = (
    random_flip_noise if params["noise_data_augmentation"] else random_flip
)
dataset = unet.TransformedDataset(dataset, train_transform)

logger.info(f"Samples in training dataset: {len(dataset)}")

# initialize testing dataset
pattern_test_filenames = os.path.join(
    dataset_path, "videos_test", "[0-9][0-9]_video.tif"
)

# one testing dataset per test sample
testing_datasets = [
    SparkDataset(
        sample_ids=[sample_id],
        testing=True,
        ignore_frames=params["ignore_frames_loss"],
        inference=params["inference"],
        **shared_dataset_kwargs,
    )
    for sample_id in test_sample_ids
]

for i, tds in enumerate(testing_datasets):
    logger.info(f"Testing dataset {i} contains {len(tds)} samples")

# initialize data loaders (shuffled batches of the training dataset)
dataset_loader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=params["batch_size"],
    num_workers=params["num_workers"],
    pin_memory=pin_memory,
)

[14:37:59] [  INFO  ] [  __main__  ] < 57 > -- Using torch device cuda, with 1 GPUs
[14:37:59] [  INFO  ] [  __main__  ] < 69 > -- Normalizing whole video using 16-bit absolute max
[14:37:59] [  INFO  ] [  __main__  ] < 74 > -- Using C:\Users\dotti\sparks_project\data\sparks_dataset as dataset root path
[14:37:59] [  INFO  ] [  datasets  ] <296 > -- Added padding of 12 frames to video with unsuitable duration
[14:37:59] [  INFO  ] [  __main__  ] < 98 > -- Samples in training dataset: 9
[14:37:59] [  INFO  ] [  datasets  ] <192 > -- Computing spark peaks...
[14:38:03] [  INFO  ] [  datasets  ] <199 > -- Sample 34 contains 16 sparks.
[14:38:03] [  INFO  ] [  datasets  ] <296 > -- Added padding of 24 frames to video with unsuitable duration
[14:38:03] [  INFO  ] [  __main__  ] <127 > -- Testing dataset 0 contains 22 samples


In [12]:
############################## configure UNet ##############################

if params["nn_architecture"] == "pablos_unet":

    # translate the config string into the boolean passed to UNetConfig
    batch_norm = {"batch": True, "none": False}

    unet_config = unet.UNetConfig(
        steps=params["unet_steps"],
        first_layer_channels=params["first_layer_channels"],
        num_classes=num_classes,
        ndims=ndims,
        dilation=params["dilation"],
        border_mode=params["border_mode"],
        batch_normalization=batch_norm[params["batch_normalization"]],
        num_input_channels=params["num_channels"],
    )

    if not params["temporal_reduction"]:
        network = unet.UNetClassifier(unet_config)
    else:
        assert (
            params["data_duration"] % params["num_channels"] == 0
        ), "using temporal reduction chunks_duration must be a multiple of num_channels"
        network = TempRedUNet(unet_config)

elif params["nn_architecture"] == "github_unet":
    # NOTE(review): params["up_mode"] is read from the config for this
    # architecture but is not passed to UNet below -- confirm whether that is
    # intended.
    network = UNet(
        in_channels=params["num_channels"],
        out_channels=num_classes,
        n_blocks=params["unet_steps"] + 1,
        start_filts=params["first_layer_channels"],
        # up_mode = 'transpose', # TODO: test different possibilities
        # up_mode='resizeconv_nearest',  # Enable to avoid checkerboard artifacts
        merge_mode="concat",  # Default, reportedly works better
        # planar_blocks=(0,), # maybe figure out what this is and test it ??
        activation="relu",
        normalization=params[
            "batch_normalization"
        ],  # I think this is 'none' in Pablo's implementation
        attention=params["attention"],  # maybe to be tested with 'True' ??
        # full_norm=False,  # Uncomment to restore old sparse normalization scheme
        dim=ndims,
        conv_mode=params["border_mode"],  # 'valid' apparently has some advantages...
    )

else:
    # Fail here with a clear message instead of hitting a NameError on
    # `network` further down (or silently reusing a network left over from a
    # previous run of the notebook).
    raise ValueError(
        f"{params['nn_architecture']} is not a valid network architecture."
    )

# `device` may be the plain string "cpu" or a torch.device object. Comparing a
# torch.device with a string is not reliable, so normalise it first; otherwise
# a CPU fallback device would still be wrapped in DataParallel.
if torch.device(device).type != "cpu":
    network = nn.DataParallel(network).to(device)
    torch.backends.cudnn.benchmark = True

if c.getboolean("general", "wandb_enable", fallback=False):
    wandb.watch(network)

if params["initialize_weights"]:
    logger.info("Initializing UNet weights...")
    network.apply(weights_init)

In [13]:
########################### initialize training ############################

# Adam optimizer over all network parameters; the network is put in train mode
optimizer = optim.Adam(network.parameters(), lr=params["lr_start"])
network.train()

# everything produced by this run is stored below <output_relative_path>/<run_name>
output_path = os.path.join(output_relative_path, params["run_name"])
logger.info(f"Output directory: {output_path}")

summary_writer = SummaryWriter(os.path.join(output_path, "summary"), purge_step=0)

# directory of a previous run to load from, if one is configured
# (identity test against None instead of `!= None`)
if params["load_run_name"] is not None:
    load_path = os.path.join(output_relative_path, params["load_run_name"])
    logger.info(f"Model loaded from directory: {load_path}")
else:
    load_path = None

# class weights (only computed for the criteria that take them)
if params["criterion"] in ["nll_loss", "focal_loss", "sum_losses"]:
    class_weights = compute_class_weights(dataset)
    logger.info(
        "Using class weights: {}".format(
            ", ".join(str(w.item()) for w in class_weights)
        )
    )

# loss function selected by the "criterion" setting
# NOTE(review): only the NLL loss moves the class weights to `device`; the
# focal-based losses receive them as computed -- confirm that those losses
# handle the device placement themselves.
if params["criterion"] == "nll_loss":
    criterion = nn.NLLLoss(
        ignore_index=ignore_index, weight=class_weights.to(device)
    )
elif params["criterion"] == "focal_loss":
    criterion = FocalLoss(
        reduction="mean",
        ignore_index=ignore_index,
        alpha=class_weights,
        gamma=params["gamma"],
    )
elif params["criterion"] == "lovasz_softmax":
    criterion = LovaszSoftmax3d(
        classes="present", per_image=False, ignore=ignore_index
    )
elif params["criterion"] == "sum_losses":
    criterion = SumFocalLovasz(
        classes="present",
        per_image=False,
        ignore=ignore_index,
        alpha=class_weights,
        gamma=params["gamma"],
        reduction="mean",
        w=params["w"],
    )
else:
    # Without this branch an unknown name left `criterion` undefined (or stale
    # from an earlier run), which only surfaced later inside the training
    # and testing callbacks.
    raise ValueError(f"{params['criterion']} is not a valid criterion.")

# directory where predicted class movies are saved
preds_output_dir = os.path.join(output_path, "predictions")
os.makedirs(preds_output_dir, exist_ok=True)


def _run_training_step(_):
    """Run one training step; the argument supplied by the trainer is ignored.

    All objects and settings are looked up when the function is called, not
    when it is defined.
    """
    return training_step(
        sampler,
        network,
        optimizer,
        device,
        criterion,
        dataset_loader,
        ignore_frames=params["ignore_frames_loss"],
        wandb_log=c.getboolean("general", "wandb_enable", fallback=False),
    )


def _run_test_function(_):
    """Evaluate the network on the testing datasets; the argument is ignored."""
    return test_function(
        network=network,
        device=device,
        criterion=criterion,
        testing_datasets=testing_datasets,
        ignore_frames=params["ignore_frames_loss"],
        wandb_log=c.getboolean("general", "wandb_enable", fallback=False),
        training_name=params["run_name"],
        output_dir=preds_output_dir,
        training_mode=True,
        debug=c.getboolean("general", "debug_mode", fallback=False),
    )


trainer = unet.TrainingManager(
    # training items
    training_step=_run_training_step,
    save_every=c.getint("training", "save_every", fallback=5000),
    # load_path=load_path,
    save_path=output_path,
    managed_objects=unet.managed_objects(
        {"network": network, "optimizer": optimizer}
    ),
    # testing items
    test_function=_run_test_function,
    test_every=c.getint("training", "test_every", fallback=1000),
    # NOTE(review): plot_every is read from the "test_every" option --
    # confirm this is intended and not a copy-paste of the line above.
    plot_every=c.getint("training", "test_every", fallback=1000),
    summary_writer=summary_writer,
)

[14:38:03] [  INFO  ] [  __main__  ] < 7  > -- Output directory: runs/TEST


In [14]:
############################## start training ##############################

# a non-zero load_epoch asks the trainer to load that epoch first
if params["load_epoch"]:
    trainer.load(params["load_epoch"])

# the training procedure runs only when enabled in the [general] section
if c.getboolean("general", "training", fallback=False):
    logger.info("Validate network before training")
    trainer.run_validation()
    logger.info("Starting training")
    trainer.train(
        params["train_epochs"],
        print_every=c.getint("training", "print_every", fallback=100),
    )


In [16]:

# Final validation pass, run only when "testing" is enabled in [general].
# NOTE(review): the recorded output of this cell ends with
# "IndexError: list index out of range", raised right after zero sparks were
# detected and while pairwise scores were being computed -- presumably the
# testing code does not handle an empty set of predictions; confirm in
# training_inference_tools.
if c.getboolean("general", "testing", fallback=False):  # Run validation only (no training)
    logger.info("Starting final validation")
    trainer.run_validation()

[14:40:17] [  INFO  ] [  __main__  ] < 2  > -- Starting final validation
[14:40:17] [  INFO  ] [unet.trainer] <139 > -- Validating network at iteration 0...
[14:40:17] [ DEBUG  ] [training_inference_tools] <433 > -- Testing function: running sample in UNet
[14:40:28] [ DEBUG  ] [training_inference_tools] <465 > -- Time to run sample 34 in UNet: 11.05 s
[14:40:28] [ DEBUG  ] [training_inference_tools] <473 > -- Testing function: re-organising annotations
[14:40:31] [ DEBUG  ] [training_inference_tools] <502 > -- Testing function: getting processed output (segmentation and instances)
[14:40:32] [ DEBUG  ] [data_processing_tools] <439 > -- Events detection threshold: 0.783
[14:40:40] [ DEBUG  ] [data_processing_tools] <506 > -- Number of sparks detected by nonmaxima suppression: 0
[14:40:42] [ DEBUG  ] [training_inference_tools] <538 > -- Time to process predictions: 14.82 s
[14:40:43] [ DEBUG  ] [training_inference_tools] <555 > -- Testing function: computing pairwise scores between 25 a

IndexError: list index out of range