## Create rasterized datasets

In [1]:
import sys

sys.path.append("..")
from utils.preprocess import PreprocessCylindricalAnnotations
import os

# The dataset name is the name of the directory this notebook is run from.
# os.path.basename is used instead of splitting on "/" so this also works
# where the path separator is not "/".
dataset = os.path.basename(os.getcwd())
annotation_path = f"../annotations/{dataset}/"
# os.listdir returns entries in arbitrary order; sort so the list of CSVs
# handed to the preprocessing step is the same on every run and machine.
annotation_csvs = sorted(
    os.path.join(annotation_path, csv_name)
    for csv_name in os.listdir(annotation_path)
    if csv_name.endswith(".csv")
)
annotation_name = "plasmodesmata"
preprocess = PreprocessCylindricalAnnotations(
    username="ackermand",
    annotation_name=annotation_name,
    training_csvs=annotation_csvs,
    # The ROI spreadsheet export lives next to the annotation CSVs.
    rois_to_split_yml=os.path.join(annotation_path, "rois_from_spreadsheet.yml"),
    dataset=dataset,
    radius=4,
)
# `preprocess` is reused further down for its `rois_dict` and
# `pseudorandom_training_centers` attributes.
preprocess.standard_preprocessing()
# seems to work https://neuroglancer-demo.appspot.com/#!gs://flyem-user-links/short/2023-09-30.025524.json

rois as annotations: precomputed://https://cellmap-vm1.int.janelia.org/dm11/ackermand/neuroglancer_annotations/plasmodesmata/splitting/jrc_22ak351-leaf-3r/bounding_boxes


100%|██████████| 2038/2038 [00:07<00:00, 280.38it/s]
100%|██████████| 2038/2038 [03:00<00:00, 11.31it/s]
100%|██████████| 2038/2038 [00:02<00:00, 936.06it/s]


number of original centers: 2038, number of training centers: 1432


100%|██████████| 606/606 [00:00<00:00, 162318.68it/s]


annotations: precomputed://https://cellmap-vm1.int.janelia.org/dm11/ackermand/neuroglancer_annotations/plasmodesmata/removed_annotations/jrc_22ak351-leaf-3r/removed_annotations


100%|██████████| 1432/1432 [00:00<00:00, 142594.98it/s]

annotations: precomputed://https://cellmap-vm1.int.janelia.org/dm11/ackermand/neuroglancer_annotations/plasmodesmata/removed_annotations/jrc_22ak351-leaf-3r/kept_annotations





# Dacapo

In [2]:
from dacapo.experiments.architectures import CNNectomeUNetConfig
from dacapo.experiments.trainers import GunpowderTrainerConfig
from dacapo.experiments.trainers.gp_augments import (
    ElasticAugmentConfig,
    IntensityAugmentConfig,
)
from dacapo.experiments.tasks import AffinitiesTaskConfig
from funlib.geometry.coordinate import Coordinate
import math

## Trainer

In [3]:
# Augmentation configs handed to the trainer (elastic first, then intensity).
elastic_augment = ElasticAugmentConfig(
    control_point_spacing=(100, 100, 100),
    control_point_displacement_sigma=(10.0, 10.0, 10.0),
    rotation_interval=(0, math.pi / 2.0),
    subsample=8,
    uniform_3d_rotation=True,
)
intensity_augment = IntensityAugmentConfig(
    scale=(0.7, 1.3),
    shift=(-0.2, 0.2),
    clip=True,
)

# The trainer name spells out the learning rate (5E-5) and the fact that
# predictor nodes are not added to the dataset; keep the name in sync with
# `learning_rate` and `add_predictor_nodes_to_dataset` when changing either.
trainer_config = GunpowderTrainerConfig(
    name="default_v2_no_dataset_predictor_node_lr_5E-5",
    learning_rate=0.00005,
    batch_size=2,
    augments=[elastic_augment, intensity_augment],
    clip_raw=True,
    min_masked=0.05,
    num_data_fetchers=20,
    snapshot_interval=10000,
    add_predictor_nodes_to_dataset=False,
)

## Task

In [4]:
# Single source of truth for the ratio: it is passed to the task config and
# also formatted (two decimals, e.g. "0.50") into the task name, so the name
# can never disagree with the value actually used.
lsds_to_affs_weight_ratio = 0.50

task_config = AffinitiesTaskConfig(
    name=f"3d_lsdaffs_weight_ratio_{lsds_to_affs_weight_ratio:.2f}",
    # Axis-aligned offsets at distances 1, 3 and 9 along each of the three axes.
    neighborhood=[
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (3, 0, 0),
        (0, 3, 0),
        (0, 0, 3),
        (9, 0, 0),
        (0, 9, 0),
        (0, 0, 9),
    ],
    lsds=True,
    lsds_to_affs_weight_ratio=lsds_to_affs_weight_ratio,
)

## Architecture

By default, I rasterized the annotations at the same resolution as the raw data. However, the default architecture (with the upsampling layer configured via `upsample_factors`) expects the ground truth, including the mask and the validation data, to be at 2x the raw resolution, which caused an error when submitting the run. Since we don't need the higher resolution at the moment, we simply comment out the upsampling layer (`constant_upsample` and `upsample_factors`).

In [5]:
# Shapes and per-level downsampling factors, named so they are easy to tune.
unet_input_shape = Coordinate(216, 216, 216)
unet_eval_shape_increase = Coordinate(72, 72, 72)
unet_downsample_factors = [(2, 2, 2), (3, 3, 3), (3, 3, 3)]

architecture_config = CNNectomeUNetConfig(
    name="unet",
    input_shape=unet_input_shape,
    eval_shape_increase=unet_eval_shape_increase,
    fmaps_in=1,
    fmaps_out=72,
    num_fmaps=12,
    fmap_inc_factor=6,
    downsample_factors=unet_downsample_factors,
    # The upsampling options below are deliberately left disabled: with them
    # the network expects ground truth, mask and validation data at 2x the raw
    # resolution, while the rasterization here is at the raw resolution.
    # constant_upsample=True,
    # upsample_factors=[(2, 2, 2)],
)

## Datasplit

EVERYTHING MUST BE IN Z,Y,X AND NM!

In [6]:
from dacapo.store.create_store import create_config_store

# Config store used at the end of this cell to persist the datasplit config.
config_store = create_config_store()

# use pseudorandom centers
from pathlib import Path
from dacapo.experiments.datasplits.datasets.arrays import (
    ZarrArrayConfig,
    IntensitiesArrayConfig,
    CropArrayConfig,
)
from dacapo.experiments.datasplits.datasets import RawGTDatasetConfig
from dacapo.experiments.datasplits import TrainValidateDataSplitConfig
from funlib.geometry import Roi

# Raw EM volume as stored on disk (full-resolution s0 level of the uint8 data).
raw_zarr_config = ZarrArrayConfig(
    name="raw",
    file_name=Path(f"/nrs/cellmap/data/{dataset}/{dataset}.n5"),
    dataset="em/fibsem-uint8/s0",
)
# The raw array has to be wrapped in an IntensitiesArrayConfig with the uint8
# range (0-255): without the wrapper we get an error. This setup was the
# suggested fix, see
# https://cell-map.slack.com/archives/D02KBQ990ER/p1683762491204909
raw_config = IntensitiesArrayConfig(
    name="raw",
    source_array_config=raw_zarr_config,
    min=0,
    max=255,
)

# Ground truth: the annotations rasterized as cylinders, one dataset per volume.
gt_config = ZarrArrayConfig(
    name=annotation_name,
    file_name=Path(
        f"/nrs/cellmap/ackermand/cellmap/{annotation_name}/annotations_as_cylinders.n5"
    ),
    dataset=dataset,
)

# Mask that excludes regions where plasmodesmata annotations overlap.
mask_config = ZarrArrayConfig(
    name="mask",
    file_name=Path(
        f"/nrs/cellmap/ackermand/cellmap/{annotation_name}/annotation_intersection_masks.zarr"
    ),
    dataset=dataset,
)

# NOTE: Everything has to be in z,y,x
# One validation dataset per validation ROI: the ground truth is cropped to
# the ROI, while raw and mask are passed through uncropped.
validation_data_config = [
    RawGTDatasetConfig(
        f"val_{roi_index}",
        raw_config=raw_config,
        gt_config=CropArrayConfig(
            f"val_gt_{roi_index}",
            source_array_config=gt_config,
            roi=validation_roi,
        ),
        mask_config=mask_config,
    )
    for roi_index, validation_roi in enumerate(preprocess.rois_dict["validation"])
]

# Each pseudorandom training center is reversed before being wrapped in a
# Coordinate (presumably stored as x,y,z and needed as z,y,x; confirm against
# the preprocessing code).
training_sample_points = [
    Coordinate(center[::-1]) for center in preprocess.pseudorandom_training_centers
]

training_data_config = RawGTDatasetConfig(
    "train",
    raw_config=raw_config,
    gt_config=gt_config,
    mask_config=mask_config,
    sample_points=training_sample_points,
)

# A single training dataset plus one validation dataset per validation ROI.
datasplit_config = TrainValidateDataSplitConfig(
    name=f"{dataset}_{annotation_name}_pseudorandom_training_centers",
    train_configs=[training_data_config],
    validate_configs=validation_data_config,
)

# Persist the datasplit so it can be combined with other configs later.
config_store.store_datasplit_config(datasplit_config)

## Run

In [7]:
from dacapo.experiments import RunConfig
from dacapo.experiments.starts import StartConfig
from dacapo.store.create_store import create_config_store

config_store = create_config_store()

# Starting point for fine-tuning: an earlier run name and the iteration
# ("140000") to start from. Set to None to train from scratch; the run name
# prefix below then switches to "scratch".
start_config = StartConfig(
    "finetuned_3d_lsdaffs_weight_ratio_0.50_plasmodesmata_pseudorandom_training_centers_maxshift_18_more_annotations_unet_default_v2_no_dataset_predictor_node_lr_5E-5__1",
    "140000",
)
iterations = 200000
# NOTE(review): an earlier comment described this interval as "huge" so that
# validation could be postponed until after training, but 5000 out of 200000
# iterations means validation runs regularly; confirm which is intended.
validation_interval = 5000
repetitions = 3

# The part of the run name shared by all repetitions.
run_kind = "finetuned" if start_config is not None else "scratch"
run_name_prefix = "_".join(
    [
        run_kind,
        task_config.name,
        datasplit_config.name,
        architecture_config.name,
        trainer_config.name,
    ]
)

for i in range(repetitions):
    # Repetitions differ only in the "__<i>" name suffix and `repetition`.
    run_config = RunConfig(
        name=run_name_prefix + f"__{i}",
        task_config=task_config,
        datasplit_config=datasplit_config,
        architecture_config=architecture_config,
        trainer_config=trainer_config,
        num_iterations=iterations,
        validation_interval=validation_interval,
        repetition=i,
        start_config=start_config,
    )
    config_store.store_run_config(run_config)
    # "dacapo run -r {run_config.name}"
    print(
        f"visualize run: python /groups/scicompsoft/home/ackermand/Programming/ml_experiments/scripts/visualize_pipeline.py visualize-pipeline -r {run_config.name}"
    )

visualize run: python /groups/scicompsoft/home/ackermand/Programming/ml_experiments/scripts/visualize_pipeline.py visualize-pipeline -r finetuned_3d_lsdaffs_weight_ratio_0.50_jrc_22ak351-leaf-3r_plasmodesmata_pseudorandom_training_centers_unet_default_v2_no_dataset_predictor_node_lr_5E-5__0
visualize run: python /groups/scicompsoft/home/ackermand/Programming/ml_experiments/scripts/visualize_pipeline.py visualize-pipeline -r finetuned_3d_lsdaffs_weight_ratio_0.50_jrc_22ak351-leaf-3r_plasmodesmata_pseudorandom_training_centers_unet_default_v2_no_dataset_predictor_node_lr_5E-5__1
visualize run: python /groups/scicompsoft/home/ackermand/Programming/ml_experiments/scripts/visualize_pipeline.py visualize-pipeline -r finetuned_3d_lsdaffs_weight_ratio_0.50_jrc_22ak351-leaf-3r_plasmodesmata_pseudorandom_training_centers_unet_default_v2_no_dataset_predictor_node_lr_5E-5__2


# Prediction Mask

In [7]:
from funlib.persistence import open_ds, prepare_ds
from funlib.geometry import Roi, Coordinate
from scipy.ndimage import binary_dilation, distance_transform_edt
import numpy as np

# Build prediction masks: everything that is NOT part of the column cells or
# their target cells, grown by 1, 2 and 3 binary-dilation iterations, written
# as one uint8 dataset per dilation setting.

# The two input segmentations do not depend on the dilation setting, so they
# are read and combined once instead of on every loop pass.
segmentation_n5 = f"/nrs/cellmap/ackermand/cellmap/leaf-gall/{dataset}.n5"
column_cells_ds = open_ds(segmentation_n5, "plasmodesmata_column_cells")
target_cells_ds = open_ds(segmentation_n5, "plasmodesmata_column_target_cells")
voxel_size = column_cells_ds.voxel_size

# True where either segmentation is labelled (non-zero).
cells = np.logical_or(
    column_cells_ds.to_ndarray() > 0, target_cells_ds.to_ndarray() > 0
)
# The region that gets dilated is the complement of the cells.
background = np.logical_not(cells)

# The loop variable is deliberately not called `iterations`, so that it does
# not overwrite the training iteration count defined in the Run cell.
for dilation_iterations in range(1, 4):
    dilated_background = binary_dilation(background, iterations=dilation_iterations)

    output_ds = prepare_ds(
        "/nrs/cellmap/ackermand/cellmap/leaf-gall/prediction_masks.zarr",
        f"dilation_iterations_{dilation_iterations}_{dataset}",
        # The ROI is taken from the target-cells dataset; this assumes both
        # inputs cover the same ROI (TODO confirm).
        total_roi=target_cells_ds.roi,
        voxel_size=voxel_size,
        dtype=np.uint8,
        # NOTE(review): 64 * 256 per axis is a fixed size that does not depend
        # on voxel_size; confirm this is the intended write size.
        write_size=Coordinate(np.array([64, 64, 64]) * 256),
        # Presumably replaces an existing dataset of the same name; confirm
        # against the installed funlib.persistence version.
        delete=True,
        # force_exact_write_size=True
    )
    output_ds[target_cells_ds.roi] = dilated_background