# Create Training and Validation Dataset

Create training and validation datasets without using the DeepD3 training GUI. This notebook builds on the `deepd3.data_preparation` module.


## Imports

In [None]:
import random
from datetime import datetime
from pathlib import Path

import imageio.v2 as io
import numpy as np
import tifffile

# DeepD3 imports
from deepd3.data_preparation.structure_data import create_d3data, create_d3set
from deepd3.data_preparation.dummy_stack import dummy_stack
from deepd3.training.utils import set_seed

## Define path variables

In [2]:
# Input locations: raw images and their label masks.
data_folder = Path(r"../data/images")
label_folder = Path(r"../data/labels/")

# Output locations. parents=True so the cell also succeeds when an
# intermediate directory has not been created yet.
output_d3data = Path("../data/d3data")
output_d3data.mkdir(parents=True, exist_ok=True)

output_d3set = Path("../data/d3set")
output_d3set.mkdir(parents=True, exist_ok=True)

# Get all image names. glob() yields entries in arbitrary, filesystem-dependent
# order, so sort them to process the images in the same order on every run.
img_filenames = sorted(data_folder.glob("*.tif"))

# Date stamp (YYMMDD) used as prefix for the .d3set file names.
today = datetime.now().strftime("%y%m%d")

Get the resolution values from saved images (assumes square pixels).

In [3]:
# One pixel size per image, in the same order as img_filenames.
resolutions = np.zeros(len(img_filenames), dtype=float)

for n, path in enumerate(img_filenames):

    with tifffile.TiffFile(path) as tif:
        try:
            x_res = tif.pages[0].tags["XResolution"].value
        except KeyError as err:
            # Name the offending file instead of surfacing a bare KeyError
            raise KeyError(f"{path} has no XResolution tag; cannot determine the pixel size") from err

    # The tag value may be a (numerator, denominator) pair or a plain number
    if isinstance(x_res, tuple):
        x_res = x_res[0] / x_res[1] if x_res[1] else 0.0
    else:
        x_res = float(x_res)

    if x_res <= 0:
        raise ValueError(f"{path} has an invalid XResolution value ({x_res}); cannot determine the pixel size")

    # XResolution is pixels per unit, so its inverse is the size of one pixel.
    # NOTE(review): the ResolutionUnit tag is not read here; the d3data log
    # reports this value as µm/px — confirm the images store pixels per µm.
    resolutions[n] = round(1.0 / x_res, 3)

## Create structured d3data

This step prepares custom 2D images for training. Calling `dummy_stack` adds a singleton dimension as a workaround for the missing z dimension.

In [4]:
for n, img_path in enumerate(img_filenames):

    # Skip images that were already converted before doing any file I/O
    output_fn = output_d3data / f"{img_path.stem}.d3data"
    if output_fn.exists():
        continue

    # Labels are expected next to each other as <image stem>_dendrites.tif
    # and <image stem>_spines.tif
    dendrites_label_path = label_folder / f"{img_path.stem}_dendrites.tif"
    spines_label_path = label_folder / f"{img_path.stem}_spines.tif"

    # Load images
    image = io.imread(img_path)
    dendrites_label = io.imread(dendrites_label_path)
    spines_label = io.imread(spines_label_path)

    # Shift intensities so the minimum becomes 0, then convert to uint16.
    # NOTE(review): astype() does not rescale — fractional parts are dropped
    # and values above 65535 do not fit. Confirm the inputs are integer images
    # within the uint16 range.
    image = image - image.min()
    image = image.astype(np.uint16)

    # Create dummy stacks if images are 2D
    if image.ndim == 2:
        image = dummy_stack(image)
        dendrites_label = dummy_stack(dendrites_label)
        spines_label = dummy_stack(spines_label)

    assert image.shape == dendrites_label.shape == spines_label.shape, \
        "Image and label shapes do not match!"

    # Pixel size computed for this image in the resolution cell (same index)
    resolution = resolutions[n]

    # Save as .d3data
    create_d3data(
        img_path,
        image,
        dendrites_label,
        spines_label,
        resolution,
        str(output_fn))

Saved ..\data\d3data\img_0001.d3data with stack shape (1, 276, 1104) and resolution 0.091 µm/px
Saved ..\data\d3data\img_0002.d3data with stack shape (1, 256, 128) and resolution 0.117 µm/px
Saved ..\data\d3data\img_0003.d3data with stack shape (1, 130, 130) and resolution 0.19 µm/px
Saved ..\data\d3data\img_0004.d3data with stack shape (1, 260, 130) and resolution 0.146 µm/px
Saved ..\data\d3data\img_0005.d3data with stack shape (1, 260, 130) and resolution 0.085 µm/px
Saved ..\data\d3data\img_0006.d3data with stack shape (1, 390, 130) and resolution 0.077 µm/px
Saved ..\data\d3data\img_0007.d3data with stack shape (1, 396, 132) and resolution 0.061 µm/px
Saved ..\data\d3data\img_0008.d3data with stack shape (1, 272, 136) and resolution 0.11 µm/px
Saved ..\data\d3data\img_0009.d3data with stack shape (1, 408, 136) and resolution 0.066 µm/px
Saved ..\data\d3data\img_0010.d3data with stack shape (1, 490, 140) and resolution 0.114 µm/px
Saved ..\data\d3data\img_0011.d3data with stack sha

## Create training and validation sets

First let's set a random seed for reproducibility.

In [None]:
# Fixed seed so that the stochastic steps below are repeatable.
# NOTE(review): which generators set_seed() covers is defined in
# deepd3.training.utils — confirm it includes Python's `random` module.
set_seed(41)

Now let's split the d3data files into two groups: a training set and a validation set.

In [9]:
# Sort first: glob() order is filesystem-dependent, and a seeded shuffle is
# only repeatable when it starts from the same input order.
d3data_files = sorted(output_d3data.glob("*.d3data"))
assert len(d3data_files) >= 2, \
    f"Need at least two .d3data files in {output_d3data} to build a train/val split"

# Shuffle with a dedicated, locally seeded generator so that re-running this
# cell always yields the same split, independent of the global random state.
split_rng = random.Random(41)
split_rng.shuffle(d3data_files)

# split index (e.g. 80% train, 20% val)
train_ratio = 0.8
split_idx = int(len(d3data_files) * train_ratio)

train_samples = d3data_files[:split_idx]
val_samples   = d3data_files[split_idx:]

# Catch a degenerate split (e.g. train_ratio set to 0 or 1) before saving
assert train_samples and val_samples, \
    "Both the training and the validation split must contain at least one file"

## Save sets

In [10]:
# Each split is written to its own date-stamped .d3set file.
training_set_path = output_d3set / f"{today}_training_dataset.d3set"
validation_set_path = output_d3set / f"{today}_validation_dataset.d3set"

create_d3set(train_samples, str(training_set_path))
create_d3set(val_samples, str(validation_set_path))

..\data\d3data\img_0017.d3data ...
..\data\d3data\img_0041.d3data ...
..\data\d3data\img_0006.d3data ...
..\data\d3data\img_0008.d3data ...
..\data\d3data\img_0049.d3data ...
..\data\d3data\img_0028.d3data ...
..\data\d3data\img_0048.d3data ...
..\data\d3data\img_0013.d3data ...
..\data\d3data\img_0030.d3data ...
..\data\d3data\img_0024.d3data ...
..\data\d3data\img_0042.d3data ...
..\data\d3data\img_0043.d3data ...
..\data\d3data\img_0039.d3data ...
..\data\d3data\img_0032.d3data ...
..\data\d3data\img_0031.d3data ...
..\data\d3data\img_0005.d3data ...
..\data\d3data\img_0038.d3data ...
..\data\d3data\img_0012.d3data ...
..\data\d3data\img_0003.d3data ...
..\data\d3data\img_0007.d3data ...
..\data\d3data\img_0034.d3data ...
..\data\d3data\img_0014.d3data ...
..\data\d3data\img_0026.d3data ...
..\data\d3data\img_0027.d3data ...
..\data\d3data\img_0004.d3data ...
..\data\d3data\img_0040.d3data ...
..\data\d3data\img_0044.d3data ...
..\data\d3data\img_0020.d3data ...
..\data\d3data\img_0

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['Generated_from', 'Timestamp'], dtype='object')]

  store.put(group._v_pathname + "/" + name, level)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['Generated_from', 'Timestamp'], dtype='object')]

  store.put(group._v_pathname + "/" + name, level)
