# Create Training and Validation Dataset

Create datasets without using the DeepD3 training GUI.


## Imports

In [None]:
import imageio as io
import numpy as np
import pandas as pd
from pathlib import Path

# DeepD3 imports
from deepd3.deepd3.data_preparation.structure_data import create_d3data, create_d3set
from deepd3.deepd3.data_preparation.dummy_stack import dummy_stack

## Open log

In [6]:
# Read the CSV log; its first column is used as the row index.
log_path = Path(r"../data/log.csv")
log_df = pd.read_csv(log_path, index_col=0)

# Keep the "resolution" value of every third log row, starting with the first.
resolutions = log_df["resolution"].iloc[::3].to_numpy()

# Sanity check: this only holds when the number of log rows is a multiple
# of three (the slice above rounds up, the integer division rounds down).
assert len(log_df) // 3 == len(resolutions)

## Define path variables

In [None]:
# Input folders: the images and their label files.
data_folder = Path(r"../data/images")
label_folder = Path(r"../data/labels/")

# Output folders; create them (together with any missing parent folders)
# so the cell also works on a fresh checkout.
output_d3data = Path("../data/d3data")
output_d3data.mkdir(parents=True, exist_ok=True)

output_d3set = Path("../data/d3set")
output_d3set.mkdir(parents=True, exist_ok=True)

# Get all image names. glob() yields entries in arbitrary order, so sort
# them to obtain the same order on every machine and run.
# NOTE(review): the images are paired with `resolutions` by position further
# down -- confirm that the log rows follow this same (sorted) order.
img_filenames = sorted(data_folder.glob("*.tif"))

## Create structured d3data

In [None]:
# Convert every image, together with its dendrite and spine label files,
# into one .d3data file in the output folder.
for n, img_path in enumerate(img_filenames):

    # Label files share the image's stem plus a fixed suffix
    dendrites_label_path = label_folder / f"{img_path.stem}_dendrites.tif"
    spines_label_path = label_folder / f"{img_path.stem}_spines.tif"

    # Load images
    image = io.imread(img_path)
    dendrites_label = io.imread(dendrites_label_path)
    spines_label = io.imread(spines_label_path)

    # Shift intensities so the minimum is zero, then convert image to uint16
    image = image - image.min()
    image = image.astype(np.uint16)

    # Create dummy stacks if images are 2D; otherwise pass the loaded arrays
    # through unchanged. Without the else branch a non-2D image would either
    # raise a NameError (first iteration) or silently reuse the arrays of the
    # previously processed image.
    if image.ndim == 2:
        stack = dummy_stack(image)
        dendrite = dummy_stack(dendrites_label)
        spines = dummy_stack(spines_label)
    else:
        stack = image
        dendrite = dendrites_label
        spines = spines_label

    # Resolution is looked up by the image's position in `img_filenames`
    resolution = resolutions[n]

    # Save as .d3data
    output_fn = output_d3data / f"{img_path.stem}.d3data"

    # Create d3data file
    create_d3data(
        img_path,
        stack,
        dendrite,
        spines,
        resolution,
        str(output_fn))

## Create training and validation sets

First, let's set a random seed for reproducibility.

In [10]:
import random

# Seed Python's global random number generator with a fixed value.
SEED = 42
random.seed(SEED)

Now let's split the `.d3data` files into two groups: a training set and a validation set.

In [None]:
# Collect all .d3data files. glob() yields entries in arbitrary order, so
# sort them first: a seeded shuffle is only reproducible when it starts
# from the same input order.
d3data_files = sorted(output_d3data.glob("*.d3data"))
random.shuffle(d3data_files)

# split index (e.g. 80% train, 20% val)
train_ratio = 0.8
split_idx = int(len(d3data_files) * train_ratio)

# Everything before the split index is used for training, the rest for validation
train_samples = d3data_files[:split_idx]
val_samples   = d3data_files[split_idx:]

NameError: name 'output_d3data' is not defined

## Save sets

In [None]:
# Write each split to its own .d3set file inside the d3set output folder.
for samples, set_name in (
    (train_samples, "train.d3set"),
    (val_samples, "val.d3set"),
):
    create_d3set(samples, str(output_d3set / set_name))