In [1]:
import os
import sys
import os
import csv
import pandas as pd
import random
from random import shuffle
import numpy as np
from scipy import ndimage
from PIL import Image
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split


## Data preparation
#### Generate masks


In [2]:
# DECLARING CONSTANTS
# Seed passed to train_test_split so the train/val/test split is repeatable.
RANDOM_STATE = 2023
# Path to the data directory
DATA_DIR = "data"
# Names of the image directories (joined onto the data directory)
TRAIN_SET_DIR = "training_set"
TRAIN_SET_LABELS_DIR = "training_set_label"
TEST_SET_DIR = "test_set"
# Path to the config directory
CONFIG_DIR = "config"
# Names of the CSV files written into the config directory
FETAL_HC_ALL = "fetal_hc_all.csv"
FETAL_HC_TRAIN = "fetal_hc_train.csv"
FETAL_HC_VAL = "fetal_hc_valid.csv"
# Backward-compatible alias: the constant was originally spelled FETA_HC_VAL
# and existing cells still refer to it under that name.
FETA_HC_VAL = FETAL_HC_VAL
FETAL_HC_TEST = "fetal_hc_test.csv"
FETAL_HC_TEST_GT_SEG = "fetal_hc_test_gt_seg.csv"

In [3]:
def get_backgrouond(img: np.ndarray) -> np.ndarray:  # 2D or 3D
    """Return a mask of the connected component that ``ndimage.label`` numbers 1.

    Non-zero elements of ``img`` are grouped into connected components using
    connectivity 1 (neighbours that share a face), and the component with
    label 1 is returned as the background.
    NOTE(review): label 1 is presumably the region outside the annotated
    contour because it is met first when scanning the array -- confirm that
    this holds for every input image.

    Args:
        img: 2D or 3D array; every non-zero element counts as part of a
            component.

    Returns:
        A ``uint8`` array with the shape of ``img``: 1 inside component 1 and
        0 elsewhere. If ``img`` has no non-zero element, an all-zero ``uint8``
        array is returned (and a message is printed).

    Raises:
        ValueError: if a non-empty ``img`` is neither 2D nor 3D.
        AssertionError: if ``img`` does not contain exactly two components.
    """
    # `any()` is a stricter emptiness test than `sum() == 0`, which can also be
    # true for signed data whose values cancel out.
    if not img.any():
        print("the largest component is null")
        # Same dtype as the normal return path, so callers always get uint8.
        return np.asarray(img, np.uint8)
    if img.ndim not in (2, 3):
        raise ValueError("the dimension number shoud be 2 or 3")
    # Connectivity-1 structuring element of the same rank as the input.
    im_structure = ndimage.generate_binary_structure(img.ndim, 1)
    labeled_array, num_components = ndimage.label(img, im_structure)  # labeling
    # Raised explicitly (instead of a bare `assert`) so the check still runs
    # under `python -O`; the exception type is unchanged for existing callers.
    if num_components != 2:
        raise AssertionError(
            f"expected exactly 2 connected components, found {num_components}"
        )
    return np.asarray(labeled_array == 1, np.uint8)


def get_segmentation_masks(root: str) -> None:
    """Write a filled 0/255 mask for every annotation image under ``root``.

    Each ``*Annotation.png`` file in ``root/TRAIN_SET_DIR`` is read, its zero
    pixels are split into connected components, and every pixel outside the
    background component returned by ``get_backgrouond`` is set to 255. The
    result is saved in ``root/TRAIN_SET_LABELS_DIR`` under the same file name
    with ``"Annotation"`` replaced by ``"seg"``.

    Args:
        root: Data directory containing ``TRAIN_SET_DIR``.
    """
    root = Path(root)
    train_set = root / TRAIN_SET_DIR
    target_dir = root / TRAIN_SET_LABELS_DIR
    target_dir.mkdir(exist_ok=True)

    # Sorted so the processing order (and the progress bar) is deterministic.
    file_paths = sorted(train_set.glob("*Annotation.png"))

    for file_path in tqdm(file_paths):
        # The context manager closes the image file once its pixels have been
        # copied into a NumPy array, so handles do not pile up over the loop.
        with Image.open(file_path) as image:
            data = np.asarray(image)
            # True where the annotation is black, i.e. not part of the contour.
            data = np.asarray(data == 0)

        data_bg = get_backgrouond(data)
        # Everything that is not background becomes 255.
        data_fg = np.asarray(1.0 - data_bg, np.uint8) * 255

        out_img = Image.fromarray(data_fg)

        out_name = file_path.name.replace("Annotation", "seg")
        out_name = target_dir / out_name

        out_img.save(out_name)


In [4]:
# Write a 0/255 mask into DATA_DIR/TRAIN_SET_LABELS_DIR for every
# "*Annotation.png" found in DATA_DIR/TRAIN_SET_DIR.
get_segmentation_masks(DATA_DIR)

 79%|███████▉  | 792/999 [00:06<00:02, 97.41it/s] 

In [None]:
def create_train_csv_files(root: str) -> None:
    """Write the CSV files that list image/label pairs and their data split.

    Every ``*HC.png`` image in ``root/TRAIN_SET_DIR`` is paired with the label
    ``<image stem>_seg<suffix>`` in ``root/TRAIN_SET_LABELS_DIR`` (the inverse
    of ``filename_without_seg``). Paths are stored relative to ``root`` with
    forward slashes. The pairs are split with ``RANDOM_STATE``: 20% go to the
    test set, then 25% of the remainder go to validation.

    Files written into ``CONFIG_DIR``: ``FETAL_HC_ALL``, ``FETAL_HC_TRAIN``,
    ``FETA_HC_VAL``, ``FETAL_HC_TEST`` (columns ``image``, ``label``) and
    ``FETAL_HC_TEST_GT_SEG`` (columns ``ground_truth``, ``segmentation``).

    Args:
        root: Data directory containing the image and label directories.

    Raises:
        ValueError: if an image has no matching label file.
    """
    root = Path(root)
    config = Path(CONFIG_DIR)
    # Make sure the output directory exists before any CSV is written.
    config.mkdir(parents=True, exist_ok=True)

    all_names_file = config / FETAL_HC_ALL
    train_names_file = config / FETA_HC_VAL if False else config / FETAL_HC_TRAIN
    valid_names_file = config / FETA_HC_VAL
    test_names_file = config / FETAL_HC_TEST

    images_dir = root / TRAIN_SET_DIR
    label_dir = root / TRAIN_SET_LABELS_DIR

    # Directory listings come back in arbitrary order, so listing images and
    # labels separately can pair an image with the wrong mask. Sort the images
    # (which also makes the seeded split reproducible) and derive each label
    # path from its image name instead.
    image_files = sorted(images_dir.glob("*HC.png"))
    label_files = [label_dir / f"{path.stem}_seg{path.suffix}" for path in image_files]

    missing = [str(path) for path in label_files if not path.is_file()]
    if missing:
        raise ValueError(
            f"{len(missing)} label file(s) not found, e.g. {missing[0]}"
        )

    # Relative to the `root` argument rather than the global DATA_DIR, and
    # always with "/" separators regardless of platform.
    images_paths = [path.relative_to(root).as_posix() for path in image_files]
    filenames = [path.relative_to(root).as_posix() for path in label_files]

    df_all = pd.DataFrame({"image": images_paths, "label": filenames})
    df_all.to_csv(all_names_file, index=False)

    df_train, df_test = train_test_split(df_all, test_size=0.2, random_state=RANDOM_STATE)
    df_train, df_valid = train_test_split(df_train, test_size=0.25, random_state=RANDOM_STATE)

    df_train.to_csv(train_names_file, index=False)
    df_valid.to_csv(valid_names_file, index=False)
    df_test.to_csv(test_names_file, index=False)

    # Ground-truth label path next to the bare file name with "_seg" removed.
    test_labels = df_test["label"]
    df_gt_seg = pd.DataFrame(
        {
            "ground_truth": test_labels,
            "segmentation": test_labels.map(filename_without_seg),
        }
    )
    df_gt_seg.to_csv(config / FETAL_HC_TEST_GT_SEG, index=False)


def filename_without_seg(file_name: str) -> str:
    """Return the bare file name of ``file_name`` with every ``"_seg"`` removed.

    Args:
        file_name: File name or path; any directory part is discarded.

    Returns:
        The final path component (stem plus suffix) after ``"_seg"`` has been
        stripped from the whole input string.
    """
    stripped = Path(file_name.replace("_seg", ""))
    # `name` is exactly stem + suffix of the last path component.
    return stripped.name

In [None]:
# Write the all/train/valid/test CSV listings (and the test ground-truth CSV)
# into CONFIG_DIR, using the images and masks found under DATA_DIR.
create_train_csv_files(DATA_DIR)


In [None]:
# NOTE(review): the same training command is repeated in the next cell, which
# creates `result/` first; running both cells trains twice -- confirm whether
# this cell is still needed.
!pymic_train config/unet.cfg

In [None]:
# Create the `result` directory if it is missing, then run `pymic_train` with
# config/unet.cfg.
# NOTE(review): `result` is presumably the output directory named in
# config/unet.cfg -- confirm.
!mkdir -p result
!pymic_train config/unet.cfg

In [None]:
# Run `pymic_eval_seg` with config/evaluation.cfg.
# NOTE(review): presumably this evaluates the pairs listed in
# FETAL_HC_TEST_GT_SEG -- confirm against the config file.
!pymic_eval_seg config/evaluation.cfg