# Preprocessing and Mask-creating pipeline

## Imports

In [None]:
import os
from src.utils import load_yaml, build_paths

## Config loading and folder structure creation

First, we load the configuration file, which specifies the paths of the directories used during preprocessing, mask generation, and mask fusion.

**Prerequisites:**
- Existing folders and files in root directory:
    - `./data/raw-train/images/`
    - `./data/raw-train/tiger-coco.json`
    - `./data/raw-test/images/`
    - `./data/raw-test/masks/`
    - `./data/processed-train/`
    - `./data/processed-test/`
 
You can easily change the root data path `./data` and all the other paths based on your preference in the `./configs/paths.yaml` file. 

In [None]:
# Location of the YAML file that declares every directory used by this notebook
config_path = "./configs/paths.yaml"

# Parse it once; the cells below only read from `paths`
paths = load_yaml(config_path)

In [None]:
# Sanity check: look up the "all" entry of the "pseudo_mask_dirs" section of
# the loaded config. As a bare trailing expression its value is shown as the
# cell output when run in a notebook; nothing is assigned or modified.
paths["pseudo_mask_dirs"]["all"]

In [None]:
# Root data directory; every location below is resolved relative to it
root = paths["root_data_dir"]

# First-level directories: raw and processed data for the train and test sets
raw_train, raw_test, processed_train, processed_test = (
    os.path.join(root, paths[key])
    for key in ("raw_train", "raw_test", "processed_train", "processed_test")
)

# Reusable path fragments (joined onto the first-level directories below)
images, masks, patches = paths["images"], paths["masks"], paths["patches"]
tiger_annotations = paths["tiger_annotations"]
tnbc_binary_masks = paths["tnbc_binary_masks"]
scaled, folds = paths["scaled"], paths["folds"]

# Fragments naming each image-source variant
source_dirs = paths["source_dirs"]
eq = source_dirs["raw_eq"]
normalized = source_dirs["normalized"]
normalized_eq = source_dirs["normalized_eq"]
hematoxylin = source_dirs["hematoxylin"]
hematoxylin_eq = source_dirs["hematoxylin_eq"]

# Raw data locations
raw_train_images = os.path.join(raw_train, images)
raw_test_images, raw_test_masks = (
    os.path.join(raw_test, part) for part in (images, masks)
)

# Processed train data locations, one per fragment, all under `processed_train`
(
    train_eq,
    train_normalized,
    train_normalized_eq,
    train_hematoxylin,
    train_hematoxylin_eq,
    train_patches,
    train_masks,
) = (
    os.path.join(processed_train, part)
    for part in (
        eq,
        normalized,
        normalized_eq,
        hematoxylin,
        hematoxylin_eq,
        patches,
        masks,
    )
)

# Locations where the created pseudo-masks will be stored. Each config entry is
# handed to `build_paths` together with the train masks directory.
# NOTE(review): the results are later appended to a list of directories, so
# `build_paths` presumably returns a list of paths - confirm in src.utils.
pseudo_mask_dirs = paths["pseudo_mask_dirs"]
train_pseudo_masks_all = build_paths(train_masks, pseudo_mask_dirs["all"])
train_pseudo_masks_best_75 = build_paths(train_masks, pseudo_mask_dirs["best_75"])
train_pseudo_masks_best_50 = build_paths(train_masks, pseudo_mask_dirs["best_50"])
train_pseudo_masks_best_25 = build_paths(train_masks, pseudo_mask_dirs["best_25"])
train_pseudo_masks_leave_fused = build_paths(train_masks, pseudo_mask_dirs["fused"])

# Processed test data locations, all under `processed_test`
test_binary_masks, test_normalized, test_scaled, test_patches = (
    os.path.join(processed_test, part)
    for part in (tnbc_binary_masks, normalized, scaled, patches)
)
# Folds live inside the test patches directory
test_folds = os.path.join(test_patches, folds)

In [None]:
# Create the output directory tree.
# Every processed-data directory and every pseudo-mask directory is collected
# into one flat list, then created if it is not already present on disk.
dirs_to_create = [
    # processed train
    train_eq,
    train_normalized,
    train_normalized_eq,
    train_hematoxylin,
    train_hematoxylin_eq,
    train_patches,
    train_masks,
    # processed test
    test_binary_masks,
    test_normalized,
    test_scaled,
    test_patches,
    test_folds,
    # pseudo-mask directories
    *train_pseudo_masks_all,
    *train_pseudo_masks_best_75,
    *train_pseudo_masks_best_50,
    *train_pseudo_masks_best_25,
    *train_pseudo_masks_leave_fused,
]

# Count the paths that were already present so the summary can report both numbers
skipped = 0
for directory in dirs_to_create:
    if not os.path.exists(directory):
        # makedirs also creates any missing parent directories
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory '{directory}'")
    else:
        skipped += 1
        print(f"Directory '{directory}' already exists. Skipping.")

created = len(dirs_to_create) - skipped
print(f"Created {created} directories, skipped {skipped}.")