In [1]:
import os
from glob import glob
from natsort import natsorted
from pathlib import Path

from Source.Utils import create_dir, generate_dataset, generate_tiled_dataset, generate_masks, sanity_check, split_dataset, split_dataset_simple

In [2]:
### Greate a dataset split if there isn't any
data_root = Path("D:/Datasets/SortedWesternData/_curatedDataset")

all_images = []
all_xml = []


for sub_dir in data_root.iterdir():
    if "OK" in sub_dir.name:
        #print(sub_dir)
        _images = natsorted(glob(f"{sub_dir}/*.jpg"))
        _xml = natsorted(glob(f"{sub_dir}/page/*.xml"))

        assert(len(_images) == len(_xml))

        all_images.extend(_images)
        all_xml.extend(_xml)


print(f"Dataset => Images: {len(all_images)}, XML: {len(all_xml)}")

sanity_check(all_images, all_xml)

train_images, train_xml, val_images, val_xml, test_images, test_xml = split_dataset(all_images, all_xml)

assert (len(train_images) == len(train_xml))
assert(len(val_images) == len(val_xml))
assert(len(test_images) == len(test_xml))


print(f"Train => Images: {len(train_images)}, XML: {len(train_xml)}")
print(f"Val => Images: {len(val_images)}, XML: {len(val_xml)}")
print(f"Test => Images: {len(test_images)}, XML: {len(test_xml)}")

Dataset => Images: 93, XML: 93
Train => Images: 74, XML: 74
Val => Images: 9, XML: 9
Test => Images: 10, XML: 10


In [3]:
output_dir = os.path.join(data_root, "MultiClassDataset")

train_imgs_dir = os.path.join(output_dir, "train", "images")
train_masks_dir = os.path.join(output_dir, "train", "masks")

val_imgs_dir = os.path.join(output_dir, "val", "images")
val_masks_dir  = os.path.join(output_dir, "val", "masks")

test_imgs_dir = os.path.join(output_dir, "test", "images")
test_masks_dir  = os.path.join(output_dir, "test", "masks")

create_dir(train_imgs_dir)
create_dir(train_masks_dir)

create_dir(val_imgs_dir)
create_dir(val_masks_dir)

create_dir(test_imgs_dir)
create_dir(test_masks_dir)

generate_dataset(train_images, train_xml, train_imgs_dir, train_masks_dir)
generate_dataset(val_images, val_xml, val_imgs_dir, val_masks_dir)

100%|██████████| 74/74 [00:06<00:00, 12.13it/s]
100%|██████████| 9/9 [00:00<00:00, 11.90it/s]


In [None]:
# Tiled Dataset

output_dir = os.path.join(data_root, "MultiClassDataset", "Tiled_v2")

train_imgs_dir = os.path.join(output_dir, "train", "images")
train_masks_dir = os.path.join(output_dir, "train", "masks")

val_imgs_dir = os.path.join(output_dir, "val", "images")
val_masks_dir  = os.path.join(output_dir, "val", "masks")

test_imgs_dir = os.path.join(output_dir, "test", "images")
test_masks_dir  = os.path.join(output_dir, "test", "masks")

create_dir(train_imgs_dir)
create_dir(train_masks_dir)

create_dir(val_imgs_dir)
create_dir(val_masks_dir)

create_dir(test_imgs_dir)
create_dir(test_masks_dir)

tile_overlap = 0.8

generate_tiled_dataset(train_images, train_xml, train_imgs_dir, train_masks_dir, overlap=tile_overlap)
generate_tiled_dataset(val_images, val_xml, val_imgs_dir, val_masks_dir, overlap=tile_overlap)
generate_tiled_dataset(test_images, test_xml, test_imgs_dir, test_masks_dir, overlap=tile_overlap)

100%|██████████| 74/74 [02:24<00:00,  1.96s/it]
100%|██████████| 9/9 [00:06<00:00,  1.44it/s]
100%|██████████| 10/10 [00:06<00:00,  1.66it/s]


In [None]:
# individual directory
data_root = "D:/Datasets/SortedWesternData/_curatedDataset"
input_dir = f"{data_root}/W1PD192038"
overlay = "no"
annotate_lines = "yes"

generate_masks(input_dir, annotate_lines, overlay)

100%|██████████| 9/9 [00:01<00:00,  7.36it/s]


In [None]:
# generate tiled dataset from pre-created train/val/test distribution based on sub-directories
overlay_preview = "no"
filter_blank = "no"
precrop = False
patch_size = 512
dataset_root = "G:/Datasets/BDRC/_LineLayoutDatasets/LayoutData_Done_LongLines"
data_dir = Path(os.path.join(dataset_root, "Data"))

output_data_dir = os.path.join(dataset_root, "Dataset")

distributions = ["train", "test", "val"]

filter_for_ok_flag = True

for dist in distributions:
    """
    Note that this loop presupposes that each directory has a train, test, and val sub-directory 
    in which the repsective images and page-xml files are stored.
    This is basically for a scenario in which one really wants to hand-craft the individual data splits
    to have full control over the the data (e.g. samples with images etc. in each split)
    """
    img_out_dir = os.path.join(output_data_dir, dist, "Images")
    mask_out_dir = os.path.join(output_data_dir, dist, "Masks")

    create_dir(img_out_dir)
    create_dir(mask_out_dir)
               
    for sub_dir in data_dir.iterdir():
        if "OK" in sub_dir.name: # remove that if you want, I just used this to have a handy filter in a directory where the is some wip on the datasets
            distribution_path = f"{sub_dir}/{dist}"

            _images = natsorted(glob(f"{distribution_path}/*.jpg"))
            _xml = natsorted(glob(f"{distribution_path}/page/*.xml"))

            assert(len(_images) == len(_xml))

            generate_tiled_dataset(_images, _xml, img_out_dir, mask_out_dir)