In [1]:
from datasets import Dataset, DatasetDict, Image
from util import ImageDataset, readSetFromFile
import numpy as np

In [2]:
def load_images(
    masks_glob, 
    includes_file=None,
    imagery_folder="imagery/",
    masks_folder="masks/", 
    fraction=0.1, 
    exclude=None
):
    """Collect sample names and image/mask paths for the Train and Test subsets.

    Two ``ImageDataset`` objects are built with identical arguments except for
    ``subset`` ("Train" and "Test"). Both folders are resolved relative to
    ``../learning/`` by plain string concatenation, so ``imagery_folder`` and
    ``masks_folder`` are expected to carry their own trailing slash.

    Args:
        masks_glob: Pattern forwarded to ``ImageDataset`` to select mask files.
        includes_file: Optional path read with ``readSetFromFile``; when given,
            its entries are forwarded as ``include_masks``. When falsy,
            ``include_masks`` is ``None``.
        imagery_folder: Imagery folder name, appended to ``../learning/``.
        masks_folder: Masks folder name, appended to ``../learning/``.
        fraction: Forwarded unchanged to ``ImageDataset`` for both subsets.
        exclude: Optional collection of names forwarded as ``exclude_masks``.
            ``None`` (the default) means "exclude nothing".

    Returns:
        A pair ``(train, val)``. Each element is a tuple
        ``(names, image_paths, mask_paths)`` where ``names`` is the dataset's
        ``names`` attribute and the two path lists hold ``str`` conversions of
        ``image_names`` and ``mask_names``.
    """
    # A fresh list per call replaces the former shared mutable default (`[]`).
    # Test with `is None` rather than truthiness: callers may pass a NumPy
    # array, whose truth value is ambiguous.
    exclude_masks = [] if exclude is None else exclude
    include_masks = list(readSetFromFile(includes_file, str)) if includes_file else None

    def build_subset(subset):
        # Only `subset` differs between the Train and Test datasets.
        return ImageDataset(
            "../learning/" + imagery_folder,
            "../learning/" + masks_folder,
            masks_glob,
            include_masks=include_masks,
            exclude_masks=exclude_masks,
            subset=subset,
            fraction=fraction,
        )

    def as_path_triple(data):
        # Paths are converted to plain strings for downstream consumers.
        return (
            data.names,
            [str(path) for path in data.image_names],
            [str(path) for path in data.mask_names],
        )

    train_data = build_subset("Train")
    print("")  # Blank line separating the log output of the two datasets.
    val_data = build_subset("Test")

    return as_path_triple(train_data), as_path_triple(val_data)

def create_dataset(names, image_paths, label_paths=None):
    """Build a ``Dataset`` whose path columns are cast to the ``Image`` feature.

    The resulting columns are ``image``, optionally ``label`` (only when
    ``label_paths`` is not ``None``), and ``name``, in that order.
    """
    columns = {"image": image_paths}
    image_columns = ["image"]

    # The label column is optional: prediction-only datasets have no masks.
    if label_paths is not None:
        columns["label"] = label_paths
        image_columns.append("label")
    columns["name"] = names

    dataset = Dataset.from_dict(columns)
    for column in image_columns:
        dataset = dataset.cast_column(column, Image())

    return dataset

[stodoran/elwha-segmentation-manual](https://huggingface.co/datasets/stodoran/elwha-segmentation-manual)

In [3]:
# Manually corrected masks, restricted to the names listed in train_images.txt.
# `fraction=0.45` is forwarded to ImageDataset; the run recorded below split
# 226 masks into 125 Train / 101 Test.
train_paths, val_paths = load_images("*_corrected.png", "../data/train_images.txt", fraction=0.45)
train_dataset_manual = create_dataset(*train_paths)
# Reused as the "validation" split of the v1 and v2 datasets built further down.
validation_dataset_manual = create_dataset(*val_paths)

dataset = DatasetDict({
    "train": train_dataset_manual,
    "validation": validation_dataset_manual,
})
# push_to_hub assumes you have run the `huggingface-cli login` command in the terminal/notebook
dataset.push_to_hub("stodoran/elwha-segmentation-manual")

Found and loaded 4382 images with glob *_corrected.png.
Pruned 4156 masks based on set of 0 included masks.
Pruned 0 masks from set of 0 excluded masks.
Subset of 125 ground truth segmentation masks marked for Train.

Found and loaded 4382 images with glob *_corrected.png.
Pruned 4156 masks based on set of 0 included masks.
Pruned 0 masks from set of 0 excluded masks.
Subset of 101 ground truth segmentation masks marked for Test.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/stodoran/elwha-segmentation-manual/commit/7fad9623980c28fda648192e2a25fe4e4432db70', commit_message='Upload dataset', commit_description='', oid='7fad9623980c28fda648192e2a25fe4e4432db70', pr_url=None, pr_revision=None, pr_num=None)

In addition to the validation images, two extra image names are excluded: they are additional datapoints used for validation experiments and must not be trained on.

In [4]:
# val_paths[0] is the validation `names` element returned by load_images; two
# further names are appended so none of them end up in a training split.
exclude_names = np.concatenate((val_paths[0], ["Elwha_MR_20160714_062_014", "Elwha_MR_20170922_062_014"]))

[stodoran/elwha-segmentation-v1](https://huggingface.co/datasets/stodoran/elwha-segmentation-v1)

In [5]:
# v1 training data: "*_binary.png" masks limited to useful_images.txt, minus
# everything in `exclude_names`. With fraction=0 the Test subset comes back
# empty (see the log below), so the second return value is discarded.
train_paths_v1, _ = load_images("*_binary.png", "../data/useful_images.txt", fraction=0, exclude=exclude_names)
train_dataset_v1 = create_dataset(*train_paths_v1)

# Validation reuses the manually corrected split built earlier in the notebook.
dataset = DatasetDict({
    "train": train_dataset_v1,
    "validation": validation_dataset_manual,
})
# Requires Hugging Face Hub credentials (e.g. via `huggingface-cli login`).
dataset.push_to_hub("stodoran/elwha-segmentation-v1")

Found and loaded 4382 images with glob *_binary.png.
Pruned 3164 masks based on set of 103 included masks.
Pruned 69 masks from set of 103 excluded masks.
Subset of 1149 ground truth segmentation masks marked for Train.

Found and loaded 4382 images with glob *_binary.png.
Pruned 3164 masks based on set of 103 included masks.
Pruned 69 masks from set of 103 excluded masks.
Subset of 0 ground truth segmentation masks marked for Test.


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/477 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/stodoran/elwha-segmentation-v1/commit/3f1811afc9b341bf14502e772e908e74fead9469', commit_message='Upload dataset', commit_description='', oid='3f1811afc9b341bf14502e772e908e74fead9469', pr_url=None, pr_revision=None, pr_num=None)

[stodoran/elwha-segmentation-v2](https://huggingface.co/datasets/stodoran/elwha-segmentation-v2)

In [6]:
# v2 training data: masks from corrections_v1/, minus `exclude_names`.
# NOTE(review): in glob/fnmatch syntax `[!_manualfix]` is a negated character
# class matching ONE character, not a "does not end with _manualfix" test. It
# skips any file whose last character before ".png" is one of "_manualfix"
# (assuming ImageDataset uses standard glob matching; confirm). That only
# equals the presumed intent if no wanted mask name ends in one of those
# characters.
train_paths_v2, _ = load_images("*[!_manualfix].png", masks_folder="corrections_v1/", fraction=0, exclude=exclude_names)
train_dataset_v2 = create_dataset(*train_paths_v2)

# Validation reuses the manually corrected split built earlier in the notebook.
dataset = DatasetDict({
    "train": train_dataset_v2,
    "validation": validation_dataset_manual,
})
# Requires Hugging Face Hub credentials (e.g. via `huggingface-cli login`).
dataset.push_to_hub("stodoran/elwha-segmentation-v2")

Found and loaded 1148 images with glob *[!_manualfix].png.
Pruned 41 masks from set of 103 excluded masks.
Subset of 1107 ground truth segmentation masks marked for Train.

Found and loaded 1148 images with glob *[!_manualfix].png.
Pruned 41 masks from set of 103 excluded masks.
Subset of 0 ground truth segmentation masks marked for Test.


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/stodoran/elwha-segmentation-v2/commit/52ebaf5e3fcbaf26b93db581cc85f6343965e559', commit_message='Upload dataset', commit_description='', oid='52ebaf5e3fcbaf26b93db581cc85f6343965e559', pr_url=None, pr_revision=None, pr_num=None)

[stodoran/elwha-segmentation-predict](https://huggingface.co/datasets/stodoran/elwha-segmentation-predict)

In [7]:
# Prediction dataset: the masks folder is pointed at the imagery folder, so the
# "masks" matched by "*.png" are the images themselves (4382 files in the log
# below). No include/exclude filtering is applied and fraction=0 leaves Test empty.
train_paths_pred, _ = load_images("*.png", imagery_folder="imagery/", masks_folder="imagery/", fraction=0)
# Keep only (names, image_paths); dropping the 3rd item makes create_dataset
# build a dataset without a "label" column.
train_paths_pred = train_paths_pred[:2] # Remove the labels paths (3rd item in tuple)
train_dataset_pred = create_dataset(*train_paths_pred)

# Single split named "data" since there is no train/validation distinction here.
dataset = DatasetDict({
    "data": train_dataset_pred,
})
# Requires Hugging Face Hub credentials (e.g. via `huggingface-cli login`).
dataset.push_to_hub("stodoran/elwha-segmentation-predict")

Found and loaded 4382 images with glob *.png.
Pruned 0 masks from set of 0 excluded masks.
Subset of 4382 ground truth segmentation masks marked for Train.

Found and loaded 4382 images with glob *.png.
Pruned 0 masks from set of 0 excluded masks.
Subset of 0 ground truth segmentation masks marked for Test.


Uploading the dataset shards:   0%|          | 0/7 [00:00<?, ?it/s]

Map:   0%|          | 0/626 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Map:   0%|          | 0/626 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]