In [1]:
from datasets import Dataset, DatasetDict, Image
from util import TLDataset, readSetFromFile
import numpy as np

In [2]:
def loadImages(
    masks_glob,
    indices_file=None,
    imagery_folder="imagery/",
    masks_folder="masks/",
    fraction=0.1,
    exclude=()
):
    """Locate image/mask file paths and split them into train and validation.

    Args:
        masks_glob: Glob pattern for mask files (e.g. ``"*_corrected.png"``).
            With the ``*`` removed it is also the suffix appended to each image
            name to build the mask path.
        indices_file: Optional path handed to ``readSetFromFile``; the values
            it yields are used to index into the array of all image names.
            When falsy, every name is used.
        imagery_folder: Image sub-folder (with trailing slash) under
            ``../learning/``.
        masks_folder: Mask sub-folder (with trailing slash) under
            ``../learning/``.
        fraction: Share of the selected names assigned to validation. The
            first ``int(len(names) * fraction)`` names become the validation
            split and the remainder the train split.
        exclude: Iterable of image names (without extension) to drop before
            splitting.

    Returns:
        A pair ``(train, val)`` where each element is a tuple
        ``(names, image_paths, label_paths)``.
    """
    root = "../learning/"
    extension = ".png"

    all_data = TLDataset(
        root, imagery_folder, masks_folder, masks_glob,
        subset="Train", fraction=0, seed=1, # We want to select every image
    )

    # NOTE(review): an index file that yields an empty set falls back to
    # selecting every image, same as passing no file at all - confirm intended.
    indices = list(readSetFromFile(indices_file)) if indices_file else None

    # Strip only the trailing extension; a blanket replace would also remove
    # ".png" from the middle of a name and corrupt the paths rebuilt below.
    all_names = np.array([
        name[:-len(extension)] if name.endswith(extension) else name
        for name in all_data.names
    ])
    selected_names = all_names[indices] if indices else all_names

    # Set membership keeps the filter linear in the number of names.
    excluded = frozenset(exclude)
    names = [name for name in selected_names if name not in excluded]
    if exclude: print(f"Excluded {len(selected_names) - len(names)} images")

    cutoff = int(len(names) * fraction)
    train_names = names[cutoff:]
    val_names = names[:cutoff]
    suffix = masks_glob.replace("*", "")

    image_paths_train = [f"{root}{imagery_folder}{name}{extension}" for name in train_names]
    label_paths_train = [f"{root}{masks_folder}{name}{suffix}" for name in train_names]

    image_paths_val = [f"{root}{imagery_folder}{name}{extension}" for name in val_names]
    label_paths_val = [f"{root}{masks_folder}{name}{suffix}" for name in val_names]
    print(f"Located {len(train_names)} train images")
    if (val_names): print(f"Located {len(val_names)} validation images")

    return (train_names, image_paths_train, label_paths_train), (val_names, image_paths_val, label_paths_val)

def createDataset(names, image_paths, label_paths=None):
    """Assemble a ``Dataset`` from parallel lists of names and file paths.

    The ``"image"`` column (and the ``"label"`` column, when ``label_paths``
    is non-empty) is cast to the ``Image`` feature; ``"name"`` is stored as
    given. When ``label_paths`` is ``None`` or empty, no ``"label"`` column
    is created.
    """
    # Build the column mapping in the order image, [label], name.
    columns = {"image": image_paths}
    if label_paths:
        columns["label"] = label_paths
    columns["name"] = names

    dataset = Dataset.from_dict(columns)

    # Every column except "name" holds file paths to be cast to Image.
    path_columns = [column for column in columns if column != "name"]
    for column in path_columns:
        dataset = dataset.cast_column(column, Image())

    return dataset

[stodoran/elwha-segmentation-manual](https://huggingface.co/datasets/stodoran/elwha-segmentation-manual)

In [3]:
# Manually corrected masks: fraction=0.45 of the indexed images form the
# validation split, the remainder the train split.
train_paths, val_paths = loadImages(
    masks_glob="*_corrected.png",
    indices_file="../data/train_images.txt",
    fraction=0.45,
)
train_dataset_manual = createDataset(*train_paths)
validation_dataset_manual = createDataset(*val_paths)

dataset = DatasetDict(
    {"train": train_dataset_manual, "validation": validation_dataset_manual}
)
# Pushing assumes you have run the huggingface-cli login command in the terminal/notebook
# dataset.push_to_hub("stodoran/elwha-segmentation-manual")

Found and loaded 4382 images with glob *_corrected.png.
Subset of 4382 ground truth segmentation masks marked for Train.
Located 125 train images
Located 101 validation images


The extra image names added to the exclusion list below are additional datapoints used for validation experiments; they must not be trained on.

In [4]:
# Validation image names plus two extra held-out images.
exclude_names = [
    *val_paths[0],
    "Elwha_MR_20160714_062_014",
    "Elwha_MR_20170922_062_014",
]

[stodoran/elwha-segmentation-v1](https://huggingface.co/datasets/stodoran/elwha-segmentation-v1)

In [5]:
# fraction=0 leaves the validation split empty, so it is discarded; the
# manual validation set is reused instead.
train_paths_v1, _ = loadImages(
    masks_glob="*_binary.png",
    indices_file="../data/useful_images.txt",
    fraction=0,
    exclude=exclude_names,
)
train_dataset_v1 = createDataset(*train_paths_v1)

dataset = DatasetDict(
    {"train": train_dataset_v1, "validation": validation_dataset_manual}
)
# dataset.push_to_hub("stodoran/elwha-segmentation-v1")

Found and loaded 4384 images with glob *_binary.png.
Subset of 4384 ground truth segmentation masks marked for Train.
Excluded 69 images
Located 1149 train images


[stodoran/elwha-segmentation-predict](https://huggingface.co/datasets/stodoran/elwha-segmentation-predict)

In [8]:
# The imagery folder is passed as the masks folder too, so every image is
# listed regardless of masks (presumably because prediction data has no labels).
train_paths_pred, _ = loadImages(
    masks_glob="*.png",
    imagery_folder="imagery/",
    masks_folder="imagery/",
    fraction=0,
)
# Keep only (names, image_paths); the label paths are dropped.
train_paths_pred = train_paths_pred[:2]
train_dataset_pred = createDataset(*train_paths_pred)

dataset = DatasetDict({"data": train_dataset_pred})
# dataset.push_to_hub("stodoran/elwha-segmentation-predict")

Found and loaded 4382 images with glob *.png.
Subset of 4382 ground truth segmentation masks marked for Train.
Located 4382 train images


Uploading the dataset shards:   0%|          | 0/9 [00:00<?, ?it/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/486 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/stodoran/elwha-segmentation-predict/commit/6709ae1136a4109c9eb40806e8de78405549d70f', commit_message='Upload dataset', commit_description='', oid='6709ae1136a4109c9eb40806e8de78405549d70f', pr_url=None, pr_revision=None, pr_num=None)