In [1]:
from datasets import Dataset, DatasetDict, Image
from util import TLDataset, readSetFromFile
import numpy as np

In [2]:
def loadImages(masks_glob, indices_file, fraction=0.1, exclude=None):
    """Locate image/mask path pairs and split them into train and validation groups.

    Args:
        masks_glob: Glob pattern for the mask files (e.g. "*_corrected.png"). It is
            forwarded to TLDataset, and with every "*" removed it is also used as
            the suffix appended to an image name to build its mask path.
        indices_file: Path handed to readSetFromFile; the values read are used as
            positional indices into the names reported by TLDataset.
        fraction: Share of the selected names assigned to validation. The first
            int(len(names) * fraction) names become validation, the rest train.
        exclude: Optional collection of image names to drop before splitting.
            None (the default) excludes nothing.

    Returns:
        Two tuples of (names, image_paths, label_paths): train first, then validation.
    """
    all_data = TLDataset(
        "../learning/", "imagery/", "masks/", masks_glob,
        subset="Train", fraction=0, seed=1, # We want to select every image
    )

    # NOTE(review): the split below depends on the order of `indices`, which is
    # whatever iteration order readSetFromFile's result has — confirm it is stable.
    indices = list(readSetFromFile(indices_file))
    all_names = np.array([name.replace(".png", "") for name in all_data.names])
    selected_names = all_names[indices]

    # A set gives constant-time membership checks; None replaces the former
    # mutable default argument.
    excluded = set(exclude) if exclude is not None else set()
    names = [name for name in selected_names if name not in excluded]
    if excluded: print(f"Excluded {len(selected_names) - len(names)} images")

    # The leading `cutoff` names form the validation group; everything after is train.
    cutoff = int(len(names) * fraction)
    train_names = names[cutoff:]
    val_names = names[:cutoff]
    suffix = masks_glob.replace("*", "")

    image_paths_train = [f"../learning/imagery/{name}.png" for name in train_names]
    label_paths_train = [f"../learning/masks/{name}{suffix}" for name in train_names]

    image_paths_val = [f"../learning/imagery/{name}.png" for name in val_names]
    label_paths_val = [f"../learning/masks/{name}{suffix}" for name in val_names]
    print(f"Located {len(train_names)} train images")
    if (val_names): print(f"Located {len(val_names)} validation images")

    return (train_names, image_paths_train, label_paths_train), (val_names, image_paths_val, label_paths_val)

def createDataset(names, image_paths, label_paths):
    """Build a Dataset with "image", "label" and "name" columns.

    The "image" and "label" columns are created from the given file paths and
    then cast to the Image feature; "name" keeps the values passed in `names`.
    """
    columns = {"image": image_paths, "label": label_paths, "name": names}
    result = Dataset.from_dict(columns)

    # Cast both path columns to the Image feature, in the same order as before.
    for column in ("image", "label"):
        result = result.cast_column(column, Image())

    return result

[stodoran/elwha-segmentation-manual](https://huggingface.co/datasets/stodoran/elwha-segmentation-manual)

In [3]:
# Manually corrected masks: the first 45% of the selected names are held out
# for validation, the remainder is used for training.
train_paths, val_paths = loadImages(
    "*_corrected.png",
    "../data/train_images.txt",
    fraction=0.45,
)

train_dataset_manual = createDataset(*train_paths)
validation_dataset_manual = createDataset(*val_paths)

dataset = DatasetDict(
    {"train": train_dataset_manual, "validation": validation_dataset_manual}
)

# Upload both splits to the Hugging Face Hub repository.
dataset.push_to_hub("stodoran/elwha-segmentation-manual")

Found and loaded 4382 images with glob *_corrected.png.
Subset of 4382 ground truth segmentation masks marked for Train.
Located 125 train images
Located 101 validation images


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/469 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/stodoran/elwha-segmentation-manual/commit/aaf0d1241aca825d7a51abe0163422573b85b064', commit_message='Upload dataset', commit_description='', oid='aaf0d1241aca825d7a51abe0163422573b85b064', pr_url=None, pr_revision=None, pr_num=None)

The two extra image names added to the exclusion list below are additional datapoints used for validation experiments, so they must not be trained on.

In [4]:
# Every manual-validation image name, plus two names that must also be kept out of training.
exclude_names = [
    *val_paths[0],
    "Elwha_MR_20160714_062_014",
    "Elwha_MR_20170922_062_014",
]

[stodoran/elwha-segmentation-v1](https://huggingface.co/datasets/stodoran/elwha-segmentation-v1)

In [5]:
# Binary masks: fraction=0 puts every remaining image into the train split,
# after dropping the names listed in exclude_names.
train_paths_v1, _ = loadImages(
    "*_binary.png",
    "../data/useful_images.txt",
    fraction=0,
    exclude=exclude_names,
)
train_dataset_v1 = createDataset(*train_paths_v1)

# The validation split is the manually corrected one built earlier.
dataset = DatasetDict(
    {"train": train_dataset_v1, "validation": validation_dataset_manual}
)

# push_to_hub assumes you have already run the huggingface-cli login command in a terminal/notebook
dataset.push_to_hub("stodoran/elwha-segmentation-v1")

Found and loaded 4384 images with glob *_binary.png.
Subset of 4384 ground truth segmentation masks marked for Train.
Excluded 69 images
Located 1149 train images


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/357 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/stodoran/elwha-segmentation-v1/commit/663917582116d9db71121180e927b485db30b3dd', commit_message='Upload dataset', commit_description='', oid='663917582116d9db71121180e927b485db30b3dd', pr_url=None, pr_revision=None, pr_num=None)