In [18]:
from datasets import Dataset, DatasetDict, Image
from util import TLDataset, readSetFromFile
from tqdm import tqdm

In [19]:
def loadImages(masks_glob, indices_file, fraction=0.1):
    """Collect image and mask file paths for a train/validation split.

    Sample names are looked up in a ``TLDataset`` built over ``../learning/``
    using the indices read from ``indices_file``. The first
    ``int(len(names) * fraction)`` names (in the iteration order of the loaded
    indices) form the validation split; the remaining names form the train
    split.

    Args:
        masks_glob: Glob pattern for mask files, e.g. ``"*_binary.png"``. With
            the ``*`` removed it is also used as the filename suffix appended
            to each sample name when building mask paths.
        indices_file: Path handed to ``readSetFromFile``; the values it yields
            are used to index the ``TLDataset``.
        fraction: Share of the selected samples assigned to validation. Must
            lie in ``[0, 1]``.

    Returns:
        Two tuples ``(names, image_paths, label_paths)``: first the train
        split, then the validation split.

    Raises:
        ValueError: If ``fraction`` is outside ``[0, 1]``.
    """
    # Reject bad fractions up front: a negative value would turn the slices
    # below into "last k items" and silently swap the split sizes, while a
    # value above 1 would leave the train split empty.
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction!r}")

    all_data = TLDataset(
        "../learning/", "imagery/", "masks/", masks_glob,
        subset="Train", fraction=0, seed=1,  # We want to select every image
    )

    indices = readSetFromFile(indices_file)
    # NOTE(review): if readSetFromFile returns an unordered set, which samples
    # land in validation depends on its iteration order - confirm the order is
    # stable if reproducible splits matter.
    names = [all_data[index]["name"].replace(".png", "") for index in tqdm(indices)]

    cutoff = int(len(names) * fraction)
    val_names, train_names = names[:cutoff], names[cutoff:]

    # The glob minus its wildcard is the mask filename suffix, e.g. "_binary.png".
    suffix = masks_glob.replace("*", "")

    image_paths_train, label_paths_train = _buildPaths(train_names, suffix)
    image_paths_val, label_paths_val = _buildPaths(val_names, suffix)

    print(f"Located {len(train_names)} train images")
    print(f"Located {len(val_names)} validation images")

    return (train_names, image_paths_train, label_paths_train), (val_names, image_paths_val, label_paths_val)


def _buildPaths(names, suffix):
    """Return ``(image_paths, label_paths)`` for the given sample names."""
    image_paths = [f"../learning/imagery/{name}.png" for name in names]
    label_paths = [f"../learning/masks/{name}{suffix}" for name in names]
    return image_paths, label_paths

def createDataset(names, image_paths, label_paths):
    """Assemble a ``Dataset`` with ``image``, ``label`` and ``name`` columns.

    ``image_paths`` and ``label_paths`` populate the ``"image"`` and
    ``"label"`` columns, both of which are then cast to the ``Image`` feature.
    ``names`` is stored unchanged in the ``"name"`` column.
    """
    columns = {"image": image_paths, "label": label_paths, "name": names}
    return (
        Dataset.from_dict(columns)
        .cast_column("image", Image())
        .cast_column("label", Image())
    )

[stodoran/elwha-segmentation-v1](https://huggingface.co/datasets/stodoran/elwha-segmentation-v1)

In [20]:
# Build the train/validation splits for the "*_binary.png" masks, restricted to
# the indices listed in ../data/useful_images.txt. loadImages' default
# fraction=0.1 decides how many samples go to validation.
train_paths, val_paths = loadImages("*_binary.png", "../data/useful_images.txt")
# Each *_paths value is a (names, image_paths, label_paths) tuple, unpacked
# positionally into createDataset.
train_dataset = createDataset(*train_paths)
validation_dataset = createDataset(*val_paths)

# Bundle both splits under the "train" and "validation" keys.
dataset = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
})

# This function assumes you have run the huggingface-cli login command in a terminal/notebook
dataset.push_to_hub("stodoran/elwha-segmentation-v1")

Found and loaded 4384 images with glob *_binary.png.
Subset of 4384 ground truth segmentation masks marked for Train.


100%|██████████| 1218/1218 [00:33<00:00, 36.36it/s]


Located 1097 train images
Located 121 validation images


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/366 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/366 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/365 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/444 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/stodoran/elwha-segmentation-v1/commit/bdadc0d37acac852884ec84a0bdd885173b306ae', commit_message='Upload dataset', commit_description='', oid='bdadc0d37acac852884ec84a0bdd885173b306ae', pr_url=None, pr_revision=None, pr_num=None)

Manually created training dataset:

[stodoran/elwha-segmentation](https://huggingface.co/datasets/stodoran/elwha-segmentation/tree/main)

In [None]:
# Same pipeline as the "*_binary.png" cell, but using the "*_corrected.png"
# masks and the indices listed in ../data/train_images.txt.
# Note: this reassigns train_paths, val_paths, train_dataset,
# validation_dataset and dataset.
train_paths, val_paths = loadImages("*_corrected.png", "../data/train_images.txt")
train_dataset = createDataset(*train_paths)
validation_dataset = createDataset(*val_paths)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
})
# Like the other push_to_hub call in this notebook, this presumably needs a
# prior huggingface-cli login.
dataset.push_to_hub("stodoran/elwha-segmentation")

In [None]:
# NOTE: Legacy inline version of the "*_corrected.png" pipeline, kept for
# reference only; the same steps are now performed by loadImages().
# It will not run as-is if uncommented: the createDataset calls below pass only
# (image_paths, label_paths), while createDataset takes
# (names, image_paths, label_paths).

# all_data = TLDataset(
#     "../learning/", "imagery/", "masks/", "*_corrected.png", 
#     subset="Train", fraction=0, seed=1, # We want to select every image
# )

# indices = readSetFromFile("../data/train_images.txt")
# names = [all_data[index]["name"].replace(".png", "") for index in indices]

# cutoff = int(len(names) / 10) # fraction=0.1
# train_names = names[cutoff:]
# val_names = names[:cutoff]

# image_paths_train = [f"../learning/imagery/{name}.png" for name in train_names]
# label_paths_train = [f"../learning/masks/{name}_corrected.png" for name in train_names]

# image_paths_validation = [f"../learning/imagery/{name}.png" for name in val_names]
# label_paths_validation = [f"../learning/masks/{name}_corrected.png" for name in val_names]

# print(f"Tiny dataset train: {len(train_names)} images")
# print(f"Tiny dataset validation: {len(val_names)} images")

# train_dataset = createDataset(image_paths_train, label_paths_train)
# validation_dataset = createDataset(image_paths_validation, label_paths_validation)

# dataset = DatasetDict({
#     "train": train_dataset,
#     "validation": validation_dataset,
# })

# # This function assumes you have run the huggingface-cli login command in a terminal/notebook
# dataset.push_to_hub("stodoran/elwha-segmentation")