In [1]:
import tensorflow as tf

In [4]:
# Bucket prefix holding the TFRecord shards; prepare_dataset globs
# "<GCS_PATH>/<split>-*" underneath it.
GCS_PATH = "gs://sidewalks-tfx-hf/sidewalks-tfrecords"
# Default number of examples per batch (used as prepare_dataset's default).
BATCH_SIZE = 4
# Let tf.data tune parallelism / prefetch buffer sizes at runtime.
AUTO = tf.data.AUTOTUNE

In [8]:
def parse_tfr(proto):
    """Decode one serialized example into a dense image/label pair.

    Each record stores the image and the label as flat variable-length
    float32 lists, each accompanied by an int64 list holding the shape the
    flat values should be reshaped to.

    Args:
        proto: A scalar string tensor holding one serialized example.

    Returns:
        A dict with two entries:
            "pixel_values": the image values reshaped to the stored image shape.
            "label": the label values reshaped to the stored label shape.
    """
    float_values = tf.io.VarLenFeature(tf.float32)
    int_values = tf.io.VarLenFeature(tf.int64)
    schema = {
        "image": float_values,
        "image_shape": int_values,
        "label": float_values,
        "label_shape": int_values,
    }
    parsed = tf.io.parse_single_example(proto, schema)

    # VarLenFeature parses to SparseTensors; densify all four in one pass.
    dense = {key: tf.sparse.to_dense(sparse) for key, sparse in parsed.items()}

    return {
        "pixel_values": tf.reshape(dense["image"], dense["image_shape"]),
        "label": tf.reshape(dense["label"], dense["label_shape"]),
    }


def prepare_dataset(split="train", batch_size=BATCH_SIZE):
    """Build a batched, prefetched ``tf.data`` pipeline for one split.

    Args:
        split: Which shards to read; either "train" or "val". Files are
            matched with the pattern "<GCS_PATH>/<split>-*".
        batch_size: Number of parsed examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding dicts with "pixel_values" and "label"
        entries (see ``parse_tfr``), batched along a new leading axis.

    Raises:
        ValueError: If ``split`` is neither "train" nor "val".
        FileNotFoundError: If no files match the split's glob pattern.
    """
    if split not in ["train", "val"]:
        raise ValueError(
            "Invalid split provided. Supports splits are: `train` and `val`."
        )

    pattern = f"{GCS_PATH}/{split}-*"
    # glob already returns a list of matching paths; no copy is needed.
    filenames = tf.io.gfile.glob(pattern)
    if not filenames:
        # Fail loudly here instead of handing an empty file list to the
        # reader, where a wrong path/prefix would be much harder to diagnose.
        raise FileNotFoundError(
            f"No TFRecord files found matching `{pattern}`."
        )

    dataset = tf.data.TFRecordDataset(
        filenames,
        num_parallel_reads=AUTO,
    ).map(parse_tfr, num_parallel_calls=AUTO)

    if split == "train":
        # NOTE(review): the buffer holds only two batches' worth of examples,
        # so this is a local shuffle; consider a larger buffer if stronger
        # randomization is needed.
        dataset = dataset.shuffle(batch_size * 2)

    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(AUTO)
    return dataset

In [9]:
# Build both input pipelines (train first, then val) with the default batch size.
train_dataset, val_dataset = (
    prepare_dataset(split=split_name) for split_name in ("train", "val")
)

In [11]:
# Sanity check: pull a single training batch and show the image / label shapes.
for batch in train_dataset.take(1):
    print(*(batch[key].shape for key in ("pixel_values", "label")))

(4, 3, 1080, 1920) (4, 1080, 1920)


In [13]:
# Sanity check: pull a single validation batch and show the image / label shapes.
for batch in val_dataset.take(1):
    print(*(batch[key].shape for key in ("pixel_values", "label")))

(4, 3, 1080, 1920) (4, 1080, 1920)
