# This notebook was used to:
- Download the LostAndFound dataset using TensorFlow Datasets (TFDS).
- Convert to PyTorch-compatible datatypes
- Compress and save -> Export to Google Drive.

### Imports

In [9]:
import os
import torch
import tarfile
import tensorflow_datasets as tfds
from tqdm import tqdm

### Utils

In [2]:
def save_tf_dataset_as_pt(tf_dataset, output_dir, binary_mask=True):
    """Save every sample of a TF dataset as an individual PyTorch ``.pt`` file.

    Sample number ``i`` (zero-based) is written to ``<output_dir>/<i:05d>.pt``
    as a dict ``{'image': Tensor, 'mask': Tensor}`` via ``torch.save``.

    Args:
        tf_dataset: Iterable of samples. Each sample must be a mapping with
            ``'image_left'`` and ``'segmentation_label'`` entries that expose
            a ``.numpy()`` method returning a NumPy array.
        output_dir: Directory the ``.pt`` files are written to; created if it
            does not exist.
        binary_mask: Currently unused; kept so existing callers keep working.
            NOTE(review): presumably meant to binarize the mask -- confirm the
            intended label mapping before implementing.

    Returns:
        None. A one-line summary with the number of saved samples is printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Tracked separately from the loop index so that an empty dataset reports
    # "0 samples" instead of failing on an unbound loop variable.
    num_saved = 0

    for i, item in enumerate(tqdm(tf_dataset, desc=f"Saving to {output_dir}")):
        # Extract raw arrays from TF
        image = item['image_left'].numpy()         # expected (H, W, 3), e.g. (1024, 2048, 3)
        mask = item['segmentation_label'].numpy()  # expected (H, W, 1), e.g. (1024, 2048, 1)

        # Drop the trailing singleton channel axis. The ndim guard keeps a
        # genuinely 2-D mask whose width happens to be 1 from being collapsed.
        if mask.ndim == 3 and mask.shape[-1] == 1:
            mask = mask[..., 0]  # -> (H, W)

        image_tensor = torch.from_numpy(image)  # keeps the array's dtype and layout
        mask_tensor = torch.from_numpy(mask)

        # Save image and mask together
        save_path = os.path.join(output_dir, f"{i:05d}.pt")
        torch.save({
            'image': image_tensor,
            'mask': mask_tensor
        }, save_path)
        num_saved = i + 1

    print(f"Saved {num_saved} samples to {output_dir}")


def tar_folder(folder_path, tar_name):
    """Pack a folder into a gzip-compressed tar archive.

    The folder is stored under its own name at the root of the archive, so
    extracting ``laf_train.tar.gz`` recreates a ``laf_train/`` directory.

    Args:
        folder_path: Path of the directory to archive. A trailing path
            separator (e.g. ``"./laf_train/"``) is tolerated.
        tar_name: Path of the ``.tar.gz`` file to create (overwritten if it
            already exists, since the archive is opened in write mode).

    Returns:
        None. A confirmation line is printed once the archive is closed.
    """
    # Normalize first: basename() of a path ending in a separator is '',
    # which would drop the top-level folder name from the archive.
    archive_root = os.path.basename(os.path.normpath(folder_path))
    with tarfile.open(tar_name, "w:gz") as tar:
        tar.add(folder_path, arcname=archive_root)
    print(f"Created tar.gz: {tar_name}")

### Load and convert data

In [None]:
# Load the LostAndFound splits through TFDS. With as_supervised=False each
# sample is a feature dictionary (indexed by key when saving), and
# with_info=True additionally returns the dataset metadata object.
dataset, info = tfds.load(
    "lost_and_found",
    as_supervised=False,
    with_info=True,
)

In [10]:
# Save to disk: one folder of .pt files per split (train first, then test).
for split_name, split_dir in (("train", "./laf_train"), ("test", "./laf_test")):
    save_tf_dataset_as_pt(dataset[split_name], output_dir=split_dir)

# Compress: one .tar.gz per folder (test first, then train).
for split_dir, archive_name in (
    ('./laf_test', 'laf_test.tar.gz'),
    ('./laf_train', 'laf_train.tar.gz'),
):
    tar_folder(split_dir, archive_name)

In [None]:
# Export step: mount Google Drive and copy both archives into it.
# Depends on the `google.colab` package, so this cell is expected to run
# inside Google Colab only.
from google.colab import drive
drive.mount('/content/drive')  # Drive becomes available under /content/drive
# Lines starting with `!` are IPython shell escapes (not plain Python): they
# copy the archives created earlier in the notebook into the Drive root folder.
!cp laf_train.tar.gz /content/drive/MyDrive/
!cp laf_test.tar.gz /content/drive/MyDrive/