In this notebook, we convert the raw datasets to Parquet format so that they load faster during training and evaluation.

The raw format of released datasets is as follows:
```python
# train set
/train/real/...
/train/fake/...
/train/masks/...
# valid set
/valid/real/...
/valid/fake/...
/valid/masks/...
```

In [1]:
import os
from datasets import Dataset, DatasetDict
from datasets import Features, Image
from typing import List


def load_images_from_dir(directory: str) -> List[str]:
    """Return the paths of the image files found directly in ``directory``.

    Only the immediate entries of ``directory`` are examined. An entry is
    kept when its name ends with ``jpg``, ``jpeg`` or ``png`` (the match is
    case-sensitive).

    Args:
        directory: Directory to scan.

    Returns:
        The matching entries joined onto ``directory``, sorted
        lexicographically.
    """
    # os.listdir yields entries in arbitrary order; sorting makes the result
    # reproducible, which matters to callers that pair lists by position.
    return sorted(
        os.path.join(directory, fname)
        for fname in os.listdir(directory)
        if fname.endswith(("jpg", "jpeg", "png"))
    )


def create_split(root_dir: str, split: str) -> Dataset:
    """Build the dataset for one split from its ``fake``/``masks``/``real`` folders.

    Fake images come first, each paired by position with a mask; real images
    follow and get ``None`` as their mask.

    Args:
        root_dir: Directory that contains the split folders.
        split: Name of the split sub-directory (e.g. ``"train"``).

    Returns:
        A ``Dataset`` with an ``image`` and a ``mask`` column, both declared
        as ``Image`` features.

    Raises:
        ValueError: If the number of fake images differs from the number of
            masks.
    """
    fake_dir = os.path.join(root_dir, split, "fake")
    masks_dir = os.path.join(root_dir, split, "masks")
    real_dir = os.path.join(root_dir, split, "real")

    # Directory listings come back in arbitrary order, so sort every list:
    # the i-th fake image is paired with the i-th mask below.
    # NOTE(review): assumes a mask sorts into the same position as its fake
    # image (e.g. shared file stem) — confirm against the dataset's naming.
    fake_images = sorted(load_images_from_dir(fake_dir))
    mask_images = sorted(load_images_from_dir(masks_dir))
    real_images = sorted(load_images_from_dir(real_dir))

    # Explicit check instead of ``assert``, which is stripped under ``-O``.
    if len(fake_images) != len(mask_images):
        raise ValueError(
            f"Split {split!r}: found {len(fake_images)} fake images but "
            f"{len(mask_images)} masks; every fake image needs exactly one mask."
        )

    return Dataset.from_dict(
        {
            "image": fake_images + real_images,
            # Real images have no mask.
            "mask": mask_images + [None] * len(real_images),
        },
        features=Features(
            {"image": Image(), "mask": Image()}
        ),
    )


def create_dataset(root_dir: str) -> DatasetDict:
    """Assemble the ``train`` and ``valid`` splits found under ``root_dir``.

    Args:
        root_dir: Directory that contains the ``train`` and ``valid`` folders.

    Returns:
        A ``DatasetDict`` keyed by ``"train"`` and ``"valid"``.
    """
    split_names = ("train", "valid")
    return DatasetDict(
        {name: create_split(root_dir, split=name) for name in split_names}
    )

# Dataset root; create_split reads <root_dir>/<split>/{fake,masks,real}.
root_dir = "/gemini/space/lye/track1"

  from .autonotebook import tqdm as notebook_tqdm


We merge `real/` and `fake/` into a single `image` column for simplicity. An image is real if it has no corresponding mask (its `mask` entry is `None`).

In [None]:
# Build the train and valid splits; the bare expression displays the result.
dataset = create_dataset(root_dir)
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'mask'],
        num_rows: 954931
    })
    valid: Dataset({
        features: ['image', 'mask'],
        num_rows: 238733
    })
})

Then save the processed datasets as Parquet files.

In [None]:
trainset = dataset["train"]
validset = dataset["valid"]
# Write each split to its own parquet file directly under root_dir.
trainset.to_parquet(os.path.join(root_dir, "train.parquet"))
validset.to_parquet(os.path.join(root_dir, "valid.parquet"))

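Load the processed datasets back from the Parquet files for any downstream use. Note that `load_dataset` places the rows of a single Parquet file under a split named `train`, as the output below shows.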

In [4]:
from datasets import load_dataset

# Load from the same ``root_dir`` the parquet files were written to, rather
# than a separately hard-coded path.
# With a bare ``data_files`` path the rows land in a split named "train", so
# ``validset`` is a DatasetDict; use ``validset["train"]`` to reach the rows.
validset = load_dataset("parquet", data_files=os.path.join(root_dir, "valid.parquet"))
validset

Generating train split: 238733 examples [00:00, 596829.16 examples/s]


DatasetDict({
    train: Dataset({
        features: ['image', 'mask'],
        num_rows: 238733
    })
})

In [None]:
import timm
import sys

# Put ./src at the front of the import path before importing the project module.
sys.path.insert(0, "./src")
# Imported for its side effects only (the name is not used below) —
# presumably registers "vit_pe_core_large_patch14_336" with timm; TODO confirm.
import src.models.pe

model_timm = timm.create_model(
    "vit_pe_core_large_patch14_336",
    pretrained=True,
    # Overlay the pretrained config with a local safetensors checkpoint file.
    pretrained_cfg_overlay=dict(file="/gemini/code/loupe/pretrained_weights/pe_timm/model.safetensors"),
)
# Move the model to the CUDA device.
model_timm = model_timm.cuda()

TypeError: PretrainedCfg.__init__() got an unexpected keyword argument 'proj_dim'

In [15]:
# Display the default configuration attached to the created model.
model_timm.default_cfg

{'file': '/gemini/code/loupe/pretrained_weights/pe_timm/model.safetensors',
 'hf_hub_id': 'timm/vit_pe_core_large_patch14_336',
 'architecture': 'vit_pe_core_large_patch14_336',
 'custom_load': False,
 'input_size': (3, 336, 336),
 'fixed_input_size': True,
 'interpolation': 'bilinear',
 'crop_pct': 0.875,
 'crop_mode': 'center',
 'mean': (0.5, 0.5, 0.5),
 'std': (0.5, 0.5, 0.5),
 'num_classes': 0,
 'pool_size': None,
 'first_conv': None,
 'classifier': None,
 'license': 'apache-2.0'}