In [None]:
!pip install -q albumentations==1.4.7 pillow tqdm datasets huggingface_hub

from huggingface_hub import login
import os, shutil, uuid, tempfile, json, random
from pathlib import Path


from google.colab import userdata
# Read the Hugging Face access token from Colab's `userdata` store under the
# key 'HF_TOKEN' so the secret is never written into the notebook itself.
HF_TOKEN = userdata.get('HF_TOKEN')

In [None]:
# ---- Notebook configuration -------------------------------------------------
# Hub repository holding the un-augmented source images.
RAW_DATASET_ID = "eceunal/bug-bite-images-hf"

# NOTE(review): this id is not referenced by any cell visible in this notebook;
# the upload cell pushes to a hard-coded repo id instead. Confirm which
# destination is intended before relying on this constant.
AUG_DATASET_ID = "eceunal/insect-bite-aug-v2"

# Local folder that receives the <split>/<label>/<file>.jpg tree.
LOCAL_WORKDIR = Path("/content/augmented_dataset")

# Number of augmented copies written for every source image.
N_AUG_PER_IMG = 5

# Seed passed to the random number generator.
SEED = 42

In [None]:
# Authenticate this session against the Hugging Face Hub.
login(token=HF_TOKEN)

# Seed Python's global `random` module, which the train/validation split
# draws from further down.
random.seed(SEED)

In [None]:
# Start from an empty output folder: drop whatever a previous run left behind,
# then recreate the directory (including any missing parents).
if LOCAL_WORKDIR.exists():
    shutil.rmtree(LOCAL_WORKDIR)

LOCAL_WORKDIR.mkdir(parents=True)

In [None]:
from datasets import load_dataset

# Load the train, test and validation splits of the raw dataset as a single
# concatenated dataset; a new split is assigned later in the notebook.
raw_ds = load_dataset(
    RAW_DATASET_ID,
    split="train+test+validation",
)

# Quick sanity check of what was loaded.
print("Raw dataset rows:", len(raw_ds))
print("Example keys:", raw_ds.column_names)

In [None]:
import albumentations as A
from albumentations.pytorch import ToTensorV2
from PIL import Image
import numpy as np

In [None]:
from pathlib import Path, PurePosixPath
import uuid, random, numpy as np
from PIL import Image
from tqdm.auto import tqdm

In [None]:
# Augmentation pipeline, built stage by stage and then chained in order.
#
# Per the albumentations documentation for `OneOf`, the `p` values of the
# child transforms are normalised and used as selection weights; the chosen
# child is then always applied. Only the `p` on the `OneOf` itself decides
# whether the group runs at all.

# Stage 1 - exactly one geometric flip/rotation on every call (group p=1.0).
_flip_or_rotate = A.OneOf(
    [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.Rotate(limit=25, p=0.7),
    ],
    p=1.0,
)

# Stage 2 - random scale & crop, always applied, output is 224x224.
_scale_crop = A.RandomResizedCrop(
    height=224,
    width=224,
    scale=(0.8, 1.0),
    ratio=(0.9, 1.1),
    p=1.0,
)

# Stage 3 - photometric / mild geometric noise, each with its own probability.
_color_jitter = A.ColorJitter(
    brightness=0.2, contrast=0.2, saturation=0.25, hue=0.05, p=0.7
)
_perspective = A.Perspective(scale=(0.05, 0.1), p=0.3)
_blur = A.GaussianBlur(blur_limit=3, p=0.2)

# Stage 4 - half of the time, apply either an elastic warp or a single
# black (fill_value=0) cut-out of at most 56x56 pixels.
_warp_or_cutout = A.OneOf(
    [
        A.ElasticTransform(alpha=40, sigma=50, alpha_affine=20, p=0.9),
        A.CoarseDropout(
            max_holes=1,
            max_height=56,
            max_width=56,
            min_holes=1,
            fill_value=0,
            p=1.0,
        ),
    ],
    p=0.5,
)

augment = A.Compose(
    [
        _flip_or_rotate,
        _scale_crop,
        _color_jitter,
        _perspective,
        _blur,
        _warp_or_cutout,
    ]
)

In [None]:
# Materialise the augmented dataset on disk as an `imagefolder` tree:
#
#     LOCAL_WORKDIR/<split>/<label>/<uuid>.jpg
#
# Each source image is assigned to one split, saved once unmodified, and then
# saved N_AUG_PER_IMG more times after passing through `augment`. An image and
# all of its augmented copies always land in the same split directory.
#
# LOCAL_WORKDIR is taken from the configuration cell; it is deliberately not
# re-declared here so the directory that gets reset and the directory that gets
# written can never drift apart.

# Maps the integer class id stored in each row to its class name.
int2str = raw_ds.features["label"].int2str
train_ratio   = 0.80

# Dedicated RNG for split assignment. The global `random` stream is shared
# with other code and is only seeded in a different cell, so using it here
# made the split change whenever this cell was re-run. A private generator
# gives every row the same split on every run.
# NOTE: files are named with uuid4, so re-running this cell without resetting
# LOCAL_WORKDIR first adds a second copy of every image.
split_rng = random.Random(SEED)

for row in tqdm(raw_ds):
    pil_img = row["image"] if isinstance(row["image"], Image.Image) \
              else Image.open(row["image"]["path"])

    # JPEG cannot store alpha or palette modes (Pillow raises on RGBA / P / LA),
    # and the pipeline's colour transforms expect 3-channel input, so normalise
    # every image to RGB before saving or augmenting.
    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")

    # Class name becomes the sub-directory name; "0" if the row has no label.
    label_txt = int2str(row["label"]) if "label" in row else "0"

    # Decide the split once per source image.
    split = "train" if split_rng.random() < train_ratio else "validation"
    tgt_dir = LOCAL_WORKDIR / split / label_txt
    tgt_dir.mkdir(parents=True, exist_ok=True)

    # Keep the unmodified image alongside its augmented variants.
    img_name = f"{uuid.uuid4()}.jpg"
    pil_img.save(tgt_dir / img_name, "JPEG", quality=95)

    # Convert once; each pipeline call draws fresh random parameters.
    np_img = np.array(pil_img)
    for _ in range(N_AUG_PER_IMG):
        aug_img = augment(image=np_img)["image"]
        aug_name = f"{uuid.uuid4()}.jpg"
        Image.fromarray(aug_img).save(tgt_dir / aug_name, "JPEG", quality=95)

print("Augmented set stored at:", LOCAL_WORKDIR)

In [None]:
# Re-load the on-disk tree through the `imagefolder` builder, pointing it at
# the folder the augmentation loop wrote to.
workdir_str = str(LOCAL_WORKDIR)
aug_ds = load_dataset("imagefolder", data_dir=workdir_str)
print(aug_ds)

# Upload publicly, in shards of at most 500MB.
# NOTE(review): the destination repo id is hard-coded here and differs from
# the AUG_DATASET_ID constant in the configuration cell - confirm which
# repository is the intended target.
push_options = dict(max_shard_size="500MB", private=False)
aug_ds.push_to_hub("eceunal/bug-bite-images-aug_v3", **push_options)