In [None]:
"""
- Downloads RealWaste dataset (GitHub mirror) into RAW_DIR (set in the globals below) if not present.
- Resizes images to IMG_SIZE x IMG_SIZE (global setting).
- Normalizes pixel values to [0,1].
- Creates stratified train/validation/test splits (70/15/15).
- Computes class weights to mitigate imbalance.
- Saves prepared arrays to data/realwaste_prepared.npz.
- Writes a small sample gallery to outputs/sample_images.png.

Edit the user-editable globals below to change the dataset path or image size.
"""

# -------------------------
# User-editable globals
# -------------------------
# Side length in pixels; every image is resized to IMG_SIZE x IMG_SIZE.
IMG_SIZE = 128
# Global batch size (kept here for planning; actually consumed during training).
BATCH_SIZE = 32
# Root data folder, relative to the notebook's working directory.
DATA_DIR = "../data"
# Where the raw dataset is downloaded/extracted. It is the data root itself,
# so the per-class image folders are expected directly inside DATA_DIR.
RAW_DIR = DATA_DIR
# Destination of the compressed archive holding the prepared arrays.
OUT_PREPARED = f"{DATA_DIR}/realwaste_prepared.npz"
# Zip snapshot of the GitHub mirror's master branch.
GITHUB_ZIP_URL = "https://github.com/sam-single/realwaste/archive/refs/heads/master.zip"
# -------------------------

In [58]:
import os
import zipfile
import shutil
import random
import numpy as np
from pathlib import Path
from collections import Counter
from PIL import Image, ImageOps
import matplotlib.pyplot as plt

In [59]:
# helper: download file
def download_file(url, out_path, timeout=30):
    """Download ``url`` to ``out_path`` unless that file already exists.

    The payload is streamed into ``<out_path>.part`` and renamed into place
    only after the transfer completes, so an interrupted download never leaves
    a truncated file that a later call would mistake for a finished one.

    Args:
        url: HTTP(S) address to fetch.
        out_path: Destination file path; its parent directory must exist.
        timeout: Seconds to wait on the server before giving up (forwarded to
            ``requests.get``). Without it a stalled connection hangs forever.

    Returns:
        ``out_path``, whether it was just downloaded or was already present.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status (via ``raise_for_status``).
    """
    if os.path.exists(out_path):
        print("Found existing:", out_path)
        return out_path

    # Imported lazily so a cached file can be reused without requests installed.
    import requests

    print("Downloading:", url)
    part_path = f"{out_path}.part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Atomic rename: out_path only ever appears fully written.
        os.replace(part_path, out_path)
    finally:
        # After a successful rename this is a no-op; on failure it removes
        # the partial file.
        if os.path.exists(part_path):
            os.remove(part_path)
    print("Downloaded to:", out_path)
    return out_path

In [60]:
def unzip_members(zip_path, member_prefix, extract_to):
    """Extract only the archive entries whose names start with ``member_prefix``.

    Args:
        zip_path: Path of the zip archive to read.
        member_prefix: Entry names must begin with this string to be extracted.
        extract_to: Directory the selected entries are extracted into.
    """
    with zipfile.ZipFile(zip_path, "r") as archive:
        wanted = []
        for entry_name in archive.namelist():
            if entry_name.startswith(member_prefix):
                wanted.append(entry_name)
        archive.extractall(path=extract_to, members=wanted)

In [61]:
def ensure_dataset_downloaded():
    """
    Download and extract the GitHub mirror of RealWaste into RAW_DIR.
    The repo contains a folder with images in subfolders per class.

    Does nothing when RAW_DIR already exists and holds at least one entry.
    Otherwise the archive is downloaded and unpacked inside a throw-away
    staging directory created next to RAW_DIR, and the extracted top-level
    ``realwaste*`` folder is moved to RAW_DIR. The staging directory (zip and
    leftovers) is removed afterwards, even when a step fails.

    Raises:
        RuntimeError: If the archive contains no top-level folder whose name
            starts with "realwaste".
    """
    import tempfile

    if os.path.isdir(RAW_DIR):
        # Context manager closes the directory handle promptly.
        with os.scandir(RAW_DIR) as entries:
            if any(entries):
                print("Raw dataset already present in", RAW_DIR)
                return

    # Stage beside RAW_DIR (same filesystem, so the final move is a rename)
    # instead of a hard-coded cwd-relative "data" folder that may not match
    # the configured data location.
    parent_dir = os.path.dirname(os.path.abspath(RAW_DIR))
    os.makedirs(parent_dir, exist_ok=True)
    with tempfile.TemporaryDirectory(prefix="realwaste_dl_", dir=parent_dir) as work_dir:
        tmp_zip = os.path.join(work_dir, "realwaste_master.zip")
        download_file(GITHUB_ZIP_URL, tmp_zip)

        print("Extracting archive...")
        extract_dir = os.path.join(work_dir, "extracted")
        with zipfile.ZipFile(tmp_zip, 'r') as z:
            z.extractall(extract_dir)

        # Extracting into a fresh directory means only folders from this
        # archive can match; sorting makes the pick deterministic.
        extracted_dirs = sorted(
            d for d in Path(extract_dir).iterdir()
            if d.is_dir() and d.name.startswith("realwaste")
        )
        if not extracted_dirs:
            raise RuntimeError("Could not find extracted realwaste directory.")
        extracted_root = extracted_dirs[0]

        # RAW_DIR can only be an empty directory at this point. os.rmdir
        # refuses to delete anything non-empty, unlike a recursive rmtree.
        if os.path.isdir(RAW_DIR):
            os.rmdir(RAW_DIR)
        # NOTE(review): the archive's top-level folder is moved as-is, while
        # list_image_files expects the class folders directly under RAW_DIR.
        # If the repo nests them one level deeper, point RAW_DIR at that
        # inner folder - confirm against the actual repo layout.
        shutil.move(str(extracted_root), RAW_DIR)
    print("Dataset placed at:", RAW_DIR)

In [62]:
def list_image_files(root):
    """Collect labelled image paths from a folder-per-class directory tree.

    Every immediate subdirectory of ``root`` is treated as one class; files
    directly inside it whose extension is .jpg, .jpeg or .png (matched
    case-insensitively) are taken as that class's images.

    Args:
        root: Directory containing one subdirectory per class.

    Returns:
        A ``(files, classes)`` tuple. ``files`` is a list of
        ``(image_path, class_name)`` pairs ordered by class name and then by
        file name; ``classes`` is the sorted list of subdirectory names
        (including any that contain no images).
    """
    image_exts = (".jpg", ".jpeg", ".png")
    files = []
    classes = []
    for cls in sorted(os.listdir(root)):
        cls_path = os.path.join(root, cls)
        if not os.path.isdir(cls_path):
            continue
        classes.append(cls)
        # os.listdir returns entries in arbitrary, platform-dependent order.
        # Sorting keeps the file list, and any seeded shuffle or split built
        # on it, reproducible across machines.
        for fname in sorted(os.listdir(cls_path)):
            if fname.lower().endswith(image_exts):
                files.append((os.path.join(cls_path, fname), cls))
    return files, classes

In [63]:
def load_and_resize(path, size):
    """Load an image file as a square RGB float32 array.

    Args:
        path: Image file to open with PIL.
        size: Side length, in pixels, of the square output.

    Returns:
        A ``float32`` NumPy array built from the resized RGB image; values
        are not rescaled here.
    """
    target_shape = (size, size)
    with Image.open(path) as source:
        rgb = source.convert("RGB")
        # ImageOps.fit scales and crops (it does not pad) to the target
        # shape, using Lanczos resampling for quality.
        fitted = ImageOps.fit(rgb, target_shape, Image.LANCZOS)
        return np.array(fitted, dtype=np.float32)

In [64]:
def save_sample_gallery(images, labels, out_path, ncols=6):
    """Save a grid of labelled thumbnails to ``out_path``.

    Args:
        images: Non-empty sequence of image arrays; each is cast to uint8
            before display.
        labels: Titles for the images, paired with ``images`` by position.
        out_path: Destination image file. Its parent directory is created if
            needed; a bare file name (no directory part) is also accepted.
        ncols: Maximum number of columns in the grid.

    Raises:
        ValueError: If ``images`` is empty (previously a ZeroDivisionError).
    """
    n = len(images)
    if n == 0:
        raise ValueError("save_sample_gallery needs at least one image.")
    # Clamp to [1, n] so a non-positive ncols cannot cause a zero division.
    ncols = max(1, min(ncols, n))
    nrows = (n + ncols - 1) // ncols  # ceiling division

    # squeeze=False always yields a 2-D array of axes, even for a 1x1 grid.
    fig, ax = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows*2), squeeze=False)
    try:
        ax = ax.reshape(-1)
        # Hide every axis up front so unused trailing cells stay blank.
        for cell in ax:
            cell.axis("off")
        for cell, img, lbl in zip(ax, images, labels):
            cell.imshow(np.asarray(img).astype(np.uint8))
            cell.set_title(lbl, fontsize=8)
        fig.tight_layout()

        # os.makedirs("") raises, so only create a directory when there is one.
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fig.savefig(out_path)
    finally:
        # Close this specific figure even if saving fails, to free its memory.
        plt.close(fig)

In [None]:
def prepare_and_save():
    """Build the prepared RealWaste arrays and write them to OUT_PREPARED.

    Steps:
      1. Make sure the raw dataset exists (ensure_dataset_downloaded).
      2. List, shuffle (seed 42) and load every image at IMG_SIZE; files that
         fail to load are reported and skipped.
      3. Scale pixels to [0, 1] and map class names to integer labels.
      4. Make a stratified 70/15/15 train/val/test split.
      5. Compute balanced class weights from the training labels.
      6. Save everything with np.savez_compressed and write a sample gallery
         of the first (up to) 24 training images.

    Saved keys: X_*/y_* (one-hot)/y_*_int for train, val and test, plus
    ``classes``, ``class_weights`` (the dict, stored as a pickled object
    array) and ``class_weight_values`` (the same weights as a plain float
    array indexed by class id).

    Raises:
        RuntimeError: If no image files are found, or none can be loaded.
    """
    # Import the heavy dependencies first so a missing package fails fast,
    # before time is spent loading every image.
    from sklearn.model_selection import train_test_split
    from tensorflow.keras.utils import to_categorical

    # ensure dataset present
    ensure_dataset_downloaded()

    files, classes = list_image_files(RAW_DIR)
    if not files:
        raise RuntimeError("No image files found in dataset. Check RAW_DIR structure.")
    print(f"Found {len(files)} images across {len(classes)} classes.")
    random.seed(42)
    random.shuffle(files)

    # load images into memory
    X = []
    y = []
    for fpath, cls in files:
        try:
            img = load_and_resize(fpath, IMG_SIZE)
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print("Error loading", fpath, e)
            continue
        X.append(img)
        y.append(cls)
    if not X:
        # np.stack([]) would otherwise fail with an unhelpful message.
        raise RuntimeError("None of the image files could be loaded.")
    X = np.stack(X, axis=0)   # shape (N, H, W, 3)
    y = np.array(y)
    print("Loaded array shape:", X.shape)

    # normalize to [0,1]; in place to avoid a second full-size copy
    X /= 255.0

    # integer labels
    class_list = sorted(set(y))
    cls_to_idx = {c: i for i, c in enumerate(class_list)}
    y_int = np.array([cls_to_idx[c] for c in y], dtype=np.int32)

    # stratified split: train 70%, then val/test 15% each
    X_train, X_rem, y_train, y_rem = train_test_split(
        X, y_int, train_size=0.70, stratify=y_int, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(
        X_rem, y_rem, test_size=0.5, stratify=y_rem, random_state=42)

    print("Split sizes: train", X_train.shape[0], "val", X_val.shape[0], "test", X_test.shape[0])

    # one-hot labels
    num_classes = len(class_list)
    y_train_oh = to_categorical(y_train, num_classes)
    y_val_oh = to_categorical(y_val, num_classes)
    y_test_oh = to_categorical(y_test, num_classes)

    # class weights: total / (n_classes_present * class_count)
    counts = Counter(y_train)
    total = len(y_train)
    class_weights = {int(k): float(total / (len(counts) * v)) for k, v in counts.items()}
    print("Class counts (train):", counts)
    print("Class weights:", class_weights)

    # The dict is stored as a pickled object array, which np.load rejects
    # unless allow_pickle=True. Also store the weights as a plain float array
    # (index = class id) so they can be read without enabling pickle.
    class_weight_values = np.array(
        [class_weights.get(i, 0.0) for i in range(num_classes)], dtype=np.float64)

    # save prepared data
    os.makedirs(os.path.dirname(OUT_PREPARED), exist_ok=True)
    np.savez_compressed(OUT_PREPARED,
                        X_train=X_train, y_train=y_train_oh, y_train_int=y_train,
                        X_val=X_val, y_val=y_val_oh, y_val_int=y_val,
                        X_test=X_test, y_test=y_test_oh, y_test_int=y_test,
                        classes=np.array(class_list), class_weights=class_weights,
                        class_weight_values=class_weight_values)
    print("Saved prepared dataset to", OUT_PREPARED)

    # save small sample gallery
    # Slice before rescaling so only the thumbnails are converted, and round
    # rather than truncate so float error cannot shift a pixel value by one.
    n_samples = min(24, X_train.shape[0])
    sample_images = np.rint(X_train[:n_samples] * 255.0).astype(np.uint8)
    sample_labels = [class_list[int(lbl)] for lbl in y_train[:n_samples]]
    # Write to the ../outputs folder that the entry cell creates (beside
    # ../data), not to a separate cwd-relative "outputs" folder.
    gallery_path = "../outputs/sample_images.png"
    save_sample_gallery(sample_images, sample_labels, gallery_path)
    print("Saved sample gallery to", gallery_path)

In [76]:
if __name__ == "__main__":
    # Make sure the outputs folder exists, then run the full preparation.
    Path("../outputs").mkdir(parents=True, exist_ok=True)
    prepare_and_save()
    print("Data preparation complete. Next step: training.")

Found 4752 images across 9 classes.
Loaded array shape: (4752, 128, 128, 3)
Split sizes: train 3326 val 713 test 713
Class counts (train): Counter({6: 645, 3: 553, 5: 350, 4: 346, 0: 323, 8: 305, 2: 294, 1: 288, 7: 222})
Class weights: {5: 1.0558730158730159, 6: 0.5729543496985358, 1: 1.283179012345679, 3: 0.668274060679124, 0: 1.1441348469212247, 7: 1.6646646646646646, 8: 1.2116575591985428, 2: 1.256991685563114, 4: 1.0680796403339756}
Saved prepared dataset to ../data/realwaste_prepared.npz
Saved sample gallery to outputs/sample_images.png
Data preparation complete. Next step: training.
