## Dataset Preparation - Lesion Images

In [None]:
from pathlib import Path
import numpy as np
from PIL import Image
import os
from tqdm import tqdm

# Source dataset: images and masks sit in sibling folders under one root
DATA_ROOT = Path("/kaggle/input/ham1000-segmentation-and-classification")
IMAGES_DIR = DATA_ROOT.joinpath("images")
MASKS_DIR = DATA_ROOT.joinpath("masks")

# 10% padding around the lesion
margin_ratio = 0.10

# Output folder for the cropped lesion images; created if it does not exist
CROPPED_DIR = Path("/kaggle/working/ham_lesion_crops")
CROPPED_DIR.mkdir(parents=True, exist_ok=True)
print("Saving cropped images to:", CROPPED_DIR)

# Collect the input JPEGs in sorted order so runs are reproducible
img_paths = list(IMAGES_DIR.glob("*.jpg"))
img_paths.sort()
print("Total images found:", len(img_paths))

# Classifier input size as (width, height); every saved crop is resized to this
TARGET_SIZE = (224, 224)


def _load_image(path, mode):
    """Open *path*, convert it to *mode*, and release the file handle.

    ``convert`` returns a new, fully loaded image, so the source file can be
    closed as soon as the ``with`` block exits.
    """
    with Image.open(path) as src:
        return src.convert(mode)


def _lesion_box(mask_bin, margin_ratio):
    """Return the padded lesion bounding box, or None if the mask is empty.

    Args:
        mask_bin: 2-D boolean array, True where the mask marks a lesion pixel.
        margin_ratio: Padding added on each side, as a fraction of the
            bounding-box extent along that axis.

    Returns:
        ``(left, upper, right, lower)`` with exclusive right/lower edges, the
        form ``Image.crop`` expects, clamped to the mask bounds; ``None`` when
        ``mask_bin`` has no True pixel.
    """
    # Row/column projections give the extent without listing every pixel.
    rows = np.flatnonzero(mask_bin.any(axis=1))
    cols = np.flatnonzero(mask_bin.any(axis=0))
    if rows.size == 0:
        return None

    # Plain ints keep the crop box free of numpy scalar types.
    y_min, y_max = int(rows[0]), int(rows[-1])
    x_min, x_max = int(cols[0]), int(cols[-1])

    h, w = mask_bin.shape
    pad_y = int((y_max - y_min) * margin_ratio)
    pad_x = int((x_max - x_min) * margin_ratio)

    left = max(0, x_min - pad_x)
    upper = max(0, y_min - pad_y)
    # +1 turns the inclusive max index into an exclusive crop edge.
    right = min(w - 1, x_max + pad_x) + 1
    lower = min(h - 1, y_max + pad_y) + 1
    return left, upper, right, lower


for img_path in tqdm(img_paths):
    img_id = img_path.stem
    mask_path = MASKS_DIR / f"{img_id}_segmentation.png"

    img = _load_image(img_path, "RGB")

    # box stays None when there is no mask file or the mask has no lesion
    # pixels; in both cases the full image is used as the "crop".
    box = None
    if mask_path.exists():
        mask = _load_image(mask_path, "L")
        if mask.size != img.size:
            # The box is computed in mask coordinates but applied to the
            # image, so bring the mask onto the image grid first. NEAREST
            # keeps the mask binary.
            mask = mask.resize(img.size, Image.NEAREST)
        box = _lesion_box(np.array(mask) > 0, margin_ratio)  # lesion = True

    crop = img if box is None else img.crop(box)

    # Resize crop to classifier input size and save under the image id
    crop = crop.resize(TARGET_SIZE, Image.BILINEAR)
    crop.save(CROPPED_DIR / f"{img_id}.jpg")

print("Done creating cropped lesion images.")

Saving cropped images to: /kaggle/working/ham_lesion_crops
Total images found: 10015


100%|██████████| 10015/10015 [06:51<00:00, 24.35it/s]

Done creating cropped lesion images.



