
# 02 — Tile & Filter

**Goal:** Tile large aerial images into model-friendly crops (e.g., 640×640) and build a tiles manifest.
- Optionally discard overly uniform tiles to save compute (controlled by `UNIFORM_THRESH`; set it to `None` to keep every tile).
- Each tile inherits the label of its source image (presence/absence); these image-level labels are used for multiple-instance learning (MIL) training.


In [None]:

# Quietly install the third-party packages this notebook uses
# (pillow provides the PIL module; pyarrow is the parquet engine installed for pandas).
%pip -q install numpy pandas pyarrow pillow tqdm


In [None]:

import os, numpy as np, pandas as pd
from pathlib import Path
from PIL import Image
from tqdm import tqdm

# ====== USER CONFIG ======
# Every path below is derived from BASE; edit BASE to relocate the whole data tree.
BASE = Path('/content')  # change if needed
MANIFEST_DIR = BASE/'data/manifests'  # manifest_{train,val,test}.parquet are read from here
IM_TILE_DIR = BASE/'data/tiles'  # tile images and tiles_<split>.parquet are written here
IM_TILE_DIR.mkdir(parents=True, exist_ok=True)  # create the output root up front; no error if it exists

TILE = 640  # tile side length in pixels (default tile_size of tile_image)
STRIDE = 320               # 50% overlap
UNIFORM_THRESH = 0.80      # drop tile if one color occupies >80% pixels (set None to disable)
SAVE_JPEG_QUALITY = 92  # `quality` value passed to PIL when each tile is saved as .jpg


In [None]:

def tile_image(img_path: Path, tile_size:int=TILE, stride:int=STRIDE):
    """Cut an image into full-size square tiles laid out on a regular grid.

    Args:
        img_path: Path of the image to open; it is converted to RGB first.
        tile_size: Side length of every tile, in pixels.
        stride: Step between neighbouring tile origins, in pixels, on both axes.

    Returns:
        ``(tiles, (W, H))``. ``tiles`` is a list of ``(x, y, crop)`` tuples in
        row-major order, where ``(x, y)`` is the top-left corner of ``crop``
        in the source image; ``(W, H)`` is the source image size.

    Only tiles that lie completely inside the image are produced:
      * an image smaller than ``tile_size`` on either axis yields no tiles;
      * a right/bottom remainder that no whole stride step reaches is not
        covered by any tile.
    """
    # Release the file handle deterministically instead of waiting for garbage
    # collection; convert() hands back a separate in-memory copy, so `im`
    # stays usable after the file is closed.
    with Image.open(img_path) as src:
        im = src.convert('RGB')
    W, H = im.size
    # The range bounds stop at the last origin whose tile still fits, so no
    # per-tile bounds check is needed (the ranges are empty for small images).
    tiles = [
        (x, y, im.crop((x, y, x + tile_size, y + tile_size)))
        for y in range(0, H - tile_size + 1, stride)
        for x in range(0, W - tile_size + 1, stride)
    ]
    return tiles, im.size

def too_uniform(pil_img: Image.Image, thresh=UNIFORM_THRESH):
    """Tell whether a single colour dominates the tile.

    Args:
        pil_img: Tile to inspect.
        thresh: Fraction of the tile's pixels (0..1) that one exact colour
            must exceed for the tile to count as uniform. ``None`` disables
            the check.

    Returns:
        ``True`` if the most frequent colour covers strictly more than
        ``thresh`` of the pixels (or the tile has no pixels at all),
        ``False`` otherwise or when ``thresh`` is ``None``.
    """
    if thresh is None:
        return False
    arr = np.asarray(pil_img)
    if arr.ndim == 2:
        # Single-channel image: add a channel axis so that every row of
        # `pixels` below is one pixel rather than one image row.
        arr = arr[:, :, None]
    pixels = arr.reshape(-1, arr.shape[-1])
    n_pixels = pixels.shape[0]
    if n_pixels == 0:
        # Nothing to look at; also avoids dividing by zero below.
        return True
    # Count how often each distinct colour occurs and keep the largest share.
    _, counts = np.unique(pixels, axis=0, return_counts=True)
    dominant_fraction = counts.max() / n_pixels
    return bool(dominant_fraction > thresh)


In [None]:

import pandas as pd

# Load the image-level manifest of each split from MANIFEST_DIR.
# process_split() reads the 'path', 'label' and 'image_id' columns of these frames.
train_df = pd.read_parquet(path=MANIFEST_DIR/'manifest_train.parquet')
val_df = pd.read_parquet(path=MANIFEST_DIR/'manifest_val.parquet')
test_df = pd.read_parquet(path=MANIFEST_DIR/'manifest_test.parquet')

def process_split(name, df):
    """Tile every image of one split, save the kept tiles and write their manifest.

    Args:
        name: Split name; used for the output sub-folder ``IM_TILE_DIR/name``,
            the manifest file ``tiles_<name>.parquet`` and the progress label.
        df: Image manifest with the columns ``path`` (image file),
            ``label`` (cast to int) and ``image_id``.

    Returns:
        DataFrame with one row per saved tile and the columns ``image_id``,
        ``tile_id``, ``tile_path``, ``label``, ``x``, ``y``, ``W``, ``H``
        (``x, y`` = tile origin in the source image, ``W, H`` = source image
        size). The same frame is written to ``IM_TILE_DIR/tiles_<name>.parquet``.

    Tiles rejected by ``too_uniform`` are neither saved nor listed.
    """
    # Declared explicitly so the manifest keeps its schema even when no tile
    # survives; otherwise the 'label' lookup below fails on an empty frame.
    columns = ['image_id', 'tile_id', 'tile_path', 'label', 'x', 'y', 'W', 'H']
    out_dir = IM_TILE_DIR/name
    out_dir.mkdir(parents=True, exist_ok=True)
    rows = []
    for _, r in tqdm(df.iterrows(), total=len(df), desc=f"tiling {name}"):
        img_path = Path(r['path'])
        label = int(r['label'])  # every tile inherits the image-level label
        image_id = r['image_id']
        tiles, (W, H) = tile_image(img_path)
        for x, y, tile in tiles:
            if too_uniform(tile):
                continue
            # The tile origin makes the id unique within one image.
            tile_id = f"{image_id}_{x}_{y}"
            out_path = out_dir/f"{tile_id}.jpg"
            tile.save(out_path, quality=SAVE_JPEG_QUALITY)
            rows.append({
                'image_id': image_id,
                'tile_id': tile_id,
                'tile_path': str(out_path),
                'label': label,
                'x': x, 'y': y, 'W': W, 'H': H,
            })
    tiles_df = pd.DataFrame(rows, columns=columns)
    tiles_df.to_parquet(IM_TILE_DIR/f'tiles_{name}.parquet', index=False)
    # Quick sanity summary: manifest shape and per-label tile counts.
    print(name, tiles_df.shape, tiles_df['label'].value_counts().to_dict())
    return tiles_df

# Tile the three splits one after another (train, then val, then test);
# each call saves the split's tiles and its tiles_<split>.parquet manifest.
tiles_train = process_split(name='train', df=train_df)
tiles_val = process_split(name='val', df=val_df)
tiles_test = process_split(name='test', df=test_df)
