In [None]:
!pip install rasterio

Collecting rasterio
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: cligj, click-plugins, affine, rasterio
Successfully installed affine-2.4.0 click-plugins-1.1.1 cligj-0.7.2 rasterio-1.4.3


In [None]:
import numpy as np
import rasterio
from rasterio.windows import Window
from rasterio.windows import bounds
import glob
import os
import pyproj
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import sys
import time
import math
import zipfile
import random
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

In [None]:
# Read the kiln inventory spreadsheet into a DataFrame for a first look.
brick_coords = '/content/all_kilns_dec102024.xlsx'
coords_df = pd.read_excel(brick_coords)

  warn(msg)


In [None]:
# Display the column names of the kiln table (the later cells rely on
# 'Latitude' and 'Longitude' being present).
coords_df.columns

Index(['uid', 'Latitude', 'Longitude', 'Type', 'Active', 'Boyd22',
       'stanford21', 'gob17_18', 'Type_correct', 'first_inactive',
       'last_active', 'first_start', 'last_before_start', 'fck_to_zzk',
       'zzk_to_fck', 'Boyd18', 'Union', 'ADM4_PCODE', 'Upazila', 'ADM3_PCODE',
       'District', 'ADM2_PCODE', 'Division'],
      dtype='object')

In [None]:
# Archive holding the .SAFE product, and the folder it is unpacked into.
zip_path = '/content/S2B_MSIL2A_20250405T042659_N0511_R133_T46QBM_20250405T063356.SAFE.zip'
extract_path = '/content/extracted_data'

# exist_ok=True creates the folder when it is missing and is a no-op otherwise,
# so no separate os.path.exists() check is needed.
os.makedirs(extract_path, exist_ok=True)

# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Extraction complete. Files are extracted to: {extract_path}")

Extraction complete. Files are extracted to: /content/extracted_data


In [None]:
# ---- Input / output locations ----
BRICK_COORDS = Path('/content/all_kilns_dec102024.xlsx')
SAFE_DIR = Path(
    '/content/extracted_data'
    '/S2B_MSIL2A_20250405T042659_N0511_R133_T46QBM_20250405T063356.SAFE'
)
OUTPUT_DIR = Path('kiln_dataset_v2')
PATCH_SUBDIR = 'images'
LABELS_FILE = 'labels.csv'

# ---- Tunable parameters ----
IMAGE_RES_M = 10.0        # ground size of one pixel, in metres
SCALE_FACTOR = 10_000.0   # divisor applied to raw band values before clipping to [0, 1]
KILN_DIAM_M = 66.0        # assumed kiln footprint, in metres
PATCH_PX = 64             # edge length of every exported tile, in pixels
NEG_PER_POS = 1
RNG_SEED = 42

# Kiln footprint in pixels, rounded up and then bumped to the next odd number
# so the box is symmetric around its centre pixel.
kiln_px = math.ceil(KILN_DIAM_M / IMAGE_RES_M)
kiln_px = kiln_px if kiln_px % 2 else kiln_px + 1
EST_KILN_PX = kiln_px
HALF_PATCH = PATCH_PX // 2

# Make sure the folder that will receive the patches exists.
patches_dir = OUTPUT_DIR / PATCH_SUBDIR
patches_dir.mkdir(parents=True, exist_ok=True)

# Echo the effective configuration (one line per setting).
print("\n".join([
    "––– Configuration –––",
    f" coords file:   {BRICK_COORDS}",
    f" SAFE dir:      {SAFE_DIR}",
    f" output patches:{patches_dir}",
    f" patch size:    {PATCH_PX} px",
    f" kiln box:      {EST_KILN_PX} px",
    "––––––––––––––––––––––\n",
]))


def overlaps_any_kiln(tile_row: int, tile_col: int, kiln_rc: np.ndarray) -> bool:
    """Return *True* if a 128 px tile intersects any kiln bounding box."""
    tile_bottom = tile_row + PATCH_PX - 1
    tile_right  = tile_col + PATCH_PX - 1
    r = kiln_rc[:, 0]
    c = kiln_rc[:, 1]
    hk = EST_KILN_PX // 2
    inside = (
        (r >= tile_row   - hk) & (r <= tile_bottom + hk) &
        (c >= tile_col   - hk) & (c <= tile_right  + hk)
    )
    return bool(inside.any())


def read_rgb_window(srcs: list[rasterio.io.DatasetReader], win: Window) -> np.ndarray:
    """Read band 1 of each dataset in *srcs* over *win* as a channel-last array.

    Every band is cast to float32, the bands are stacked along the last axis
    in the order of *srcs*, divided by ``SCALE_FACTOR`` and clipped to [0, 1].
    """
    channels = []
    for src in srcs:
        channels.append(src.read(1, window=win).astype(np.float32))
    scaled = np.stack(channels, axis=-1) / SCALE_FACTOR
    return np.clip(scaled, 0, 1)


# Load the kiln table and make sure the columns needed for projection exist.
coords_df = pd.read_excel(BRICK_COORDS)
if not {"Latitude", "Longitude"}.issubset(coords_df.columns):
    raise ValueError("Excel file must contain 'Latitude' and 'Longitude' columns")
print(f"Loaded {len(coords_df)} kiln coordinates from Excel")

# Locate the three 10 m band files inside the SAFE product.
band_glob  = SAFE_DIR / 'GRANULE' / '*' / 'IMG_DATA' / 'R10m'
band_pats  = ['*_B04_10m.jp2', '*_B03_10m.jp2', '*_B02_10m.jp2']  # R, G, B
band_paths = []
for pat in band_pats:
    # glob order is unspecified; sorting makes the pick deterministic and
    # consistent across the three bands.
    matches = sorted(glob.glob(str(band_glob / pat)))
    if not matches:
        # Fail with the offending pattern instead of a bare IndexError.
        raise FileNotFoundError(f"No band file matching {band_glob / pat}")
    band_paths.append(matches[0])

# Open the bands; they stay open for the patch-extraction loops and are closed
# at the end of the cell. If one open fails, release those already opened.
srcs = []
try:
    for p in band_paths:
        srcs.append(rasterio.open(p))
except Exception:
    for s in srcs:
        s.close()
    raise

# Georeferencing is taken from the first band.
transform  = srcs[0].transform
crs        = srcs[0].crs
width, height = srcs[0].width, srcs[0].height
print(f"Image: {width}×{height} px, CRS={crs.to_string()}")

# Lon/lat (EPSG:4326) -> image CRS; always_xy=True fixes the argument order
# to (longitude, latitude).
wgs84   = pyproj.CRS('EPSG:4326')
img_crs = pyproj.CRS(crs)
proj    = pyproj.Transformer.from_crs(wgs84, img_crs, always_xy=True)

xs, ys = proj.transform(coords_df['Longitude'].to_numpy(),
                        coords_df['Latitude'].to_numpy())

# Map coordinates -> pixel row / column in the raster grid.
rows, cols = rasterio.transform.rowcol(transform, xs, ys)
coords_df['pixel_row'] = rows
coords_df['pixel_col'] = cols
kiln_rc = coords_df[['pixel_row', 'pixel_col']].to_numpy()

# One label record per written patch; consumed when labels.csv is written.
labels: list[dict] = []
start_time = time.time()

hk = EST_KILN_PX // 2  # half-width of the kiln box around its centre pixel

# Positive samples: one patch per kiln that falls inside the image.
# A previously pasted-in fragment printed the bbox before it was computed
# (NameError on a fresh kernel) and incremented the counter twice per kiln;
# each kiln is now processed exactly once.
# NOTE(review): kilns that share a tile each get their own copy of that tile
# with only their own box labelled — confirm this is intended.
pos_written = 0
for pr, pc in zip(coords_df['pixel_row'], coords_df['pixel_col']):
    pr, pc = int(pr), int(pc)

    # Skip kilns whose centre lies outside this scene.
    if pr < 0 or pc < 0 or pr >= height or pc >= width:
        continue

    # Snap to the PATCH_PX grid, then clamp so the window never runs past the
    # raster edge (a window hanging over the edge would presumably be read
    # short and no longer match the PATCH_PX × PATCH_PX output profile).
    tile_row = max(0, min((pr // PATCH_PX) * PATCH_PX, height - PATCH_PX))
    tile_col = max(0, min((pc // PATCH_PX) * PATCH_PX, width  - PATCH_PX))
    win      = Window(tile_col, tile_row, PATCH_PX, PATCH_PX)

    patch = read_rgb_window(srcs, win)

    out_name = f"kiln_{pos_written+1:04d}_patch.tif"
    out_path = patches_dir / out_name

    # Reuse the source profile but describe a 3-band GeoTIFF of patch size,
    # georeferenced to the window that was read.
    profile = srcs[0].profile.copy()
    profile.update(
        driver='GTiff', height=PATCH_PX, width=PATCH_PX,
        count=3, dtype=patch.dtype,
        transform=rasterio.windows.transform(win, transform)
    )
    # Drop source-specific creation options so they are not carried into the output.
    for k in ('nodata', 'tiled', 'blockxsize', 'blockysize', 'compress', 'interleave'):
        profile.pop(k, None)

    with rasterio.open(out_path, 'w', **profile) as dst:
        # patch is (rows, cols, bands); the writer expects bands first.
        dst.write(np.moveaxis(patch, -1, 0))

    # Kiln box in patch-local pixel coordinates, clipped to the patch.
    cx   = pc - tile_col
    cy   = pr - tile_row
    xmin = max(0, cx - hk)
    ymin = max(0, cy - hk)
    xmax = min(PATCH_PX - 1, cx + hk)
    ymax = min(PATCH_PX - 1, cy + hk)

    labels.append(dict(
        filename=os.path.join(PATCH_SUBDIR, out_name),
        width=PATCH_PX, height=PATCH_PX, clazz='kiln',
        xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax
    ))

    pos_written += 1
    print(f"Saved {out_name} (bbox [{xmin},{ymin}]→[{xmax},{ymax}])")


# Negative samples: kiln-free tiles, NEG_PER_POS for every positive written.
neg_needed = pos_written * NEG_PER_POS
rng        = np.random.default_rng(RNG_SEED)
neg_written = 0

# Only whole tiles are candidates, so every window lies fully inside the raster.
n_tile_rows = height // PATCH_PX
n_tile_cols = width  // PATCH_PX

# Visit every candidate tile at most once, in a seeded random order. Unlike
# rejection sampling with replacement this never writes the same background
# tile twice and always terminates, even when kiln-free tiles are scarce.
for flat_idx in rng.permutation(n_tile_rows * n_tile_cols):
    if neg_written >= neg_needed:
        break

    grid_r, grid_c = divmod(int(flat_idx), n_tile_cols)
    tile_row = grid_r * PATCH_PX
    tile_col = grid_c * PATCH_PX

    if overlaps_any_kiln(tile_row, tile_col, kiln_rc):
        continue  # skip tiles that touch a kiln

    win   = Window(tile_col, tile_row, PATCH_PX, PATCH_PX)
    patch = read_rgb_window(srcs, win)

    out_name = f"background_{neg_written+1:04d}_patch.tif"
    out_path = patches_dir / out_name

    # Same output profile as the kiln patches: 3-band GeoTIFF of patch size,
    # georeferenced to the window that was read.
    profile = srcs[0].profile.copy()
    profile.update(
        driver='GTiff', height=PATCH_PX, width=PATCH_PX,
        count=3, dtype=patch.dtype,
        transform=rasterio.windows.transform(win, transform)
    )
    for k in ('nodata', 'tiled', 'blockxsize', 'blockysize', 'compress', 'interleave'):
        profile.pop(k, None)

    with rasterio.open(out_path, 'w', **profile) as dst:
        # patch is (rows, cols, bands); the writer expects bands first.
        dst.write(np.moveaxis(patch, -1, 0))

    # Background tiles carry no box, so the bbox fields are left blank.
    labels.append(dict(
        filename=os.path.join(PATCH_SUBDIR, out_name),
        width=PATCH_PX, height=PATCH_PX, clazz='non_kiln',
        xmin='', ymin='', xmax='', ymax=''
    ))
    neg_written += 1
    print(f"Saved {out_name} (bbox [ ][ ]→[ ][ ])")

if neg_written < neg_needed:
    # Ran out of kiln-free tiles before reaching the requested count.
    print(f"Warning: only {neg_written} of {neg_needed} background patches could be written")

# All patches are on disk: release the band datasets.
for band_src in srcs:
    band_src.close()

# Persist the collected label records next to the image folder.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
labels_df = pd.DataFrame(labels)
labels_df.to_csv(OUTPUT_DIR / LABELS_FILE, index=False)

# Report the counts and the wall-clock time since patch extraction started.
total_t = time.time() - start_time
print(f"\nFinished {pos_written} kiln + {neg_written} background patches in {total_t:.1f}s")
print(f"Labels saved to {OUTPUT_DIR / LABELS_FILE}\n")


––– Configuration –––
 coords file:   /content/all_kilns_dec102024.xlsx
 SAFE dir:      /content/extracted_data/S2B_MSIL2A_20250405T042659_N0511_R133_T46QBM_20250405T063356.SAFE
 output patches:kiln_dataset_v2/images
 patch size:    64 px
 kiln box:      7 px
 positives:     20
 negatives:     20
––––––––––––––––––––––



  warn(msg)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Saved kiln_3101_patch.tif (bbox [8,52]→[14,58])
Saved kiln_3102_patch.tif (bbox [19,47]→[25,53])
Saved kiln_3103_patch.tif (bbox [19,47]→[25,53])
Saved kiln_3104_patch.tif (bbox [44,42]→[50,48])
Saved kiln_3105_patch.tif (bbox [44,42]→[50,48])
Saved kiln_3106_patch.tif (bbox [17,43]→[23,49])
Saved kiln_3107_patch.tif (bbox [17,43]→[23,49])
Saved kiln_3108_patch.tif (bbox [24,33]→[30,39])
Saved kiln_3109_patch.tif (bbox [24,33]→[30,39])
Saved kiln_3110_patch.tif (bbox [49,33]→[55,39])
Saved kiln_3111_patch.tif (bbox [49,33]→[55,39])
Saved kiln_3112_patch.tif (bbox [60,27]→[63,33])
Saved kiln_3113_patch.tif (bbox [60,27]→[63,33])
Saved kiln_3114_patch.tif (bbox [23,3]→[29,9])
Saved kiln_3115_patch.tif (bbox [23,3]→[29,9])
Saved kiln_3116_patch.tif (bbox [31,1]→[37,7])
Saved kiln_3117_patch.tif (bbox [31,1]→[37,7])
Saved kiln_3118_patch.tif (bbox [29,51]→[35,57])
Saved kiln_3119_patch.tif (bbox [29,51]→[35,57])
Saved kiln_31