In [1]:
# SceneTextDetector — YOLOv4-like (from scratch) — fixed & runnable
# Put this cell in the same Kaggle notebook where your DATA_ROOT and data.yaml are accessible.

import os, glob, yaml, math, pathlib, sys
import numpy as np
import cv2
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, callbacks
import tensorflow.keras.backend as K
print("TensorFlow:", tf.__version__)

# ---------- CONFIG ----------
DATA_ROOT  = "/kaggle/input/yolodata/YOLO"   # change if needed
DATA_YAML  = "/kaggle/input/yoloyaml/data.yaml"  # change if needed or set to None to use Data1/*.npy
USE_PREPROCESSED_NUMPY = os.path.exists("Data1/X.npy") and os.path.exists("Data1/Y.npy")
# ===== Image size =====
IMG_SIZE   = 512        # (was 256)
GRID_H = GRID_W = IMG_SIZE // 32  # must be integer (16 for 512)
BATCH_SIZE = 2           # safer for 512px images under 30GB

EPOCHS = 10
LR = 1e-4

LAMBDA_COORD = 1.0
LAMBDA_SIZE  = 1.0
LAMBDA_OBJ   = 1.0
LAMBDA_NOOBJ = 0.5

CONF_THRESH = 0.5
IOU_NMS = 0.3

SAVE_BEST = "weights_best.h5"
SAVE_LAST = "weights_last.h5"
SAVE_DIR = "SceneTextDetector_savedmode"

MAX_TRAIN = 50000
MAX_VAL = 2000

# ---------- Utility functions (from your Utils.py with small fixes) ----------
def decode_to_boxes(output, ht, wd, threshold=CONF_THRESH):
    # output: (gh,gw,1,5)
    gh, gw = output.shape[:2]
    boxes = []
    scores = []
    for i in range(gh):
        for j in range(gw):
            sc = float(output[i,j,0,0])
            if sc > threshold:
                dx,dy,w,h = output[i,j,0,1:5]
                x = ((j + dx)/gw)*wd
                y = ((i + dy)/gh)*ht
                ww = float(w)*wd*1.3
                hh = float(h)*ht*1.3
                x1 = x - ww/2
                y1 = y - hh/2
                x2 = x + ww/2
                y2 = y + hh/2
                # clip
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(wd-1, x2), min(ht-1, y2)
                if x2 > x1 and y2 > y1:
                    boxes.append([x1, y1, x2, y2])
                    scores.append(sc)
    return boxes, scores

def iou(box1, box2):
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    inter_w = max(0, x2 - x1)
    inter_h = max(0, y2 - y1)
    inter = inter_w * inter_h
    a1 = (box1[2]-box1[0])*(box1[3]-box1[1])
    a2 = (box2[2]-box2[0])*(box2[3]-box2[1])
    union = a1 + a2 - inter
    return inter/union if union > 0 else 0.0

def non_max(boxes, scores, iou_thr=IOU_NMS):
    if not boxes: return []
    order = np.argsort(scores).tolist()
    keep = []
    while order:
        i = order.pop()
        keep.append(i)
        new_order = []
        for j in order:
            if iou(boxes[i], boxes[j]) <= iou_thr:
                new_order.append(j)
        order = new_order
    return [boxes[i] for i in keep]

# ---------- Data utilities (YOLO-format loader with streaming) ----------
def _img_to_label(img_path: str) -> str:
    # images/... -> labels/... .txt
    # works for both windows/unix separators
    return os.path.splitext(
        img_path.replace(os.sep + "images" + os.sep, os.sep + "labels" + os.sep)
    )[0] + ".txt"

def _read_yolo_label_file(lbl_path: str):
    if not os.path.exists(lbl_path):
        return []
    out = []
    with open(lbl_path) as fh:
        for ln in fh:
            p = ln.strip().split()
            if len(p) < 5: continue
            try:
                # class x y w h
                x,y,w,h = map(float, p[1:5]) if len(p) >= 5 else map(float, p[:4])
            except Exception:
                continue
            if w > 0 and h > 0:
                x = min(max(x,0.0),1.0)
                y = min(max(y,0.0),1.0)
                w = min(max(w,0.0),1.0)
                h = min(max(h,0.0),1.0)
                out.append((x,y,w,h))
    return out

def _encode_label_grid(boxes):
    Y = np.zeros((GRID_H, GRID_W, 1, 5), np.float32)
    best_area = np.zeros((GRID_H, GRID_W), np.float32)
    for (xc,yc,w,h) in boxes:
        gx, gy = xc * GRID_W, yc * GRID_H
        j = int(min(GRID_W - 1, math.floor(gx)))
        i = int(min(GRID_H - 1, math.floor(gy)))
        dx, dy = gx - j, gy - i
        area = w * h
        if Y[i,j,0,0] == 0 or area > best_area[i,j]:
            Y[i,j,0,0] = 1.0
            Y[i,j,0,1] = dx
            Y[i,j,0,2] = dy
            Y[i,j,0,3] = w
            Y[i,j,0,4] = h
            best_area[i,j] = area
    return Y

def _py_load_and_encode(img_path_bytes):
    img_path = img_path_bytes.decode("utf-8")
    img = cv2.imread(img_path)
    if img is None:
        dummy_img = np.zeros((IMG_SIZE, IMG_SIZE, 3), np.float32)
        dummy_y   = np.zeros((GRID_H, GRID_W, 1, 5), np.float32)
        return dummy_img, dummy_y
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_LINEAR)
    img = (img.astype(np.float32) - 127.5) / 127.5

    lbl_path = _img_to_label(img_path)
    boxes = _read_yolo_label_file(lbl_path)
    Y = _encode_label_grid(boxes)
    return img, Y

def _tf_map(img_path):
    img, Y = tf.numpy_function(_py_load_and_encode, [img_path], [tf.float32, tf.float32])
    img.set_shape((IMG_SIZE, IMG_SIZE, 3))
    Y.set_shape((GRID_H, GRID_W, 1, 5))
    return img, Y

def make_dataset(paths, shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices(np.array(paths, dtype=str))
    if shuffle:
        ds = ds.shuffle(min(len(paths), 2048), reshuffle_each_iteration=True)
    ds = ds.map(_tf_map, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(BATCH_SIZE, drop_remainder=False)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

# ---------- Build dataset: either preprocessed numpy or streaming YOLO files ----------
if USE_PREPROCESSED_NUMPY:
    print("Loading preprocessed Data1/X.npy & Data1/Y.npy (fast mode)")
    X = np.load("Data1/X.npy")
    Y = np.load("Data1/Y.npy")
    # ensure shapes
    assert X.shape[1:3] == (IMG_SIZE, IMG_SIZE), f"X shape mismatch {X.shape}"
    assert Y.shape[1:3] == (GRID_H, GRID_W), f"Y shape mismatch {Y.shape}"
    # create tf dataset
    ds = tf.data.Dataset.from_tensor_slices((X.astype(np.float32), Y.astype(np.float32)))
    ds = ds.shuffle(2048).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    # split
    total = len(X)
    split = int(total * 0.9)
    train_ds = ds.take(split)
    val_ds = ds.skip(split)
else:
    # parse data.yaml
    with open(DATA_YAML, "r") as f:
        data_cfg = yaml.safe_load(f)
    def _resolve_split(p):
        if p is None: return []
        if not os.path.isabs(p):
            p = os.path.join(DATA_ROOT, p)
        if os.path.isdir(p):
            exts = ("*.jpg","*.jpeg","*.png","*.bmp","*.JPG","*.PNG")
            files=[]
            for e in exts:
                files += glob.glob(os.path.join(p, e))
            return sorted(files)
        if os.path.isfile(p) and p.lower().endswith(".txt"):
            with open(p) as fh:
                lines = [ln.strip() for ln in fh if ln.strip()]
            base = os.path.dirname(p)
            return [ln if os.path.isabs(ln) else os.path.join(base, ln) for ln in lines]
        return []
    train_imgs = _resolve_split(data_cfg.get("train"))
    val_imgs   = _resolve_split(data_cfg.get("val"))
    train_imgs = train_imgs[:MAX_TRAIN]
    val_imgs   = val_imgs[:MAX_VAL]
    assert len(train_imgs) > 0 and len(val_imgs) > 0, "No images found; check DATA_YAML and paths."
    print(f"Found {len(train_imgs)} train and {len(val_imgs)} val images.")
    train_ds = make_dataset(train_imgs, shuffle=True)
    val_ds   = make_dataset(val_imgs, shuffle=False)

print(f"Grid: {GRID_H}x{GRID_W}  |  IMG_SIZE: {IMG_SIZE}  |  Batch: {BATCH_SIZE}")

# ---------- Model (YOLOv4-inspired lightweight) ----------
def conv_bn_leaky(x, filters, k=3, s=1):
    x = layers.Conv2D(filters, k, s, padding="same", use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    return layers.LeakyReLU(0.1)(x)

def residual_block(x, filters, repeats=1):
    for _ in range(repeats):
        y = conv_bn_leaky(x, filters//2, 1)
        y = conv_bn_leaky(y, filters, 3)
        x = layers.Add()([x, y])
    return x

def build_model(input_shape=(IMG_SIZE, IMG_SIZE, 3), filters=(32,64,128,256,512)):
    inputs = layers.Input(shape=input_shape)
    x = conv_bn_leaky(inputs, filters[0], 3)
    # downsampling stages with residuals
    for f in filters:
        x = conv_bn_leaky(x, f, 3)
        x = residual_block(x, f, repeats=1)
        x = layers.MaxPooling2D(2,2)(x)
    # extra conv and output
    x = conv_bn_leaky(x, filters[-1], 3)
    x = layers.Conv2D(5, 1, padding="same", activation="sigmoid")(x)
    # ensure shape = (gh, gw, 1, 5)
    gh = x.shape[1] if x.shape[1] is not None else GRID_H
    gw = x.shape[2] if x.shape[2] is not None else GRID_W
    outputs = layers.Reshape((gh, gw, 1, 5))(x)
    return models.Model(inputs, outputs, name="SceneTextDetector")

model = build_model()
model.summary()

# ---------- Loss ----------
import tensorflow.keras.backend as K

import tensorflow.keras.backend as K

import tensorflow.keras.backend as K

def custom_loss(y_true, y_pred):
    # both: (B,gh,gw,1,5)
    obj   = y_true[..., 0:1]   # (B,gh,gw,1,1)
    noobj = 1.0 - obj

    pred_obj = y_pred[..., 0:1]   # (B,gh,gw,1,1)

    # objectness BCE
    obj_bce  = K.binary_crossentropy(obj, pred_obj)   # (B,gh,gw,1)
    obj_loss = tf.reduce_sum(obj_bce, axis=[1,2,3])   # (B,)

    # coords loss
    coord = tf.reduce_sum(
        obj * tf.square(y_true[...,1:3] - y_pred[...,1:3]), 
        axis=[1,2,3,4]   # reduce x,y
    )

    # size loss
    size = tf.reduce_sum(
        obj * tf.square(y_true[...,3:5] - y_pred[...,3:5]),
        axis=[1,2,3,4]
    )

    # no-objectness BCE
    noobj_bce  = K.binary_crossentropy(tf.zeros_like(pred_obj), pred_obj) # (B,gh,gw,1)
    noobj_loss = tf.reduce_sum(noobj * noobj_bce, axis=[1,2,3])           # (B,)

    total = (LAMBDA_OBJ   * obj_loss +
             LAMBDA_NOOBJ * noobj_loss +
             LAMBDA_COORD * coord +
             LAMBDA_SIZE  * size)

    return tf.reduce_mean(total)





# ---------- Compile ----------
opt = optimizers.Adam(LR)
model.compile(optimizer=opt, loss=custom_loss)

# ---------- Visualization callback (fixed) ----------
VIS_DIR = pathlib.Path("epoch_vis"); VIS_DIR.mkdir(exist_ok=True)

def decode_and_nms(output, ih, iw, conf=CONF_THRESH, iou_thr=IOU_NMS):
    boxes, scores = decode_to_boxes(output, ih, iw, conf)
    keep = non_max(boxes, np.array(scores), iou_thr)
    return keep

class SavePredictions(callbacks.Callback):
    def __init__(self, sample_paths, every=1, max_imgs=4):
        super().__init__()
        self.samples = sample_paths[:max_imgs]
        self.every = every

    def on_epoch_end(self, epoch, logs=None):
        if (epoch+1) % self.every != 0 or not self.samples:
            return
        for p in self.samples:
            img = cv2.imread(p)
            if img is None: continue
            ih, iw = img.shape[:2]
            rs = cv2.resize(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), (IMG_SIZE, IMG_SIZE))
            inp = ((rs.astype(np.float32)-127.5)/127.5)[None,...]
            pred = self.model.predict(inp, verbose=0)[0]  # (gh,gw,1,5)
            boxes = decode_and_nms(pred, ih, iw)
            vis = img.copy()
            for (x1,y1,x2,y2) in boxes:
                cv2.rectangle(vis, (int(x1),int(y1)), (int(x2),int(y2)), (0,255,0), 2)
            out_path = str(VIS_DIR / f"ep{epoch+1}_{os.path.basename(p)}")
            cv2.imwrite(out_path, vis[:, :, ::-1])  # BGR write
        print(f"\nSaved visualizations to: {VIS_DIR}/")

# If using streaming mode, pick a few val samples for visualization
sample_paths = []
if not USE_PREPROCESSED_NUMPY:
    # take up to 4 val images for visualization
    sample_paths = val_imgs[:4] if 'val_imgs' in globals() else []

cbs = [
    callbacks.ModelCheckpoint(SAVE_BEST, monitor="val_loss", save_best_only=True, verbose=1),
    callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-6, verbose=1),
]
if sample_paths:
    cbs.append(SavePredictions(sample_paths, every=1, max_imgs=4))

# ---------- Train ----------
hist = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=cbs,
    verbose=1
)

# ---------- Save ----------
# Save as H5
model.save(SAVE_LAST)  

# Save as SavedModel directory (no extension, works in Keras 3)
model.export(SAVE_DIR)   # <-- this replaces save_format='tf'


2025-08-26 08:53:25.690025: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756198405.925825      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756198405.997917      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


TensorFlow: 2.18.0
Found 15404 train and 1712 val images.


I0000 00:00:1756198422.462641      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1756198422.463339      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Grid: 16x16  |  IMG_SIZE: 512  |  Batch: 2


Epoch 1/10


I0000 00:00:1756198436.960611      62 service.cc:148] XLA service 0x7b8be0005770 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1756198436.961597      62 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1756198436.961620      62 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1756198438.166293      62 cuda_dnn.cc:529] Loaded cuDNN version 90300
E0000 00:00:1756198443.396653      62 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1756198443.612657      62 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m   1/7702[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m61:11:29[0m 29s/step - loss: 335.2079

I0000 00:00:1756198452.607130      62 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m7702/7702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - loss: 26.1383
Epoch 1: val_loss improved from inf to 13.54559, saving model to weights_best.h5

Saved visualizations to: epoch_vis/
[1m7702/7702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m692s[0m 86ms/step - loss: 26.1371 - val_loss: 13.5456 - learning_rate: 1.0000e-04
Epoch 2/10
[1m7702/7702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - loss: 12.4718
Epoch 2: val_loss improved from 13.54559 to 11.71604, saving model to weights_best.h5

Saved visualizations to: epoch_vis/
[1m7702/7702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m663s[0m 86ms/step - loss: 12.4718 - val_loss: 11.7160 - learning_rate: 1.0000e-04
Epoch 3/10
[1m7702/7702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - loss: 11.0408
Epoch 3: val_loss improved from 11.71604 to 11.33063, saving model to weights_best.h5

Saved visualizations to: epoch_vis/
[1m7702/7702[0m [32m━━━━━━━━━━━━━━━━━━━━