In [None]:
import tensorflow as tf
import kagglehub
from datasets import load_dataset
from PIL import Image
import cv2
import numpy as np
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt

In [None]:
# Load the "test" split of the naver-clova-ix/cord-v2 dataset via datasets.load_dataset.
dataset = load_dataset("naver-clova-ix/cord-v2", split="test")
# Debug aid (disabled): inspect the ground-truth annotation of the first sample.
# gt = dataset[0]['ground_truth']
# print(gt)

In [None]:
image = dataset[0]['image']  # First sample's image; per the original note it is already a PIL.Image
# Disabled: force the image into RGB mode.
# NOTE(review): later cells appear to assume a 3-channel image - confirm the sample is RGB.
# image = image.convert("RGB")

In [None]:
# Disabled inspection cell, kept for reference.
# from tensorflow.keras.models import load_model

# # Try loading the model first
# model = load_model("/kaggle/input/obj_model/keras/default/1/obj_resnet50v2_v2.h5", compile=False)

# # Look at the model's inputs and outputs
# model.summary()

# # Check the input shape the model expects
# print("\nModel Inputs:")
# print(model.input)

# # Check the output shape
# print("\nModel Outputs:")
# print(model.output)


In [None]:
import tensorflow.keras.backend as K

# Build `image_np`, an ndarray version of the sample for OpenCV-style processing.
if isinstance(image, Image.Image):
    # PIL image -> ndarray, then swap the channel order from RGB to BGR.
    image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
else:
    # Not a PIL image: take a copy through the object's own .copy() method.
    image_np = image.copy()
# NOTE(review): image_np is not referenced by any other cell visible in this notebook.


def focal_loss(gamma=1., alpha=0.25):
    """
    Build a focal-loss callable for classification outputs.

    Args:
        gamma: Exponent of the ``(1 - p)`` modulating factor; larger values
            shrink the contribution of confidently predicted entries more.
        alpha: Constant multiplier applied to every loss term.

    Returns:
        A function ``(y_true, y_pred) -> tensor`` that evaluates the focal
        loss element-wise and averages it over the LAST axis of its inputs.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep probabilities strictly inside (0, 1) so the logarithm stays finite.
        epsilon = 1e-7
        probs = tf.clip_by_value(y_pred, epsilon, 1 - epsilon)

        # Modulating factor (1 - p) ** gamma.
        modulating_factor = tf.pow(1 - probs, gamma)

        # Element-wise cross entropy against the targets.
        log_loss = -y_true * tf.math.log(probs)

        weighted = alpha * modulating_factor * log_loss

        # The reduction runs over the last axis only, not over the batch.
        return tf.reduce_mean(weighted, axis=-1)

    return focal_loss_fixed

def smooth_l1_loss(delta=1.0):
    """
    Build a smooth-L1 loss callable for bounding-box regression.

    Args:
        delta: Absolute-error threshold. Errors below it are penalised
            quadratically, errors at or above it linearly.

    Returns:
        A function ``(y_true, y_pred) -> tensor`` that evaluates the
        piecewise loss element-wise and averages it over the LAST axis.
    """
    def smooth_l1_loss_fixed(y_true, y_pred):
        abs_error = tf.abs(y_true - y_pred)

        # The two branches of the piecewise definition.
        quadratic_branch = 0.5 * abs_error ** 2
        linear_branch = delta * (abs_error - 0.5 * delta)

        # Select the quadratic branch wherever the error is strictly below delta.
        is_small = tf.less(abs_error, delta)
        piecewise = tf.where(is_small, quadratic_branch, linear_branch)

        # The reduction runs over the last axis only, not over the batch.
        return tf.reduce_mean(piecewise, axis=-1)

    return smooth_l1_loss_fixed


# Load the saved model from model_path. The two loss factories defined above are
# registered under their function names through custom_objects, and
# compile=False is passed to load_model.
model_path = '/kaggle/input/retinanet_v1/keras/default/1/best_retinanet.keras'
model = load_model(
    model_path,
    custom_objects={
        'focal_loss': focal_loss,  # the factory itself, NOT called (no parentheses)
        'smooth_l1_loss': smooth_l1_loss
    },
    compile=False
)

# Unpack model.output into two names (this requires exactly two outputs).
# NOTE(review): both names are reassigned by the later model.predict(...) cell.
class_pred, bbox_pred = model.output

In [None]:
# Image preprocessing
image = np.array(image)  # NOTE: rebinds `image` to an ndarray for all later cells
image_resized = cv2.resize(image, (640, 640))  # Resize to match the model input
image_array = np.array(image_resized) / 255.0  # Normalise
image_array = np.expand_dims(image_array, axis=0)  # Add the batch dimension

# Run prediction on the new input
class_pred, bbox_pred = model.predict(image_array)

In [None]:
# Dump the raw second output returned by model.predict (the box predictions).
print(bbox_pred)

In [None]:
# Dump the raw first output returned by model.predict (the class predictions).
print(class_pred)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

def generate_anchors(base_size, ratios, scales):
    """Build base anchors centred on the origin, as [x1, y1, x2, y2] rows.

    For every (scale, ratio) pair the box width is
    ``base_size * scale * sqrt(ratio)`` and the height is
    ``base_size * scale / sqrt(ratio)``, so ``ratio`` equals width / height.

    Args:
        base_size: Reference side length of an anchor.
        ratios: Iterable of width/height ratios.
        scales: Iterable of multipliers applied to ``base_size``.

    Returns:
        ``np.ndarray`` with one row per pair, ordered scale-major
        (all ratios of the first scale, then all ratios of the next, ...).
    """
    def _centred_box(scale, ratio):
        # Corner coordinates of a box whose centre sits at (0, 0).
        root = np.sqrt(ratio)
        width = base_size * scale * root
        height = base_size * scale / root
        return [-width / 2, -height / 2, width / 2, height / 2]

    return np.array([_centred_box(s, r) for s in scales for r in ratios])


# TODO (translated from the original note): change to 64x64.
def generate_all_anchors(image_shape=(640, 640), feature_strides=(16,),
                         base_size=16, ratios=(0.5, 1.0, 2.0),
                         scales=(1, 1.26, 1.58)):
    """Generate every anchor for every feature-map level.

    For each stride, one copy of the base anchors is placed at the centre of
    each ``stride x stride`` grid cell of the image.

    Args:
        image_shape: ``(height, width, ...)``; only the first two entries are read.
        feature_strides: Iterable of strides, one per feature-map level.
            The default is an immutable tuple so it cannot be mutated
            between calls.
        base_size: Base anchor size forwarded to ``generate_anchors``.
        ratios: Aspect ratios forwarded to ``generate_anchors``.
        scales: Scale multipliers forwarded to ``generate_anchors``.

    Returns:
        ``np.ndarray`` of shape ``(total_anchors, 4)`` holding
        ``[x1, y1, x2, y2]`` in PIXELS of ``image_shape`` (not normalised).
        Rows are ordered level by level, then grid cell by grid cell in
        row-major order, then base anchor by base anchor. An empty
        ``feature_strides`` yields an array of shape ``(0, 4)``.
    """
    base_anchors = generate_anchors(
        base_size=base_size,
        ratios=ratios,
        scales=scales,
    )

    image_height, image_width = image_shape[0], image_shape[1]
    all_anchors = []

    for stride in feature_strides:
        # Number of whole grid cells along each axis.
        grid_width = image_width // stride
        grid_height = image_height // stride

        # Centre coordinate of every grid cell.
        centers_x = np.arange(grid_width) * stride + stride // 2
        centers_y = np.arange(grid_height) * stride + stride // 2

        centers_x, centers_y = np.meshgrid(centers_x, centers_y)
        shifts = np.stack([centers_x.ravel(), centers_y.ravel()], axis=1)

        # Broadcast (1, A, 4) base anchors against (K, 1, 4) shifts; each shift
        # is (x, y, x, y) so both corners move together.
        level_anchors = base_anchors.reshape((1, -1, 4)) + np.concatenate(
            [shifts, shifts], axis=1).reshape((-1, 1, 4))

        all_anchors.append(level_anchors.reshape((-1, 4)))

    if not all_anchors:
        # np.concatenate rejects an empty list; return a well-formed empty result.
        return np.empty((0, 4), dtype=base_anchors.dtype)

    return np.concatenate(all_anchors, axis=0)

# Generate anchors.
# generate_all_anchors() yields corner coordinates in PIXELS of the anchor image,
# whereas decode_bbox() treats anchors as fractions of the image: it clips the
# decoded corners to [0, 1] and only then scales them to pixels. Feeding pixel
# anchors straight in makes every decoded box saturate at the clip limits, so
# divide x by the width and y by the height here.
anchor_image_shape = (640, 640)  # (height, width) of the model input
anchors = generate_all_anchors(image_shape=anchor_image_shape)
anchors = anchors / np.array(
    [anchor_image_shape[1], anchor_image_shape[0]] * 2, dtype=np.float64
)

In [None]:
import cv2
import numpy as np

def decode_bbox(predictions, anchors, image_size):
    """Decode box-regression deltas into integer pixel boxes.

    Args:
        predictions: Array of shape ``(N, 4)`` or ``(1, N, 4)``; a leading batch
            axis is dropped. Each row is read as ``(dy, dx, dh, dw)``.
        anchors: Array of shape ``(M, 4)`` with ``M >= N``, rows
            ``[x1, y1, x2, y2]``. Anchors are expected as fractions of the
            image in ``[0, 1]``: decoded corners are clipped to that range
            before being scaled, so anchors given in pixels would saturate.
            Only the first ``N`` anchors are used.
        image_size: ``(height, width)`` of the target image in pixels.

    Returns:
        Integer ``np.ndarray`` of shape ``(N, 4)`` with ``[x1, y1, x2, y2]`` in
        pixels (fractional parts truncated). Empty input gives shape ``(0, 4)``.

    Raises:
        IndexError: if there are fewer anchors than predictions.
        ValueError: if an input does not have 4 columns, or a decoded
            coordinate is NaN.
    """
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        predictions = predictions[0]  # Drop batch dim

    num_boxes = len(predictions)
    if num_boxes == 0:
        return np.empty((0, 4), dtype=int)

    if predictions.ndim != 2 or predictions.shape[1] != 4:
        raise ValueError(
            f"predictions must have shape (N, 4) or (1, N, 4), got {predictions.shape}"
        )

    anchors = np.asarray(anchors)
    if anchors.ndim != 2 or anchors.shape[1] != 4:
        raise ValueError(f"anchors must have shape (M, 4), got {anchors.shape}")
    if len(anchors) < num_boxes:
        raise IndexError(
            f"{num_boxes} predictions but only {len(anchors)} anchors"
        )
    # Prediction i is paired with anchor i; surplus anchors are ignored.
    anchors = anchors[:num_boxes]

    dy, dx, dh, dw = predictions.T
    xa1, ya1, xa2, ya2 = anchors.T

    # Anchor size and centre.
    wa = xa2 - xa1
    ha = ya2 - ya1
    x_center_a = xa1 + 0.5 * wa
    y_center_a = ya1 + 0.5 * ha

    # Apply the deltas: centres shift proportionally to the anchor size,
    # sizes scale exponentially.
    x_center = x_center_a + dx * wa
    y_center = y_center_a + dy * ha
    w = np.exp(dw) * wa
    h = np.exp(dh) * ha

    corners = np.stack(
        [
            x_center - 0.5 * w,
            y_center - 0.5 * h,
            x_center + 0.5 * w,
            y_center + 0.5 * h,
        ],
        axis=1,
    )
    if np.isnan(corners).any():
        # Keep failing loudly on NaN instead of casting it to an arbitrary integer.
        raise ValueError("cannot convert NaN box coordinate to integer")

    # Clamp to [0, 1] so boxes stay inside the image, then scale to pixels.
    corners = np.clip(corners, 0, 1)
    pixel_scale = np.array(
        [image_size[1], image_size[0], image_size[1], image_size[0]]
    )
    return (corners * pixel_scale).astype(int)


In [None]:
def visualize_predictions(image, bbox_pred, class_pred, anchors, category_map, conf_threshold=0.25):
    """Draw every detection whose best class score reaches the threshold.

    Args:
        image: ``(H, W, ...)`` ndarray to annotate; it is copied, not modified.
        bbox_pred: Box deltas, passed to ``decode_bbox`` together with
            ``anchors`` and ``image.shape[:2]``.
        class_pred: Class scores of shape ``(N, num_classes)`` or
            ``(1, N, num_classes)``; a leading batch axis is dropped.
        anchors: Anchors forwarded to ``decode_bbox``.
        category_map: Mapping from class index to label; missing indices are
            labelled ``'unknown'``.
        conf_threshold: Boxes whose best score is below this value are skipped.

    Returns:
        A copy of ``image`` with a green rectangle and a ``"label (score%)"``
        caption per kept box. Each kept box is also printed.

    Raises:
        IndexError: if ``class_pred`` has fewer rows than decoded boxes.
    """
    # Decode bboxes
    decoded_bboxes = decode_bbox(bbox_pred, anchors, image.shape[:2])

    # Drop the batch axis only when there is one, so (N, C) input works too.
    class_pred = np.asarray(class_pred)
    if class_pred.ndim == 3:
        class_pred = class_pred[0]

    output_image = image.copy()

    num_boxes = len(decoded_bboxes)
    if num_boxes == 0:
        return output_image
    if len(class_pred) < num_boxes:
        raise IndexError(
            f"{num_boxes} decoded boxes but only {len(class_pred)} class score rows"
        )
    class_pred = class_pred[:num_boxes]

    # Best class and its score for all boxes at once, so the Python loop only
    # visits boxes that will actually be drawn.
    best_classes = np.argmax(class_pred, axis=-1)
    best_scores = class_pred[np.arange(num_boxes), best_classes]
    # Written as "not below" so the comparison matches `score < conf_threshold`.
    kept = np.flatnonzero(~(best_scores < conf_threshold))

    for idx in kept:
        # Plain Python ints: OpenCV drawing calls expect native integers.
        x1, y1, x2, y2 = (int(v) for v in decoded_bboxes[idx])
        cls = int(best_classes[idx])
        score = best_scores[idx]

        label = f"{category_map.get(cls, 'unknown')} ({score*100:.1f}%)"

        # Log each kept detection.
        print(f"BBox {idx}: {label}")

        # Draw the box on the image.
        cv2.rectangle(output_image, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Caption above the box; if that would run off the top edge, put it
        # just inside the box instead so it stays visible.
        text_y = y1 - 10 if y1 - 10 >= 10 else y1 + 15
        cv2.putText(output_image, label, (x1, text_y), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (0, 255, 0), 2)

    return output_image

In [None]:
# Class index -> label used when captioning detections.
category_map_reverse = {0: "item_name", 1: "price", 2: "quantity", 3: "background"}

# Inputs expected at this point:
# image -> (H, W, 3)
# bbox_pred -> (1, N, 4)
# class_pred -> (1, N, num_classes); visualize_predictions takes class_pred[0]
# anchors -> (N, 4)
# NOTE(review): the original note also listed conf_pred -> (N,), but no such
# variable exists in the visible notebook.

image_with_boxes = visualize_predictions(
    image,
    bbox_pred,
    class_pred,
    anchors,
    category_map_reverse
)

# Show the annotated image without axes.
plt.figure(figsize=(12, 12))
plt.imshow(image_with_boxes)
plt.axis('off')
plt.show()