# YOLO Graduate Evaluation
* In this notebook we are going test and compare the performance of all the graduate models that we trained in previous books. 
* We are going to break the evaluation process into 4 steps,
    * Step 1: Test Data Preparation
    * Step 2: Inference
    * Step 3: Prepare Data for evaluation
    * Step 4: Evaluate and Analyze

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.datasets import mnist
import matplotlib.pyplot as plt
import matplotlib.patches as patches



## validate tensorflow 
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2025-12-03 10:28:30.700808: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764786510.789163  110935 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764786510.816371  110935 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764786511.018934  110935 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764786511.018963  110935 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764786511.018964  110935 computation_placer.cc:177] computation placer alr

Num GPUs Available:  1


## Constants

In [2]:
data_dir = Path("..","data")
models_dir = Path("..","models")

model_names = []

## Import Scripts

In [3]:
import os
import sys
# Build an absolute path from this notebook's parent directory
module_path = os.path.abspath(os.path.join('..'))

# Add to sys.path if not already present
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src import graph_compatible_data_generator,yolo_object_detection_model, post_processing as pp
from src import training_utils as tu

## logic to auto reload scripts without restarting the kernel
%load_ext autoreload
%autoreload 2

I0000 00:00:1764786516.163972  110935 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6053 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:2e:00.0, compute capability: 7.5


## Step 1: Test Data Preparation

In [4]:
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

(x_train, y_train), (x_test, y_test) = mnist.load_data()

X_tensor = tf.convert_to_tensor(x_test, dtype=tf.float32)
# X_tensor = tf.reshape(X_tensor, shape=(-1, 28, 28, 1))
y_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

batch_size = 32
raw_dataset = tf.data.Dataset.from_tensor_slices((X_tensor, y_tensor))

# create a generator for 2 digits
data_gen_1_digits = graph_compatible_data_generator.create_data_generator(1)
data_gen_2_digits = graph_compatible_data_generator.create_data_generator(2)
data_gen_3_digits = graph_compatible_data_generator.create_data_generator(3)
data_gen_4_digits = graph_compatible_data_generator.create_data_generator(4)
data_gen_5_digits = graph_compatible_data_generator.create_data_generator(5)

test_batch_size = 2000
test_dataset_size = test_batch_size * 5

processed_dataset_1 = raw_dataset.map(
    data_gen_1_digits).batch(batch_size=batch_size).prefetch(tf.data.AUTOTUNE)

processed_dataset_2 = raw_dataset.skip(test_batch_size).map(
    data_gen_2_digits).batch(batch_size=batch_size).prefetch(tf.data.AUTOTUNE)

processed_dataset_3 = raw_dataset.skip(test_batch_size*2).map(
    data_gen_3_digits).batch(batch_size=batch_size).prefetch(tf.data.AUTOTUNE)

processed_dataset_4 = raw_dataset.skip(test_batch_size*3).map(
    data_gen_4_digits).batch(batch_size=batch_size).prefetch(tf.data.AUTOTUNE)

processed_dataset_5 = raw_dataset.skip(test_batch_size*4).map(
    data_gen_5_digits).batch(batch_size=batch_size).prefetch(tf.data.AUTOTUNE)



final_test_dataset = processed_dataset_1.unbatch().take(test_batch_size).concatenate(
    processed_dataset_2.unbatch().take(test_batch_size)).concatenate(
    processed_dataset_3.unbatch().take(test_batch_size)).concatenate(
        processed_dataset_4.unbatch().take(test_batch_size)).concatenate(
            processed_dataset_5.unbatch().take(test_batch_size))

golden_test_images, golden_test_labels = next(iter(final_test_dataset.batch(test_dataset_size)))


--- Loading and caching MNIST data... ---


In [5]:
print(tf.shape(golden_test_images))
print(tf.shape(golden_test_labels))

# Save to disk
# commenting out to avoid accidently overwrites
# np.save(f"{data_dir}/processed/golden_test_images.npy", golden_test_images.numpy())
# np.save(f"{data_dir}/processed/golden_test_labels.npy", golden_test_labels.numpy())

tf.Tensor([10000   100   100], shape=(3,), dtype=int32)
tf.Tensor([10000     5    15], shape=(3,), dtype=int32)


## Step 2: Inference 

In [6]:
## load numpy files
golden_test_images = np.load(f"{data_dir}/processed/golden_test_images.npy")
golden_test_labels= np.load(f"{data_dir}/processed/golden_test_labels.npy")

In [9]:
import tensorflow as tf
import numpy as np


class ModelBenchmark:
    def __init__(self, golden_test_images, golden_test_labels):
        # We unbatch the dataset so we can loop through it image-by-image
        self.test_images = []
        self.test_labels = []

        self.test_images = golden_test_images
        self.test_labels = golden_test_labels
        print(f"Loaded {len(self.test_images)} for benchmarking.")

    def __match_boxes(self, true_box,pred_box):
        print(f"true boxes shape : {tf.shape(true_box)}")
        print(f"pred boxes shape : {tf.shape(pred_box)}")
        return 0,0,0

    def evaluate(self, model, name):
        """
        Runs the full evaluation for a single model.
        """
        print(f"--- Benchmarking Model: {name} ---")

        # 1. Run Inference on the whole batch (Fast)
        print("Running inference...")
        raw_predictions = model.predict(self.test_images, verbose=0)

        # 2. Post-Process (NMS) to get clean boxes
        print("post processing...")
        predictions = pp.post_process(
            raw_predictions, confidence_score_threshold=0.98, iou_threshold=0.2, max_boxes=5)
        
        # We need to loop here because NMS output size varies per image
        total_tp = 0
        total_fp = 0
        total_fn = 0

        print("Calculating metrics...")
        for i in range(len(self.test_images)):
            # Get single prediction and truth
            pred_single = predictions[i]
            # This is the raw y_true (grid format)
            true_single = self.test_labels[i]

            # filter the ground truth using objectness flag
            true_boxes = true_single[true_single[...,0] == 1.0]
            # TODO: Match them and count TP/FP/FN
            tp, fp, fn = self.__match_boxes(true_boxes, pred_single)

            total_tp += tp
            total_fp += fp
            total_fn += fn

        # 3. Calculate Final Scores
        precision = total_tp / (total_tp + total_fp + 1e-7)
        recall = total_tp / (total_tp + total_fn + 1e-7)
        f1 = 2 * (precision * recall) / (precision + recall + 1e-7)

        print(f"Results for {name}:")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall:    {recall:.4f}")
        print(f"  F1 Score:  {f1:.4f}")

        # return {"precision": precision, "recall": recall, "f1": f1}
        return predictions

In [10]:
mb = ModelBenchmark(golden_test_images=golden_test_images,golden_test_labels=golden_test_labels)

custom_objects = {
    "calculate_model_loss": tu.calculate_model_loss,
    # "objectness_metrics": tu.objectness_metrics,
    # "bounding_box_metrics": tu.bounding_box_metrics,
    # "classification_metrics": tu.classification_metrics,
    "YoloObjectDetectionModel": yolo_object_detection_model.YoloObjectDetectionModel}


models = [
    {
        "name": "yolo_digits_5",
        "file_name": "yolo_experiment_1_digits_5_20_0.13.keras"
    }
]

for model_details in models:
    model = tf.keras.models.load_model(
        Path(models_dir, model_details["file_name"]), custom_objects=custom_objects)
    predictions = mb.evaluate(model=model,name=model_details["name"])

Loaded 10000 for benchmarking.
--- Benchmarking Model: yolo_digits_5 ---
Running inference...
post processing...
Calculating metrics...
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 0 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]
true boxes shape : [ 1 15]
pred boxes shape : [ 1 15]


In [27]:
x_center_coord = 1
y_center_coord = 2
width_coord = 3
height_coord = 4


def get_coords_from_pred_obj(pred_obj):
    x_min = tf.floor(pred_obj[..., x_center_coord] -
                     (pred_obj[..., width_coord] / 2))
    x_max = tf.floor(pred_obj[..., x_center_coord] +
                     (pred_obj[..., width_coord] / 2))
    y_min = tf.floor(pred_obj[..., y_center_coord] -
                     (pred_obj[..., height_coord] / 2))
    y_max = tf.floor(pred_obj[..., y_center_coord] +
                     (pred_obj[..., height_coord] / 2))
    return x_min, y_min, x_max, y_max


def calculate_iou_matrix(true_values, pred_values):
    # true_values shape: (M, 15)
    # pred_values shape: (N, 15)
    
    # --- 1. PREPARE DIMENSIONS FOR BROADCASTING ---
    # We expand dims to force an (M, N) comparison matrix
    
    # Ground Truth: Shape becomes (M, 1)
    # We grab columns 3 and 4 (width/height) and reshape
    true_width = tf.expand_dims(true_values[..., width_coord], axis=-1) 
    true_height = tf.expand_dims(true_values[..., height_coord], axis=-1)
    
    # Predictions: Shape becomes (1, N)
    pred_width = tf.expand_dims(pred_values[..., width_coord], axis=0)
    pred_height = tf.expand_dims(pred_values[..., height_coord], axis=0)

    # --- 2. GET COORDS & EXPAND ---
    # Helper returns (M,) and (N,). We need to expand them too.
    t_min_x, t_min_y, t_max_x, t_max_y = get_coords_from_pred_obj(true_values)
    p_min_x, p_min_y, p_max_x, p_max_y = get_coords_from_pred_obj(pred_values)

    print(t_min_x, t_min_y, t_max_x, t_max_y)
    print(p_min_x, p_min_y, p_max_x, p_max_y)

    # Expand True to (M, 1)
    t_min_x = tf.expand_dims(t_min_x, axis=-1)
    t_min_y = tf.expand_dims(t_min_y, axis=-1)
    t_max_x = tf.expand_dims(t_max_x, axis=-1)
    t_max_y = tf.expand_dims(t_max_y, axis=-1)

    # Expand Pred to (1, N)
    p_min_x = tf.expand_dims(p_min_x, axis=0)
    p_min_y = tf.expand_dims(p_min_y, axis=0)
    p_max_x = tf.expand_dims(p_max_x, axis=0)
    p_max_y = tf.expand_dims(p_max_y, axis=0)

    # --- 3. CALCULATE INTERSECTION (Vectorized) ---
    # Now (M, 1) vs (1, N) broadcasts to (M, N)
    inter_x_min = tf.maximum(t_min_x, p_min_x)
    inter_y_min = tf.maximum(t_min_y, p_min_y)
    inter_x_max = tf.minimum(t_max_x, p_max_x)
    inter_y_max = tf.minimum(t_max_y, p_max_y)

    inter_width = tf.maximum(0.0, inter_x_max - inter_x_min)
    inter_height = tf.maximum(0.0, inter_y_max - inter_y_min)
    
    inter_area = inter_width * inter_height
    
    # --- 4. CALCULATE UNION ---
    true_area = true_width * true_height # (M, 1)
    pred_area = pred_width * pred_height # (1, N)
    
    # (M, 1) + (1, N) - (M, N) = (M, N)
    union_area = true_area + pred_area - inter_area
    
    # --- 5. IOU ---
    iou = tf.math.divide_no_nan(inter_area, union_area)
    print(f"IOU: {iou}")
    return iou # Shape (M, N)


print("Calculating metrics...")
original_range = len(golden_test_images)
for i in range(original_range):
    # Get single prediction and truth
    pred_single = predictions[i]
    # This is the raw y_true (grid format)
    true_single = golden_test_labels[i]

    # filter the ground truth using objectness flag
    true_boxes = true_single[true_single[..., 0] == 1.0]
    calculate_iou_matrix(true_values=true_boxes, pred_values=pred_single)

Calculating metrics...
tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32)
tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32)
IOU: [[0.]]
tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32)
tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32)
IOU: [[0.]]
tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32)
tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32) tf.Tens

## DECISION
* I've had to make a lot of context switches in this project. 
* Right now creating evaluation script is time consuming and with little ROI on learning so for now I am going to pause this work.
* I'll focus on deploying this using VertexAI and will revisit evaluation in later on with a fresh mindset. 