## 1. Imports and Setup
First, we import the necessary libraries. We'll use TensorFlow and Keras for the model, `os` and `glob` for file handling, and OpenCV (`cv2`) for image processing and drawing bounding boxes during inference.

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
import glob
import cv2
from matplotlib import pyplot as plt

2025-08-26 15:26:14.568768: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-26 15:26:15.262610: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-26 15:26:18.020906: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


## 2. Configuration
Set up the main configuration variables here. You should adjust `NUM_CLASSES` based on your specific dataset.

In [2]:
# Dataset directories, split into train/val. The image directories are searched
# for '*.jpg' files and the label directories for YOLO-style '*.txt' files.
TRAIN_IMAGE_DIR = 'archive/images/train/'
TRAIN_LABEL_DIR = 'archive/labels/train/'
VAL_IMAGE_DIR = 'archive/images/val/'
VAL_LABEL_DIR = 'archive/labels/val/'

# Model and image parameters
IMG_SIZE = 512  # Images are resized to IMG_SIZE x IMG_SIZE before reaching the model
BATCH_SIZE = 6  # Number of samples per batch in the train/val datasets
NUM_CLASSES = 10 # IMPORTANT: Change this to the number of classes in your dataset
EPOCHS = 25  # Number of epochs passed to model.fit()

## 3. Data Loading and Preprocessing
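The functions below parse the image and label directories to build the dataset. They read the YOLO-style `.txt` files to get the class label and bounding box coordinates; only the first object listed in each label file is used, because this simple model predicts a single box per image. We then create a `tf.data.Dataset` for efficient training.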

In [3]:
def parse_label_file(label_path):
    """Read the first object annotation from a YOLO-style label file.

    The annotation line is expected to hold five whitespace-separated fields:
    ``<class_id> <x_center> <y_center> <width> <height>``. Only the first
    non-blank line is used, because this simple model handles one object per
    image.

    Args:
        label_path: Path to the ``.txt`` label file.

    Returns:
        A ``(class_id, coords)`` tuple: ``class_id`` is an ``int`` and
        ``coords`` is a float32 NumPy array of shape ``(4,)``.

    Raises:
        ValueError: If the file contains no annotation line, or the line does
            not have exactly five fields, or a field is not numeric.
    """
    with open(label_path, 'r') as f:
        # Skip blank lines so a stray leading newline does not hide the annotation.
        fields = next((line.split() for line in f if line.strip()), None)

    if fields is None:
        raise ValueError(f"Label file contains no annotation: {label_path}")
    if len(fields) != 5:
        # Fail here with the file name instead of later inside the tf.data
        # pipeline, where the error no longer says which file was malformed.
        raise ValueError(
            f"Expected 5 fields (class_id + 4 bbox values) in {label_path}, "
            f"got {len(fields)}"
        )

    class_id = int(fields[0])
    coords = np.array([float(value) for value in fields[1:]], dtype=np.float32)
    return class_id, coords

def load_data(image_dir, label_dir):
    """Collect image paths and their labels, paired by file name.

    Each ``*.jpg`` image in ``image_dir`` is matched with the ``.txt`` file of
    the same base name in ``label_dir``. Matching by name (rather than zipping
    two independently sorted listings) guarantees that a missing or extra file
    cannot silently shift every following image onto the wrong label.

    Args:
        image_dir: Directory containing the ``.jpg`` images.
        label_dir: Directory containing the YOLO-style ``.txt`` label files.

    Returns:
        A ``(image_paths, labels)`` tuple of equal-length lists, ordered by
        image path. Each label is a dict with keys ``'class'`` (one-hot vector
        of length ``NUM_CLASSES``) and ``'bbox'`` (the coordinates returned by
        ``parse_label_file``).
    """
    candidate_images = sorted(glob.glob(os.path.join(image_dir, '*.jpg'))) # Assuming .jpg, change if needed

    image_paths = []
    labels = []
    skipped = 0
    for image_path in candidate_images:
        stem = os.path.splitext(os.path.basename(image_path))[0]
        label_path = os.path.join(label_dir, stem + '.txt')
        if not os.path.isfile(label_path):
            # An image without a label cannot be used for supervised training.
            skipped += 1
            continue

        class_id, bbox = parse_label_file(label_path)
        # One-hot encode the class ID
        one_hot_class = tf.keras.utils.to_categorical(class_id, num_classes=NUM_CLASSES)
        image_paths.append(image_path)
        labels.append({'class': one_hot_class, 'bbox': bbox})

    if skipped:
        print(f"Skipped {skipped} image(s) in {image_dir} with no matching label file in {label_dir}")

    return image_paths, labels

def data_generator(image_paths, labels):
    """Yield one preprocessed ``(image, (bbox, class))`` sample per image path.

    Every image is read from disk, decoded as a 3-channel JPEG, resized to
    ``IMG_SIZE`` x ``IMG_SIZE`` and scaled to the ``[0, 1]`` range. The targets
    are taken from the ``'bbox'`` and ``'class'`` entries of the matching label
    dict, in that order.
    """
    target_size = [IMG_SIZE, IMG_SIZE]
    for path, target in zip(image_paths, labels):
        raw_bytes = tf.io.read_file(path)
        decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
        resized = tf.image.resize(decoded, target_size)
        # Pixel values come out of the resize as 0..255 floats; scale to 0..1.
        yield resized / 255.0, (target['bbox'], target['class'])

# Create TensorFlow datasets
train_images, train_labels = load_data(TRAIN_IMAGE_DIR, TRAIN_LABEL_DIR)
val_images, val_labels = load_data(VAL_IMAGE_DIR, VAL_LABEL_DIR)


def _make_dataset(image_paths, labels, shuffle=False):
    """Build a batched, prefetched ``tf.data.Dataset`` from paths and labels.

    Args:
        image_paths: List of image file paths.
        labels: List of label dicts aligned with ``image_paths``.
        shuffle: When True, the samples are visited in a fresh random order
            every time the dataset is iterated (i.e. every epoch). The file
            lists are sorted, so without this the model would see the samples
            in the same fixed order in every epoch.

    Returns:
        A dataset yielding ``(images, (bboxes, classes))`` batches.
    """
    def sample_stream():
        # from_generator calls this once per iteration over the dataset, so the
        # permutation below is redrawn for each epoch. Only the cheap path and
        # label lists are reordered; no decoded images are buffered in memory.
        if not shuffle:
            return data_generator(image_paths, labels)
        count = min(len(image_paths), len(labels))
        order = np.random.permutation(count)
        return data_generator(
            [image_paths[i] for i in order],
            [labels[i] for i in order],
        )

    dataset = tf.data.Dataset.from_generator(
        sample_stream,
        output_signature=(
            tf.TensorSpec(shape=(IMG_SIZE, IMG_SIZE, 3), dtype=tf.float32),
            (tf.TensorSpec(shape=(4,), dtype=tf.float32), tf.TensorSpec(shape=(NUM_CLASSES,), dtype=tf.float32))
        )
    )
    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


train_dataset = _make_dataset(train_images, train_labels, shuffle=True)
val_dataset = _make_dataset(val_images, val_labels)

E0000 00:00:1756202198.414375    1418 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1756202198.566878    1418 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


## 4. Modified Object Detection Model
Here, we adapt your original `CNN_model`. The main change is at the end. Instead of one output layer, we create two separate 'heads':
1.  **`bbox_head`**: Predicts the 4 bounding box coordinates.
2.  **`class_head`**: Predicts the class of the object, just like in classification.

In [None]:
def ObjectDetector_model(num_classes, input_shape=(512, 512, 3)):
    """Build a single-object detector with a shared backbone and two heads.

    The backbone is four convolutional stages (32, 64, 128 and 256 filters).
    Each stage is two 3x3 'same' convolutions, batch normalization, ReLU and a
    2x2 max-pool, and the result is flattened. Two dense layers with dropout
    feed both output heads.

    Args:
        num_classes: Number of units in the softmax classification head.
        input_shape: Shape of one input image, ``(height, width, channels)``.

    Returns:
        A ``keras.Model`` with outputs ``[bbox, class]``: ``bbox`` has 4
        sigmoid units (x_center, y_center, width, height) and ``class`` has
        ``num_classes`` softmax units.
    """
    layers = keras.layers

    # Base feature extractor (your original model), assembled stage by stage.
    stack = [layers.Input(shape=input_shape)]
    for filters in (32, 64, 128, 256):
        stack.extend([
            layers.Conv2D(filters, kernel_size=(3, 3), strides=(1, 1), padding='same'),
            layers.Conv2D(filters, kernel_size=(3, 3), strides=(1, 1), padding='same'),
            layers.BatchNormalization(),
            layers.ReLU(),
            layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'),
        ])
    stack.append(layers.Flatten())
    feature_extractor = keras.Sequential(stack, name='feature_extractor')

    # Dense trunk shared by both heads.
    shared = layers.Dense(512, activation='relu')(feature_extractor.output)
    shared = layers.Dense(256, activation='relu')(shared)
    shared = layers.Dropout(0.5)(shared)

    # Regression head: box as (x_center, y_center, width, height), squashed to (0, 1).
    bbox_output = layers.Dense(4, activation='sigmoid', name='bbox')(shared)

    # Classification head: probability distribution over the object classes.
    class_output = layers.Dense(num_classes, activation='softmax', name='class')(shared)

    return keras.Model(
        inputs=feature_extractor.input,
        outputs=[bbox_output, class_output],
    )

# Instantiate the model. The input shape is derived from IMG_SIZE (instead of
# relying on the function's 512 default) so the model always matches the size
# the data pipeline resizes images to.
model = ObjectDetector_model(NUM_CLASSES, input_shape=(IMG_SIZE, IMG_SIZE, 3))

## 5. Compiling and Training the Model
We compile the model with two different loss functions: `MeanSquaredError` for the bounding box regression and `CategoricalCrossentropy` for the classification. We can also assign different weights to each loss.
Then, we train the model using `model.fit()`.

In [None]:
# Define losses for each output head, keyed by the output layer names
losses = {
    'bbox': tf.keras.losses.MeanSquaredError(),
    'class': tf.keras.losses.CategoricalCrossentropy()
}

# Define weights for each loss (the total loss is their weighted sum)
loss_weights = {
    'bbox': 1.0,
    'class': 1.0
}

# Compile the model.
# Metrics are given per output: accuracy is only meaningful for the
# classification head, and a flat ['accuracy'] list on a two-output model is
# either rejected (list length must match the number of outputs) or applied to
# the bounding-box regression head as well, depending on the Keras version.
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=losses,
    loss_weights=loss_weights,
    metrics={'class': ['accuracy']}
)

model.summary()

# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS
)

## 6. Saving the Model
After training, we save the model to disk with `model.save()` so it can be reloaded later with Keras for inference. We also convert and save it as an `.onnx` file, as requested. You'll need to install the `tf2onnx` library for this step:

`pip install tf2onnx`

In [None]:
# Save the full model (architecture + weights) so it can be reloaded with
# tf.keras.models.load_model(). The path carries an explicit '.keras' extension
# because recent Keras versions refuse to save to a path without a '.keras' or
# '.h5' extension.
TF_MODEL_PATH = 'object_detector_model_tf.keras'
model.save(TF_MODEL_PATH)
print(f"Model saved in TensorFlow format at: {TF_MODEL_PATH}")

# Save in ONNX format (optional: requires the tf2onnx package)
try:
    import tf2onnx
except ImportError:
    print("Could not save in ONNX format. Please run 'pip install tf2onnx'")
else:
    # Only the import is guarded above, so a genuine conversion failure is not
    # misreported as a missing package.
    ONNX_MODEL_PATH = 'object_detector_model.onnx'
    # Batch dimension is left as None so the exported model accepts any batch size.
    spec = (tf.TensorSpec((None, IMG_SIZE, IMG_SIZE, 3), tf.float32, name="input"),)
    model_proto, _ = tf2onnx.convert.from_keras(model, input_signature=spec, opset=13, output_path=ONNX_MODEL_PATH)
    print(f"Model saved in ONNX format at: {ONNX_MODEL_PATH}")

## 7. Inference and Visualization
Finally, we'll create an inference function to test our trained model. This function will:
1. Load the saved TensorFlow model.
2. Preprocess a test image.
3. Get predictions from the model.
4. Convert the normalized coordinates back to pixel values.
5. Draw the bounding box and class label on the image using OpenCV.

In [None]:
def run_inference(model_path, image_path, class_names):
    """Load a model and an image, run inference, and display the predicted box.

    Args:
        model_path: Path of the saved Keras model to load.
        image_path: Path of the image to run the model on.
        class_names: Dict mapping integer class ids to display names. Ids that
            are not in the dict are shown as ``'Unknown'``.

    Raises:
        ValueError: If the image cannot be read by OpenCV.
    """
    # Load the model
    loaded_model = tf.keras.models.load_model(model_path)

    # Load and preprocess the image. cv2.imread signals failure by returning
    # None rather than raising, so check explicitly.
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        raise ValueError(f"Could not read image (missing or unsupported file): {image_path}")
    original_h, original_w = image_bgr.shape[:2]
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    image_resized = cv2.resize(image_rgb, (IMG_SIZE, IMG_SIZE))
    # float32 in [0, 1], matching the dtype and scaling used for training data.
    image_normalized = image_resized.astype(np.float32) / 255.0
    image_batch = np.expand_dims(image_normalized, axis=0) # Add batch dimension

    # Make predictions
    bbox_pred, class_pred = loaded_model.predict(image_batch)

    # Post-process the output: take the first (and only) prediction in the batch
    x_center, y_center, w, h = bbox_pred[0]

    # Denormalize centre/size to pixel corners and clamp them to the image, since
    # centre +/- half-size can fall outside [0, 1] even when each value is inside.
    x_min = int(np.clip((x_center - w / 2) * original_w, 0, original_w - 1))
    y_min = int(np.clip((y_center - h / 2) * original_h, 0, original_h - 1))
    x_max = int(np.clip((x_center + w / 2) * original_w, 0, original_w - 1))
    y_max = int(np.clip((y_center + h / 2) * original_h, 0, original_h - 1))

    # Find the predicted class
    predicted_class_id = int(np.argmax(class_pred[0]))
    confidence = float(np.max(class_pred[0]))
    class_label = class_names.get(predicted_class_id, 'Unknown')
    label_text = f'{class_label}: {confidence:.2f}'

    # Draw bounding box and label on the original image
    cv2.rectangle(image_bgr, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
    # Keep the label baseline far enough from the top edge that the text stays
    # visible when the box touches the top of the image.
    label_y = max(y_min - 10, 25)
    cv2.putText(image_bgr, label_text, (x_min, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    # Display the result (matplotlib expects RGB, OpenCV works in BGR)
    plt.imshow(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    plt.title('Inference Result')
    plt.axis('off')
    plt.show()

# --- Example Usage ---
# Placeholder mapping from class id to display name. Replace with your actual class names.
CLASS_NAMES = dict(enumerate(['cat', 'dog', 'car'])) # etc.

# Use the first few validation images as test samples
test_image_paths = val_images[:3]

for img_path in test_image_paths:
    # Announce the file first so each displayed plot can be traced to its image.
    print(f"Running inference on: {img_path}")
    run_inference(TF_MODEL_PATH, img_path, CLASS_NAMES)