# Model Comparison

I compared several models on a dataset of more than 6000 images of cars at junctions, in order to see which one best fits the task of a smart traffic light controller.
The models compared were YOLO 11 with different parameter sizes (nano, medium and large) and Baidu's RT-DETR, which is a transformer-based model for real-time image processing.
Inference took some time, so I logged the results to a CSV file.

In [None]:
import os
import glob
import csv
import time
from ultralytics import YOLO, RTDETR


# Put the data from getPhotos.sh in a data/ folder inside the project working directory

# Glob patterns for the image and label folders, one directory level below data/.
# Only LABELS_DIR is globbed by the evaluation code in this cell.
IMAGES_DIR = 'data/*/images'
LABELS_DIR = 'data/*/labels'

# Checkpoint to evaluate. Supported choices:
# yolo11n.pt, yolo11m.pt, yolo11l.pt, rtdetr-l.pt
MODEL_NAME = 'rtdetr-l.pt'

# CSV file that receives one appended row of metrics per evaluation run.
RESULTS_FILE = 'results_conf_05_with_process_time.csv'

# Dataset class id (key) -> class id reported by the model (value).
# NOTE(review): the values look like COCO ids (0 person, 1 bicycle, 2 car,
# 3 motorcycle, 5 bus) -- confirm against the dataset's own class list.
ID_MAPPING = {
    0: 1,
    1: 5,
    2: 2,
    3: 3,
    4: 0,
}

# Model-side class ids that take part in the evaluation; every other
# predicted class is ignored.
VALID_COCO_IDS = [*ID_MAPPING.values()]

def calculate_iou(box1, box2):
    """Return the intersection-over-union of two centre-format boxes.

    Each box is read as ``[x_center, y_center, width, height]`` (only the
    first four items are used). Both boxes must be expressed in the same
    coordinate system.

    Returns ``0.0`` when the boxes do not overlap. A small epsilon is added
    to the union so the division stays defined for zero-area boxes, which
    also means two identical boxes score marginally below ``1.0``.
    """

    def edges(box):
        # Convert centre/size to (left, top, right, bottom).
        half_w = box[2] / 2
        half_h = box[3] / 2
        return box[0] - half_w, box[1] - half_h, box[0] + half_w, box[1] + half_h

    left_a, top_a, right_a, bottom_a = edges(box1)
    left_b, top_b, right_b, bottom_b = edges(box2)

    # Extent of the overlap rectangle along each axis; negative means a gap.
    overlap_w = min(right_a, right_b) - max(left_a, left_b)
    overlap_h = min(bottom_a, bottom_b) - max(top_a, top_b)
    if overlap_w < 0 or overlap_h < 0:
        return 0.0

    intersection = overlap_w * overlap_h
    area_a = (right_a - left_a) * (bottom_a - top_a)
    area_b = (right_b - left_b) * (bottom_b - top_b)

    return intersection / (area_a + area_b - intersection + 1e-6)


print(f"Loading {MODEL_NAME}...")
# Checkpoints whose name contains "rtdetr" are loaded through the RTDETR
# wrapper; every other checkpoint is loaded as a YOLO model.
model_cls = RTDETR if "rtdetr" in MODEL_NAME.lower() else YOLO
model = model_cls(MODEL_NAME)

# Every .txt file under the folders matched by LABELS_DIR is one label file.
label_pattern = os.path.join(LABELS_DIR, "*.txt")
label_files = glob.glob(label_pattern)
print(f"Found {len(label_files)} label files. Starting evaluation...")

# Confidence threshold passed to the model, and the minimum IoU for a
# prediction to count as a match for a ground-truth box.
CONF_THRESHOLD = 0.5
IOU_MATCH_THRESHOLD = 0.5

# Detection counters accumulated over every evaluated image.
total_tp = 0
total_fp = 0
total_fn = 0
total_gt_count = 0
total_pred_count = 0

# Wall-clock time spent inside the model call, and the number of images timed.
total_inference_time = 0.0
processed_images = 0

for label_file in label_files:
    # --- Read the ground truth: "<class> <xc> <yc> <w> <h>" per row. ---
    ground_truths = []
    with open(label_file, 'r') as f:
        for line in f:
            parts = line.split()
            # Blank or truncated rows cannot describe a box; skipping them
            # avoids an IndexError here or later in calculate_iou.
            if len(parts) < 5:
                continue
            cls_id = int(parts[0])
            if cls_id not in ID_MAPPING:
                continue
            ground_truths.append({
                'cls': ID_MAPPING[cls_id],
                'box': [float(x) for x in parts[1:5]],
                'matched': False,
            })

    # Images without any relevant ground truth are not evaluated at all
    # (so detections on them are not counted as false positives).
    if not ground_truths:
        continue

    # --- Locate the image: same stem, in the sibling "images" folder. ---
    # Built from path components rather than str.replace so that a folder or
    # file name that merely contains "labels" or ".txt" is left untouched.
    labels_dir, label_name = os.path.split(label_file)
    stem = os.path.splitext(label_name)[0]
    images_dir = os.path.join(os.path.dirname(labels_dir), 'images')
    img_file = None
    for ext in ('.jpg', '.png'):
        candidate = os.path.join(images_dir, stem + ext)
        if os.path.exists(candidate):
            img_file = candidate
            break
    if img_file is None:
        continue

    # --- Run the model. perf_counter is monotonic, so the measured interval
    # cannot be distorted by system clock adjustments. ---
    start_time = time.perf_counter()
    results = model(img_file, verbose=False, conf=CONF_THRESHOLD)
    end_time = time.perf_counter()

    total_inference_time += (end_time - start_time)
    processed_images += 1

    # --- Keep only predictions of the classes being evaluated. ---
    predictions = []
    for r in results:
        for box in r.boxes:
            cls_id = int(box.cls[0])
            if cls_id in VALID_COCO_IDS:
                predictions.append({'cls': cls_id, 'box': box.xywhn[0].tolist()})

    total_gt_count += len(ground_truths)
    total_pred_count += len(predictions)

    # --- Greedy matching: each prediction claims the still-unmatched
    # ground-truth box of the same class with the highest IoU. ---
    for pred in predictions:
        best_iou = 0
        best_gt_idx = -1
        for i, gt in enumerate(ground_truths):
            if gt['cls'] == pred['cls'] and not gt['matched']:
                iou = calculate_iou(pred['box'], gt['box'])
                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = i

        if best_iou > IOU_MATCH_THRESHOLD:
            total_tp += 1
            ground_truths[best_gt_idx]['matched'] = True
        else:
            total_fp += 1

    # Ground-truth boxes that no prediction claimed are misses.
    total_fn += sum(1 for gt in ground_truths if not gt['matched'])

# Aggregate metrics over the whole run. The 1e-6 terms keep every division
# defined when a counter is zero.
predicted_positives = total_tp + total_fp
actual_positives = total_tp + total_fn

precision = total_tp / (predicted_positives + 1e-6)
recall = total_tp / (actual_positives + 1e-6)
f1_score = 2 * (precision * recall) / (precision + recall + 1e-6)
# Number of predicted objects relative to the number of labelled objects.
count_ratio = total_pred_count / (total_gt_count + 1e-6)

# Mean model-call time per evaluated image, in milliseconds.
mean_time_ms = (
    (total_inference_time / processed_images) * 1000
    if processed_images > 0
    else 0.0
)

banner = "=" * 40
report_lines = [
    "\n" + banner,
    f"RESULTS FOR: {MODEL_NAME}",
    f"Precision: {precision:.2%}",
    f"Recall:    {recall:.2%}",
    f"F1 Score:  {f1_score:.2f}",
    f"Count Ratio: {count_ratio:.2f}",
    f"Mean Time:   {mean_time_ms:.2f} ms",
    banner,
]
for report_line in report_lines:
    print(report_line)

# Append this run's metrics to RESULTS_FILE, one row per run.
file_exists = os.path.isfile(RESULTS_FILE)
# The header is needed for a new file and also for one that exists but is
# empty; otherwise the first data row would be read back as the header.
needs_header = not file_exists or os.path.getsize(RESULTS_FILE) == 0

# newline='' lets the csv module control line endings itself.
with open(RESULTS_FILE, mode='a', newline='') as f:
    writer = csv.writer(f)

    if needs_header:
        writer.writerow([
            'Model Name',
            'Precision',
            'Recall',
            'F1 Score',
            'Count Ratio',
            'True Positives',
            'False Positives',
            'False Negatives',
            'Mean Process Time (ms)',
        ])

    # Ratios are stored with four decimals, the timing with two.
    writer.writerow([
        MODEL_NAME,
        f"{precision:.4f}",
        f"{recall:.4f}",
        f"{f1_score:.4f}",
        f"{count_ratio:.4f}",
        total_tp,
        total_fp,
        total_fn,
        f"{mean_time_ms:.2f}",
    ])

print(f"Results saved to {RESULTS_FILE}")

### Results

I ran the models twice. The first run used a confidence threshold (`conf`) of 0.5 and also logged the mean processing time per image, to make sure the inference time fits the requirements of a traffic light (no more than a few seconds).
The second run used a `conf` of 0.25, to check whether a lower confidence threshold improves the results.

In [6]:
import pandas as pd

# Metrics from the conf=0.5 run; this is the file name the evaluation cell
# writes to via RESULTS_FILE, and it includes the mean processing time column.
results_conf_50 = pd.read_csv("results_conf_05_with_process_time.csv")
# display() is the notebook's rich-output helper (available in IPython/Jupyter).
display(results_conf_50)

# Metrics from the run described in the text as using conf 0.25.
results_conf_25 = pd.read_csv("results_conf_025.csv")
display(results_conf_25)

Unnamed: 0,Model Name,Precision,Recall,F1 Score,Count Ratio,True Positives,False Positives,False Negatives,Mean Process Time (ms)
0,yolo11n.pt,0.6615,0.0867,0.1534,0.1311,6907,3534,72721,47.62
1,yolo11m.pt,0.7264,0.2283,0.3474,0.3143,18181,6849,61447,235.12
2,yolo11l.pt,0.7017,0.2283,0.3446,0.3254,18183,7730,61445,258.41
3,rtdetr-l.pt,0.6317,0.5087,0.5636,0.8053,40507,23619,39121,1802.55


Unnamed: 0,Model Name,Precision,Recall,F1 Score,Count Ratio,True Positives,False Positives,False Negatives
0,yolo11l.pt,0.5717,0.3369,0.424,0.5894,26830,20103,52798
1,yolo11m.pt,0.6127,0.3361,0.4341,0.5486,26763,16917,52865
2,yolo11n.pt,0.5032,0.1896,0.2754,0.3767,15096,14903,64532
3,rtdetr-l.pt,0.3751,0.6576,0.4777,1.753,52364,87226,27264


### Conclusion

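From the results we can see quite clearly that rtdetr-l.pt, the transformer-based model, had the best performance: at conf 0.5 it reached the highest recall (0.51) and F1 score (0.56) of all the models.
Lowering conf to 0.25 raised recall for every model and improved the F1 score of the YOLO variants, but RT-DETR's precision dropped to 0.38 (it predicted about 1.75 times as many objects as are labelled), so its best result is the one at conf 0.5.
It does take a bit more time to process each image, but it is still less than 2 seconds, so I think it fits the requirements of a traffic light.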