# Model Selection

In [1]:
import os
import cv2
import numpy as np

import sys
sys.path.append("..")

# include your import statements

#### **Task 5: Assignment Instructions:**
A. Compare and contrast the performance of YOLO Model 1 and YOLO Model 2 by demonstrating the weaknesses and strengths of each model. 
B. Finally, select the best model and state the reasons why this model is best suited for the TechTrack's system implementation.

**Reminder:**  
- Your notebook should be well-structured and clear for effective presentation. Up to 10 points may be deducted for poor structure and clarity.
- Consider this report as if it were being reviewed by **TechTrack stakeholders**. Keep it professional, insightful, and visually organized!
- Use visualizations, tables, and quantitative analysis where applicable to support your findings.

---

#### **Task A:**
**Compare and contrast the performance of YOLO Model 1 and YOLO Model 2 by demonstrating the weaknesses and strengths of each model.**

In [18]:
import os
import numpy as np

def load_annotations(file_path):
    """
    Load bounding boxes, class IDs and confidences from an annotation file.

    Expected format per line (whitespace separated):
        class_id x y width height [confidence]
    The class_id is converted to int even when it is stored as a float
    string (e.g. "3.0"). Lines with fewer than five fields, including blank
    lines, are skipped.

    Args:
        file_path: Path of the annotation text file.

    Returns:
        Tuple ``(boxes, classes, confidences)`` of three parallel lists:
        ``boxes`` holds ``[x, y, width, height]`` floats, ``classes`` holds
        ints and ``confidences`` holds floats (1.0 when a line carries no
        sixth field). All three lists are empty when ``file_path`` does not
        exist.

    Raises:
        ValueError: If one of the parsed fields is not numeric.
    """
    boxes, classes, confidences = [], [], []

    # A missing file is treated as "no annotations" rather than an error.
    if not os.path.exists(file_path):
        return boxes, classes, confidences

    with open(file_path, "r") as f:
        for line in f:
            parts = line.split()
            if len(parts) < 5:
                continue
            cls, x, y, w, h = parts[:5]
            boxes.append([float(x), float(y), float(w), float(h)])
            # Convert class to int (even if stored as a float string).
            classes.append(int(float(cls)))
            # Use ">= 6" rather than "== 6": lines with extra trailing fields
            # are accepted above, so their sixth field must still be honoured
            # instead of silently falling back to 1.0.
            confidences.append(float(parts[5]) if len(parts) >= 6 else 1.0)
    return boxes, classes, confidences


def compute_iou(boxA, boxB):
    """
    Return the Intersection over Union (IoU) of two boxes.

    Each box is indexed as [x, y, width, height]; x and y are used as the
    minimum corner, so the opposite corner is (x + width, y + height).
    Returns 0 when the union area is zero.
    """
    # Corners of the overlap rectangle (may be inverted when boxes are disjoint).
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[0] + boxA[2], boxB[0] + boxB[2])
    bottom = min(boxA[1] + boxA[3], boxB[1] + boxB[3])

    # Clamp each side at zero so disjoint boxes yield an empty intersection.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    intersection = overlap_w * overlap_h

    union = boxA[2] * boxA[3] + boxB[2] * boxB[3] - intersection
    if union == 0:
        return 0
    return intersection / union


def compute_ap(recalls, precisions):
    """
    Average Precision via 11-point interpolation.

    For each recall level in {0.0, 0.1, ..., 1.0} the highest precision among
    the points whose recall is at least that level is taken (0 if there is no
    such point); the eleven values are averaged.
    """
    total = 0.0
    for level in np.linspace(0, 1, 11):
        # Best precision reachable at this recall level or beyond.
        best_precision = max(
            (prec for rec, prec in zip(recalls, precisions) if rec >= level),
            default=0,
        )
        total += best_precision / 11.0
    return total


def evaluate_class(gt_data, det_data, iou_thresh=0.5):
    """
    Evaluate detections for a single class.

    Detections are processed in descending confidence order. Each detection is
    compared with the ground truth boxes of its own image and paired with the
    one of highest IoU. It counts as a true positive when that IoU reaches
    ``iou_thresh`` and the ground truth box has not been claimed by an earlier
    detection; otherwise it counts as a false positive.

    Args:
        gt_data: dict mapping image_id -> list of ground truth boxes
            (each a list: [x, y, w, h]). It is not modified.
        det_data: list of tuples (image_id, box, confidence).
        iou_thresh: minimum IoU for a detection to match a ground truth box.

    Returns:
        ap: Average Precision for the class.
        recalls: Cumulative recall values (all zeros when there is no ground truth).
        precisions: Cumulative precision values.
        max_f1: Maximum F1 score computed from the precision-recall curve.
        avg_iou: Average IoU over true positive detections (0 if there are none).
    """
    npos = sum(len(boxes) for boxes in gt_data.values())

    # Separate "already matched" flags so gt_data itself is never mutated.
    detected_flags = {img_id: [False] * len(boxes) for img_id, boxes in gt_data.items()}

    # Sort detections by confidence in descending order.
    det_data = sorted(det_data, key=lambda x: x[2], reverse=True)

    tp = np.zeros(len(det_data))
    fp = np.zeros(len(det_data))
    tp_ious = []  # IoU of each true positive

    for i, (img_id, box, conf) in enumerate(det_data):
        gt_boxes = gt_data.get(img_id, [])
        best_iou = 0
        best_idx = -1
        for j, gt_box in enumerate(gt_boxes):
            iou = compute_iou(box, gt_box)
            if iou > best_iou:
                best_iou = iou
                best_idx = j

        # best_idx stays -1 when no ground truth box overlaps the detection.
        # Without this guard an iou_thresh <= 0 would let best_iou == 0 pass
        # the threshold and index the flags with -1: an IndexError when the
        # image has no ground truth, or a bogus match against the last box.
        matched = best_idx >= 0 and best_iou >= iou_thresh
        if matched and not detected_flags[img_id][best_idx]:
            tp[i] = 1
            detected_flags[img_id][best_idx] = True
            tp_ious.append(best_iou)
        else:
            # Below threshold, no overlap, or duplicate hit on a claimed box.
            fp[i] = 1

    tp_cumsum = np.cumsum(tp)
    fp_cumsum = np.cumsum(fp)
    recalls = tp_cumsum / npos if npos > 0 else np.zeros_like(tp_cumsum)
    # The small epsilon keeps the division defined.
    precisions = tp_cumsum / (tp_cumsum + fp_cumsum + 1e-6)
    ap = compute_ap(recalls, precisions)

    # F1 at every point of the precision-recall curve; keep the maximum.
    f1_scores = [2 * p * r / (p + r) if (p + r) > 0 else 0 for p, r in zip(precisions, recalls)]
    max_f1 = max(f1_scores) if f1_scores else 0
    avg_iou = np.mean(tp_ious) if tp_ious else 0

    return ap, recalls, precisions, max_f1, avg_iou


# --- Setup Directories ---
# Absolute paths of the ground truth folder and of each model's detection folder.
gt_dir, detections_dir1, detections_dir2 = (
    os.path.abspath(rel_path)
    for rel_path in ("../storage/logistics", "../detections", "../detections2")
)

# Ground truth annotation files: every .txt entry found in gt_dir.
gt_files = [entry for entry in os.listdir(gt_dir) if entry.endswith(".txt")]

# Containers filled below and by the detection-loading code.
gt_by_class = {}     # {class: {image_id: [box, box, ...]}}
det_by_class1 = {}   # {class: [(image_id, box, confidence), ...]}
det_by_class2 = {}   # {class: [(image_id, box, confidence), ...]}

# Group ground truth boxes by class, then by image (file name minus extension).
for file in gt_files:
    image_id = os.path.splitext(file)[0]
    gt_path = os.path.join(gt_dir, file)
    boxes, classes, _ = load_annotations(gt_path)
    for box, cls in zip(boxes, classes):
        gt_by_class.setdefault(cls, {}).setdefault(image_id, []).append(box)


def load_detections(file_path, num_classes=20):
    """
    Load detections from a file.

    Expected format per line (whitespace separated):
        x y w h confidence class_prob1 class_prob2 ... class_probN
    where N is ``num_classes``. Lines with fewer than ``5 + num_classes``
    fields (including blank lines) are skipped; fields beyond that count are
    ignored.

    The final detection confidence is computed as:
        final_confidence = confidence * max(class_probabilities)
    and the predicted class is the index of the maximum class probability.

    Args:
        file_path: Path of the detection text file.
        num_classes: Number of class probabilities stored per line. Defaults
            to 20, the count this loader was originally hard-coded for.

    Returns:
        Tuple ``(boxes, classes, confidences)`` of three parallel lists:
        ``[x, y, w, h]`` floats, predicted class indices, and final
        confidences. All three are empty when ``file_path`` does not exist.

    Raises:
        ValueError: If ``num_classes`` is smaller than 1 or a parsed field is
            not numeric.
    """
    if num_classes < 1:
        raise ValueError("num_classes must be at least 1")

    boxes, classes, confidences = [], [], []

    # A missing file is treated as "no detections" rather than an error.
    if not os.path.exists(file_path):
        return boxes, classes, confidences

    # 4 for box, 1 for detection confidence, num_classes for class probabilities.
    min_fields = 5 + num_classes

    with open(file_path, "r") as f:
        for line in f:
            parts = line.split()
            if len(parts) < min_fields:
                continue
            # Bounding box coordinates.
            x, y, w, h = map(float, parts[:4])
            # Detection confidence score.
            det_conf = float(parts[4])
            # Per-class probabilities.
            class_probs = [float(p) for p in parts[5:min_fields]]
            # Predicted class and its probability.
            best_class = int(np.argmax(class_probs))
            best_class_prob = class_probs[best_class]

            boxes.append([x, y, w, h])
            classes.append(best_class)
            confidences.append(det_conf * best_class_prob)
    return boxes, classes, confidences


def _collect_detections(detections_dir, det_by_class):
    """
    Read every .txt detection file in ``detections_dir`` into ``det_by_class``.

    Each detection is appended to ``det_by_class[predicted_class]`` as a tuple
    ``(image_id, box, confidence)``, where ``image_id`` is the file name
    without its extension. The dictionary is updated in place.

    Returns:
        The list of detection file names that were read.
    """
    det_files = [f for f in os.listdir(detections_dir) if f.endswith(".txt")]
    for file_name in det_files:
        image_id = os.path.splitext(file_name)[0]
        det_path = os.path.join(detections_dir, file_name)
        boxes, classes, confidences = load_detections(det_path)
        for box, cls, conf in zip(boxes, classes, confidences):
            det_by_class.setdefault(cls, []).append((image_id, box, conf))
    return det_files


# Populate detection data for Model 1 and Model 2 with the same routine so
# both models are guaranteed to be loaded identically.
det_files1 = _collect_detections(detections_dir1, det_by_class1)
det_files2 = _collect_detections(detections_dir2, det_by_class2)


# Load class names from the names file (one name per line).
class_names_path = os.path.abspath("../storage/yolo_models/logistics.names")
with open(class_names_path, "r") as f:
    class_names = [line.strip() for line in f]
# Drop trailing blank lines: each one would otherwise count as an extra,
# nameless class and be averaged into the overall metrics. Only trailing
# entries are removed so the line-number -> class-index mapping is untouched.
while class_names and not class_names[-1]:
    class_names.pop()
num_classes = len(class_names)


def compute_map_all_classes(num_classes, gt_by_class, det_by_class, iou_thresh=0.5):
    """
    Compute per-class AP, maximum F1 and average IoU, and their means.

    Every class index in ``range(num_classes)`` is evaluated. A class that has
    no entry in ``det_by_class`` scores 0 on all three metrics. One line per
    class is printed, using the module-level ``class_names`` for the label.

    Returns:
        (mAP, aps, mF1, mAvgIoU) where ``aps`` maps class index -> AP and the
        other three values are means over all evaluated classes.
    """
    aps, f1_scores, avg_ious = {}, {}, {}

    for cls in range(num_classes):
        # Copy the class's ground truth boxes (empty when the class has none)
        # so the evaluation works on its own independent data.
        class_gt = {
            img_id: [box.copy() for box in boxes]
            for img_id, boxes in gt_by_class.get(cls, {}).items()
        }

        # Defaults for a class the model never predicted.
        ap = f1 = avg_iou = 0
        if cls in det_by_class:
            ap, _recalls, _precisions, f1, avg_iou = evaluate_class(
                class_gt, det_by_class[cls], iou_thresh=iou_thresh
            )

        aps[cls], f1_scores[cls], avg_ious[cls] = ap, f1, avg_iou
        print(f"Class {cls} ({class_names[cls]}): AP = {ap:.3f}, Max F1 = {f1:.3f}, Avg IoU = {avg_iou:.3f}")

    return (
        np.mean(list(aps.values())),
        aps,
        np.mean(list(f1_scores.values())),
        np.mean(list(avg_ious.values())),
    )


# IoU threshold shared by both evaluations.
# NOTE(review): this is far below the 0.5 default of compute_map_all_classes /
# evaluate_class, so practically any overlap with a ground truth box counts as
# a match. Confirm this is intended before relying on the reported numbers.
IOU_THRESHOLD = 0.000005


def _print_overall(model_label, mean_ap, mean_f1, mean_avg_iou):
    """Print the three class-averaged metrics of one model."""
    print(f"\n--- Overall Metrics for {model_label} ---")
    print(f"{model_label} mAP: {mean_ap:.3f}")
    print(f"{model_label} mF1: {mean_f1:.3f}")
    print(f"{model_label} mAvgIoU: {mean_avg_iou:.3f}")


# --- Evaluate Model 1 ---
print("------ Model 1 Evaluation ------")
mAP_model1, aps1, mF1_model1, mAvgIoU_model1 = compute_map_all_classes(
    num_classes, gt_by_class, det_by_class1, iou_thresh=IOU_THRESHOLD
)
_print_overall("Model 1", mAP_model1, mF1_model1, mAvgIoU_model1)

# --- Evaluate Model 2 ---
print("\n------ Model 2 Evaluation ------")
mAP_model2, aps2, mF1_model2, mAvgIoU_model2 = compute_map_all_classes(
    num_classes, gt_by_class, det_by_class2, iou_thresh=IOU_THRESHOLD
)
_print_overall("Model 2", mAP_model2, mF1_model2, mAvgIoU_model2)


------ Model 1 Evaluation ------
Class 0 (barcode): AP = 0.485, Max F1 = 0.632, Avg IoU = 0.281
Class 1 (car): AP = 0.515, Max F1 = 0.618, Avg IoU = 0.376
Class 2 (cardboard box): AP = 0.336, Max F1 = 0.497, Avg IoU = 0.292
Class 3 (fire): AP = 0.019, Max F1 = 0.038, Avg IoU = 0.224
Class 4 (forklift): AP = 0.385, Max F1 = 0.601, Avg IoU = 0.325
Class 5 (freight container): AP = 0.152, Max F1 = 0.289, Avg IoU = 0.312
Class 6 (gloves): AP = 0.450, Max F1 = 0.650, Avg IoU = 0.247
Class 7 (helmet): AP = 0.111, Max F1 = 0.156, Avg IoU = 0.202
Class 8 (ladder): AP = 0.168, Max F1 = 0.283, Avg IoU = 0.322
Class 9 (license plate): AP = 0.091, Max F1 = 0.150, Avg IoU = 0.065
Class 10 (person): AP = 0.200, Max F1 = 0.347, Avg IoU = 0.272
Class 11 (qr code): AP = 0.438, Max F1 = 0.566, Avg IoU = 0.280
Class 12 (road sign): AP = 0.091, Max F1 = 0.166, Avg IoU = 0.326
Class 13 (safety vest): AP = 0.260, Max F1 = 0.402, Avg IoU = 0.258
Class 14 (smoke): AP = 0.239, Max F1 = 0.344, Avg IoU = 0.272
C

---

#### **Task B:**
**Select the best model and state the reasons why this model is best suited for the TechTrack's system implementation.**

*You select Model X...because...*

Based on the results, **Model 2** appears to be the better choice for TechTrack company's system. Here’s why:

### Comparison of Metrics

- **Mean Average Precision (mAP):**
  - **Model 1:** 0.299  
  - **Model 2:** 0.337  
  mAP is calculated by computing the Average Precision (AP) for each class and then taking the average across all classes. AP is computed using 11-point interpolation over the precision-recall curve. Mathematically, if the recall levels are $r_i = i/10$ for $i = 0, 1, \dots, 10$ and $p(r_i)$ denotes the maximum precision achieved at any recall greater than or equal to $r_i$, then:

  $$AP = \frac{1}{11} \sum_{i=0}^{10} p(r_i)$$
  
  A higher mAP indicates that, on average, the detector has a better trade-off between precision (the fraction of correct detections among all detections) and recall (the fraction of ground truth instances detected).

- **Mean Maximum F1 Score (mF1):**
  - **Model 1:** 0.416  
  - **Model 2:** 0.457  
  The F1 score is the harmonic mean of precision (P) and recall (R), computed as:
  $$
  F1 = \frac{2 \cdot P \cdot R}{P + R}
  $$
  In this evaluation, the maximum F1 score is determined by calculating the F1 score at various points along the precision-recall curve and selecting the highest value. A higher maximum F1 indicates that the model can achieve a better balance between precision and recall at an optimal threshold.

- **Mean Average IoU (mAvgIoU):**
  - **Model 1:** 0.282  
  - **Model 2:** 0.290  
  IoU (Intersection over Union) measures the overlap between the predicted bounding box and the ground truth box:
  $$
  IoU = \frac{\text{Area of Intersection}}{\text{Area of Union}}
  $$
  The average IoU is computed over all true positive detections. A higher IoU means that the bounding boxes predicted by the model are more accurately localized relative to the ground truth.

### Why Model 2 is Best Suited for TechTrack

1. **Overall Detection Accuracy (mAP):**  
   Model 2 achieves a higher mAP (0.337 vs. 0.299), meaning it generally has better precision-recall performance across all object classes. For TechTrack, where reliable detection is crucial, a higher mAP translates to fewer missed detections and fewer false alarms.

2. **Balanced Precision and Recall (Max F1 Score):**  
   With a higher mean maximum F1 score (0.457 vs. 0.416), Model 2 is better at striking a balance between precision and recall. This is especially important in operational settings where both false positives (which can cause unnecessary interventions) and false negatives (which can lead to missed detections) have cost implications.

3. **Better Localization (Avg IoU):**  
   Although the difference is smaller, Model 2 has a slightly higher average IoU (0.290 vs. 0.282). This means that the bounding boxes it produces are generally more accurate, ensuring that the spatial localization of objects is reliable—a critical factor for tasks such as tracking and navigation in TechTrack's applications.

### Summary

- **mAP (Mean Average Precision):** Evaluates overall detection performance. Higher values indicate a better balance of precision and recall across classes.
- **Max F1 Score:** Measures the best balance between precision and recall achievable by the model. It is the harmonic mean of precision and recall, where a higher score indicates a more effective detector.
- **Avg IoU (Average Intersection over Union):** Assesses the quality of localization for true positive detections. Higher IoU values indicate better overlap between predicted and ground truth bounding boxes.

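Given that Model 2 outperforms Model 1 in all three key metrics—detection accuracy, balance between precision and recall, and localization quality—it is best suited for TechTrack company's system implementation. However, I think something is wrong with my calculations: the performance of each model is suspiciously low. There might be a problem with the way I am loading the ground truth or the detections for each model that I ran. Note also that the evaluation above used an IoU threshold of 0.000005 rather than the function's default of 0.5, so almost any overlap with a ground-truth box is counted as a true positive; the reported values should be read with that in mind.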