# YOLO Velo Baseline + Debugging (AI Projektarbeit)

**Autor:** Dominic Wagenhofer  
**Modul:** EDS - Artificial Intelligence (HSLU)  
**Abgabe:** Video + *ein* dokumentiertes Python Notebook auf GitHub  

## YOLO + Tracking: Fahrräder in Vorbeifahr-Videos zählen

Dieses Notebook erkennt Fahrräder in kurzen Vorbeifahr-Videos entlang einer Fahrradabstellanlage.
Da nie alle Fahrräder gleichzeitig sichtbar sind, wird eine Zählung über die Zeit benötigt.

Methodik:
- Detektion: YOLOv8 (COCO)
- Verknüpfung über Frames: ByteTrack (Multi-Object-Tracking)
- Ergebnis: Anzahl eindeutiger Track-IDs als Schätzung der Fahrradanzahl

Artefakte:
- outputs/tracks.csv: Track-Tabelle (1 Zeile pro ID)
- outputs/frames/: Debug-Frames mit Bounding Boxes + IDs
- outputs/crops/: Repräsentative Crops pro ID
- outputs/annotated.mp4: Video mit eingeblendeten IDs (optional)

# Vorbereitung

## 1) Imports

In [1]:
from __future__ import annotations
from pathlib import Path
import time

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ultralytics import YOLO

## 2) Pfade und Konfiguration

In [32]:
# Working directory: typically .../eds-ai-git/notebooks
CWD = Path.cwd()

# Input video, resolved relative to the working directory (data/SCHOE3.mp4)
VIDEO_PATH = CWD / "data" / "SCHOE3.mp4"

# Outputs are written locally, inside the working directory
OUT_DIR    = CWD / "outputs"
FRAMES_DIR = OUT_DIR / "frames"
CROPS_DIR  = OUT_DIR / "crops"
OUT_DIR.mkdir(exist_ok=True, parents=True)
FRAMES_DIR.mkdir(exist_ok=True, parents=True)
CROPS_DIR.mkdir(exist_ok=True, parents=True)

# Model: prefer local weights when the file exists; otherwise pass the bare
# weight name (per the original note, Ultralytics then downloads the weights)
MODEL_WEIGHTS = CWD / "yolov8n.pt"
MODEL_NAME = str(MODEL_WEIGHTS) if MODEL_WEIGHTS.exists() else "yolov8n.pt"

# Detection/tracking parameters
CONF_THRES = 0.25
IOU_THRES  = 0.50
IMG_SIZE   = 640

# COCO: bicycle = 1
CLASS_IDS = [1]

# Performance: 1 = every frame
FRAME_STRIDE = 1

# Debug/exports
SAVE_ANNOTATED_VIDEO = True
SAVE_DEBUG_FRAMES_EVERY_N = 5   # 0 = disabled

# Echo the resolved paths so a wrong working directory is noticed early
print("CWD:", CWD)
print("VIDEO_PATH:", VIDEO_PATH)

CWD: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks
VIDEO_PATH: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/data/SCHOE3.mp4


## 3) Video-Checks

In [33]:
def open_video_or_raise(video_path: Path) -> cv2.VideoCapture:
    """Open a video with OpenCV or raise an error that explains what went wrong.

    Args:
        video_path: Location of the video file on disk.

    Returns:
        An opened ``cv2.VideoCapture``; the caller is responsible for
        releasing it.

    Raises:
        FileNotFoundError: ``video_path`` does not exist.
        RuntimeError: The file exists but OpenCV reports the capture as not
            opened.
    """
    if video_path.exists():
        capture = cv2.VideoCapture(str(video_path))
        if capture.isOpened():
            return capture
        raise RuntimeError(
            f"OpenCV konnte das Video nicht öffnen: {video_path}\n"
            "Mögliche Ursachen: Codec nicht unterstützt, Datei beschädigt, Pfad falsch."
        )

    # The hint shows an example location below the current working directory.
    example_path = Path.cwd() / "data" / "velopark.mp4"
    raise FileNotFoundError(
        f"Video nicht gefunden: {video_path}\n"
        f"Erwartet wird z. B. '{example_path}'."
    )


def get_video_meta(cap: cv2.VideoCapture) -> dict:
    """Read basic metadata from a video capture.

    Args:
        cap: Capture object to query.

    Returns:
        Dict with the keys ``fps``, ``frame_count``, ``width``, ``height`` and
        ``duration_s``. ``duration_s`` is ``frame_count / fps``, or ``None``
        when the reported fps is not a positive number.
    """
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count, width, height = (
        int(cap.get(prop))
        for prop in (
            cv2.CAP_PROP_FRAME_COUNT,
            cv2.CAP_PROP_FRAME_WIDTH,
            cv2.CAP_PROP_FRAME_HEIGHT,
        )
    )

    # Guard against a missing/zero fps so the division cannot fail.
    if fps and fps > 0:
        duration_s = frame_count / fps
    else:
        duration_s = None

    return {
        "fps": fps,
        "frame_count": frame_count,
        "width": width,
        "height": height,
        "duration_s": duration_s,
    }


# Open the configured video once to read its metadata, then release the
# capture again; the tracking function opens its own capture later.
cap = open_video_or_raise(VIDEO_PATH)
meta = get_video_meta(cap)
cap.release()
print("Video-Metadaten:", meta)

Video-Metadaten: {'fps': 29.994700582935877, 'frame_count': 283, 'width': 1080, 'height': 1920, 'duration_s': 9.435}


# Anwendung

## 4) Modell laden

In [34]:
# Instantiate the YOLO model from MODEL_NAME (local weights path or bare name,
# as resolved in the configuration cell).
model = YOLO(MODEL_NAME)
print("Modell geladen:", MODEL_NAME)

Modell geladen: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/yolov8n.pt


## 5) Helper: Boxen/Overlay

In [35]:
def xyxy_to_int(xyxy: np.ndarray) -> tuple[int, int, int, int]:
    """Convert one ``(x1, y1, x2, y2)`` box to a tuple of Python ints.

    Args:
        xyxy: Sequence of exactly four coordinates.

    Returns:
        ``(x1, y1, x2, y2)`` with each value passed through ``int()``
        (fractional parts are truncated toward zero).
    """
    left, top, right, bottom = xyxy
    return tuple(int(value) for value in (left, top, right, bottom))


def _clamp_axis(lo: int, hi: int, size: int) -> tuple[int, int]:
    """Clamp one axis of a box to ``[0, size - 1]`` and keep ``hi > lo`` if possible."""
    last = max(size - 1, 0)
    lo = max(0, min(lo, last))
    hi = max(0, min(hi, last))
    if hi <= lo:
        hi = lo + 1
        if hi > last:
            # The box sits on the last pixel: growing it would leave the image,
            # so move the start back by one pixel instead.
            hi = last
            lo = max(0, hi - 1)
    return lo, hi


def clamp_box(x1: int, y1: int, x2: int, y2: int, w: int, h: int) -> tuple[int,int,int,int]:
    """Clamp box coordinates to the image bounds.

    All coordinates are limited to ``[0, w - 1]`` / ``[0, h - 1]``. A box that
    collapses to zero width or height after clamping is widened to one pixel;
    when it sits on the last column/row, the start coordinate is moved back by
    one pixel so the result is still non-empty (this only cannot be satisfied
    for an axis of size 1 or less).

    Args:
        x1, y1: Top-left corner.
        x2, y2: Bottom-right corner.
        w, h: Image width and height in pixels.

    Returns:
        The clamped ``(x1, y1, x2, y2)``.
    """
    x1, x2 = _clamp_axis(x1, x2, w)
    y1, y2 = _clamp_axis(y1, y2, h)
    return x1, y1, x2, y2


def draw_tracks(frame_bgr: np.ndarray, tracks: list[dict]) -> np.ndarray:
    """Draw bounding boxes and track-ID labels onto a copy of a frame.

    Args:
        frame_bgr: Image to annotate; it is copied, not modified.
        tracks: Dicts providing the keys ``x1``, ``y1``, ``x2``, ``y2``,
            ``track_id`` and ``conf``.

    Returns:
        The annotated copy of ``frame_bgr``.
    """
    colour = (0, 255, 0)
    canvas = frame_bgr.copy()

    for track in tracks:
        top_left = (track["x1"], track["y1"])
        bottom_right = (track["x2"], track["y2"])
        label = f"id={track['track_id']} conf={track['conf']:.2f}"

        cv2.rectangle(canvas, top_left, bottom_right, colour, 2)

        # Keep the label at least 20 px from the top edge so it stays readable.
        label_origin = (top_left[0], max(20, top_left[1] - 5))
        cv2.putText(
            canvas,
            label,
            label_origin,
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            colour,
            2,
            cv2.LINE_AA,
        )

    return canvas

## 6) Tracking-Loop (YOLO + ByteTrack)

In [36]:
def track_bicycles_in_video(
    video_path: Path,
    model: YOLO,
    conf_thres: float,
    iou_thres: float,
    img_size: int,
    class_ids: list[int],
    frame_stride: int,
    save_debug_frames_every_n: int,
    save_annotated_video: bool,
    out_dir: Path,
) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """Run YOLO + ByteTrack over a whole video and collect the results.

    Args:
        video_path: Video file to process.
        model: Loaded YOLO model; ``model.track`` is called once per processed
            frame.
        conf_thres: Passed to ``model.track`` as ``conf``.
        iou_thres: Passed to ``model.track`` as ``iou``.
        img_size: Passed to ``model.track`` as ``imgsz``.
        class_ids: Passed to ``model.track`` as ``classes``.
        frame_stride: Process only every n-th frame; values <= 1 process every
            frame.
        save_debug_frames_every_n: Write an annotated JPEG for every n-th
            processed frame to ``out_dir / "frames"``; 0 disables the export.
        save_annotated_video: If true, write ``out_dir / "annotated.mp4"``
            containing every processed frame with its overlay.
        out_dir: Directory that receives the exports.

    Returns:
        ``(det_df, frame_df, summary)``:

        * ``det_df``: one row per detection (frame, time, track ID, class,
          confidence, clamped box, box area). Detections without a tracker ID
          carry ``track_id == -1``.
        * ``frame_df``: one row per processed frame. ``bike_count`` counts all
          detections, ``unique_ids_in_frame`` only those with a track ID;
          ``conf_mean`` / ``area_mean`` are 0.0 for frames without detections.
        * ``summary``: dict with ``video``, ``frames_processed``,
          ``unique_track_ids``, ``runtime_s`` and ``fps``.

    Raises:
        FileNotFoundError, RuntimeError: Propagated from
            ``open_video_or_raise`` when the video cannot be opened.
    """
    cap = open_video_or_raise(video_path)
    writer = None

    detections: list[dict] = []
    frame_rows: list[dict] = []
    processed = 0
    frame_idx = -1

    # try/finally guarantees that capture and writer are released even when
    # inference or file I/O fails halfway through the video.
    try:
        meta = get_video_meta(cap)
        # Fall back to 30 fps when the container reports no usable frame rate.
        fps = meta["fps"] if meta["fps"] and meta["fps"] > 0 else 30.0
        w, h = meta["width"], meta["height"]

        if save_annotated_video:
            out_dir.mkdir(exist_ok=True, parents=True)
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            # Only every frame_stride-th frame is written, so the output frame
            # rate is reduced accordingly to keep real-time playback speed.
            writer_fps = fps / frame_stride if frame_stride > 1 else fps
            writer = cv2.VideoWriter(
                str(out_dir / "annotated.mp4"), fourcc, writer_fps, (w, h)
            )

        frames_dir = out_dir / "frames"
        if save_debug_frames_every_n:
            frames_dir.mkdir(exist_ok=True, parents=True)

        t_start = time.time()

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_idx += 1

            if frame_stride > 1 and (frame_idx % frame_stride != 0):
                continue

            processed += 1
            t_sec = frame_idx / fps

            # persist=True keeps the tracker state across frames.
            # NOTE(review): the state presumably also survives across calls
            # with the same model object, so a second run may continue the ID
            # numbering - confirm against the Ultralytics docs.
            results = model.track(
                frame,
                persist=True,
                tracker="bytetrack.yaml",
                conf=conf_thres,
                iou=iou_thres,
                imgsz=img_size,
                verbose=False,
                classes=class_ids,
            )

            boxes = results[0].boxes
            tracks_this_frame: list[dict] = []

            if boxes is not None and len(boxes) > 0:
                xyxy = boxes.xyxy.cpu().numpy()
                confs = boxes.conf.cpu().numpy()
                classes = boxes.cls.cpu().numpy().astype(int)

                # Without tracker IDs every detection is marked with -1.
                if boxes.id is not None:
                    ids = boxes.id.cpu().numpy().astype(int)
                else:
                    ids = np.full(len(xyxy), -1, dtype=int)

                for box, conf, class_id, track_id in zip(xyxy, confs, classes, ids):
                    x1, y1, x2, y2 = clamp_box(*xyxy_to_int(box), w, h)
                    det = {
                        "frame": frame_idx,
                        "time_s": t_sec,
                        "track_id": int(track_id),
                        "class_id": int(class_id),
                        "conf": float(conf),
                        "x1": x1, "y1": y1, "x2": x2, "y2": y2,
                        "box_area": float((x2 - x1) * (y2 - y1)),
                    }
                    detections.append(det)
                    tracks_this_frame.append(det)

            unique_ids = {t["track_id"] for t in tracks_this_frame if t["track_id"] != -1}
            if tracks_this_frame:
                conf_mean = float(np.mean([t["conf"] for t in tracks_this_frame]))
                area_mean = float(np.mean([t["box_area"] for t in tracks_this_frame]))
            else:
                conf_mean = 0.0
                area_mean = 0.0

            frame_rows.append({
                "frame": frame_idx,
                "time_s": t_sec,
                "bike_count": len(tracks_this_frame),
                "unique_ids_in_frame": len(unique_ids),
                "conf_mean": conf_mean,
                "area_mean": area_mean,
            })

            save_debug = bool(save_debug_frames_every_n) and (
                processed % save_debug_frames_every_n == 0
            )

            # Render the overlay once and reuse it for both exports.
            if save_debug or writer is not None:
                overlay = draw_tracks(frame, tracks_this_frame)
                if save_debug:
                    out_path = frames_dir / f"frame_{frame_idx:06d}.jpg"
                    cv2.imwrite(str(out_path), overlay)
                if writer is not None:
                    writer.write(overlay)
    finally:
        cap.release()
        if writer is not None:
            writer.release()

    det_df = pd.DataFrame(detections)
    frame_df = pd.DataFrame(frame_rows)

    # Untracked detections (-1) do not count as a track.
    if len(det_df):
        n_unique = int(det_df.loc[det_df["track_id"] != -1, "track_id"].nunique())
    else:
        n_unique = 0
    runtime_s = time.time() - t_start

    summary = {
        "video": str(video_path),
        "frames_processed": int(processed),
        "unique_track_ids": n_unique,
        "runtime_s": float(runtime_s),
        "fps": float(fps),
    }
    return det_df, frame_df, summary

## 7) Ausführen + Hauptausgabe

In [37]:
# Run the tracking pipeline on the configured video using the notebook-level
# settings from the configuration cell.
det_df, frame_df, summary = track_bicycles_in_video(
    VIDEO_PATH,
    model=model,
    conf_thres=CONF_THRES,
    iou_thres=IOU_THRES,
    img_size=IMG_SIZE,
    class_ids=CLASS_IDS,
    frame_stride=FRAME_STRIDE,
    save_debug_frames_every_n=SAVE_DEBUG_FRAMES_EVERY_N,
    save_annotated_video=SAVE_ANNOTATED_VIDEO,
    out_dir=OUT_DIR,
)

# Last expression of the cell, so the notebook displays the summary dict.
summary

{'video': '/Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/data/SCHOE3.mp4',
 'frames_processed': 283,
 'unique_track_ids': 39,
 'runtime_s': 19.890625953674316,
 'fps': 29.994700582935877}

In [38]:
# Report the estimated bicycle count: the number of distinct tracker IDs.
if len(det_df) == 0:
    print("Keine Fahrräder erkannt. Parameter/Video prüfen.")
else:
    # Convert to plain Python ints: NumPy scalars would otherwise be printed
    # as "np.int64(1)" in the example list. -1 marks untracked detections.
    unique_ids = sorted(int(i) for i in det_df["track_id"].unique() if i != -1)
    print(f"Geschätzte Anzahl Velos im Video (Unique Track-IDs): {len(unique_ids)}")
    print("Beispiel-IDs:", unique_ids[:20])

Geschätzte Anzahl Velos im Video (Unique Track-IDs): 39
Beispiel-IDs: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(14), np.int64(18), np.int64(22), np.int64(45), np.int64(51), np.int64(67), np.int64(70), np.int64(74), np.int64(83), np.int64(90), np.int64(95), np.int64(103), np.int64(104), np.int64(109), np.int64(121), np.int64(126)]


## 8) Track-Tabelle (IDs exportieren)

In [39]:
# Column layout of the track table; also used for the empty result so that
# callers always see the same schema.
_TRACK_TABLE_COLUMNS = [
    "track_id",
    "first_frame", "last_frame", "first_time_s", "last_time_s",
    "detections", "conf_mean", "conf_max", "area_mean",
    "rep_frame", "rep_time_s", "rep_conf",
    "rep_x1", "rep_y1", "rep_x2", "rep_y2",
]


def build_track_table(det_df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate detections per track ID into one row per track.

    Args:
        det_df: Detection table with the columns ``track_id``, ``frame``,
            ``time_s``, ``conf``, ``x1``, ``y1``, ``x2``, ``y2`` and
            ``box_area``. Rows with ``track_id == -1`` are ignored.

    Returns:
        One row per track ID, sorted by ``track_id``: first/last frame and
        time, detection count, mean/max confidence, mean box area, plus the
        ``rep_*`` columns describing the detection with the highest
        confidence. If there is nothing to aggregate, an empty frame with the
        same columns is returned.
    """
    if len(det_df) == 0:
        return pd.DataFrame(columns=_TRACK_TABLE_COLUMNS)

    tracked = det_df[det_df["track_id"] != -1]
    if tracked.empty:
        return pd.DataFrame(columns=_TRACK_TABLE_COLUMNS)

    # Representative detection: the highest-confidence row of each track.
    idx_max_conf = tracked.groupby("track_id")["conf"].idxmax()
    rep = tracked.loc[
        idx_max_conf, ["track_id", "frame", "time_s", "conf", "x1", "y1", "x2", "y2"]
    ].rename(
        columns={
            "frame": "rep_frame", "time_s": "rep_time_s", "conf": "rep_conf",
            "x1": "rep_x1", "y1": "rep_y1", "x2": "rep_x2", "y2": "rep_y2",
        }
    )

    agg = tracked.groupby("track_id").agg(
        first_frame=("frame", "min"),
        last_frame=("frame", "max"),
        first_time_s=("time_s", "min"),
        last_time_s=("time_s", "max"),
        detections=("frame", "count"),
        conf_mean=("conf", "mean"),
        conf_max=("conf", "max"),
        area_mean=("box_area", "mean"),
    ).reset_index()

    return agg.merge(rep, on="track_id", how="left").sort_values("track_id")


# Build the per-track table from all detections and preview the first rows.
track_df = build_track_table(det_df)
track_df.head(10)

Unnamed: 0,track_id,first_frame,last_frame,first_time_s,last_time_s,detections,conf_mean,conf_max,area_mean,rep_frame,rep_time_s,rep_conf,rep_x1,rep_y1,rep_x2,rep_y2
0,1,0,37,0.0,1.233551,38,0.653817,0.775125,71704.578947,4,0.133357,0.775125,250,595,462,946
1,2,0,23,0.0,0.766802,24,0.665101,0.874754,338685.083333,17,0.566767,0.874754,630,562,1063,1483
2,3,0,57,0.0,1.900336,55,0.541855,0.760478,107504.363636,38,1.26689,0.760478,695,647,943,1216
3,4,0,31,0.0,1.033516,25,0.389417,0.584287,85413.32,26,0.86682,0.584287,784,646,1079,1181
4,14,24,77,0.800141,2.56712,52,0.508263,0.783416,109997.192308,71,2.367085,0.783416,855,666,1079,1141
5,18,26,27,0.86682,0.900159,2,0.530919,0.571101,40741.0,27,0.900159,0.571101,968,946,1079,1240
6,22,28,93,0.933498,3.100548,60,0.431775,0.668227,60086.616667,30,1.000177,0.668227,43,514,161,923
7,45,35,60,1.166873,2.000353,18,0.424274,0.617194,70208.777778,37,1.233551,0.617194,19,546,176,938
8,51,36,117,1.200212,3.900689,75,0.5328,0.6659,44732.56,84,2.800495,0.6659,530,718,664,1047
9,67,40,87,1.333569,2.900512,47,0.476034,0.803664,64919.021277,84,2.800495,0.803664,905,745,1079,1099


In [40]:
# Persist the track table as CSV (without the DataFrame index column).
tracks_csv = OUT_DIR / "tracks.csv"
track_df.to_csv(tracks_csv, index=False)
print("Exportiert:", tracks_csv)

Exportiert: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/tracks.csv


## 9) Pro ID: Frames/Crops exportieren

In [41]:
def read_frame_at(cap: cv2.VideoCapture, frame_idx: int) -> np.ndarray:
    """Seek to a frame position and read the frame found there.

    Args:
        cap: Opened capture; its read position is changed by this call.
        frame_idx: Frame position to seek to.

    Returns:
        The frame returned by ``cap.read()`` after seeking.

    Raises:
        RuntimeError: ``cap.read()`` reported failure.
    """
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
    success, image = cap.read()
    if success:
        return image
    raise RuntimeError(f"Frame {frame_idx} konnte nicht gelesen werden.")


def export_track_crops(video_path: Path, track_df: pd.DataFrame, crops_dir: Path, max_tracks: int | None = None) -> None:
    """Save one representative crop per track ID (its max-confidence frame).

    Args:
        video_path: Video the tracks were computed on.
        track_df: Track table providing ``track_id``, ``rep_frame`` and the
            ``rep_x1``/``rep_y1``/``rep_x2``/``rep_y2`` box columns.
        crops_dir: Target directory; created if missing. Files are named
            ``track_<id>.jpg`` with the ID zero-padded to five digits.
        max_tracks: If given, only the first ``max_tracks`` rows are exported.

    Raises:
        FileNotFoundError, RuntimeError: The video cannot be opened, or a
            representative frame cannot be read.
    """
    crops_dir.mkdir(exist_ok=True, parents=True)
    cap = open_video_or_raise(video_path)

    df = track_df if max_tracks is None else track_df.head(max_tracks)

    saved = 0
    # try/finally releases the capture even if a frame cannot be read.
    try:
        for _, row in df.iterrows():
            tid = int(row["track_id"])
            frame_idx = int(row["rep_frame"])
            x1, y1, x2, y2 = int(row["rep_x1"]), int(row["rep_y1"]), int(row["rep_x2"]), int(row["rep_y2"])

            frame = read_frame_at(cap, frame_idx)
            h, w = frame.shape[:2]
            x1, y1, x2, y2 = clamp_box(x1, y1, x2, y2, w, h)

            crop = frame[y1:y2, x1:x2]
            if crop.size == 0:
                # An empty image cannot be encoded; skip it instead of
                # aborting the whole export.
                print(f"Track {tid}: leerer Crop, übersprungen.")
                continue

            out_path = crops_dir / f"track_{tid:05d}.jpg"
            # imwrite signals failure through its return value, so only count
            # files that were actually written.
            if cv2.imwrite(str(out_path), crop):
                saved += 1
            else:
                print(f"Track {tid}: Crop konnte nicht geschrieben werden ({out_path}).")
    finally:
        cap.release()

    print(f"{saved} Crops gespeichert in {crops_dir}")


# Export one crop per track into CROPS_DIR (no limit on the number of tracks).
export_track_crops(VIDEO_PATH, track_df, CROPS_DIR)

39 Crops gespeichert in /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/crops


# Debugging

## 10) Track-Debug: Verdächtige IDs finden

In [42]:
# Surface the tracks that most deserve a manual look.
if len(track_df) == 0:
    print("Keine Tracks vorhanden.")
else:
    # Short-lived tracks (fewest detections): often fragmentation or false positives
    short_tracks = track_df.sort_values("detections").head(12)[
        ["track_id","detections","conf_mean","conf_max","first_time_s","last_time_s"]
    ]

    # Low max confidence: often difficult cases (blur/angle/occlusion)
    low_conf_tracks = track_df.sort_values("conf_max").head(12)[
        ["track_id","detections","conf_mean","conf_max","first_time_s","last_time_s"]
    ]

    # display() is not imported in this file; it is presumably the builtin
    # provided by the IPython/Jupyter environment.
    print("Kurzlebige Tracks:")
    display(short_tracks)

    print("Niedrige Max-Confidence:")
    display(low_conf_tracks)

Kurzlebige Tracks:


Unnamed: 0,track_id,detections,conf_mean,conf_max,first_time_s,last_time_s
38,275,1,0.499266,0.499266,9.00159,9.00159
37,269,2,0.555256,0.589928,8.101431,8.13477
5,18,2,0.530919,0.571101,0.86682,0.900159
13,90,3,0.366297,0.448186,1.600283,1.967014
14,95,4,0.409793,0.440634,1.833657,1.933675
28,211,4,0.317702,0.368635,4.73417,5.000883
15,103,5,0.3656,0.518307,2.033693,2.667138
23,186,7,0.32847,0.429194,4.234081,4.800848
36,267,8,0.686525,0.865683,7.968074,8.201449
32,247,8,0.354091,0.422645,6.267774,6.801201


Niedrige Max-Confidence:


Unnamed: 0,track_id,detections,conf_mean,conf_max,first_time_s,last_time_s
28,211,4,0.317702,0.368635,4.73417,5.000883
32,247,8,0.354091,0.422645,6.267774,6.801201
23,186,7,0.32847,0.429194,4.234081,4.800848
14,95,4,0.409793,0.440634,1.833657,1.933675
13,90,3,0.366297,0.448186,1.600283,1.967014
22,157,20,0.380174,0.483707,3.43394,4.167403
38,275,1,0.499266,0.499266,9.00159,9.00159
15,103,5,0.3656,0.518307,2.033693,2.667138
16,104,70,0.367319,0.540003,2.033693,4.667491
25,197,49,0.371364,0.55633,4.534134,7.467986


## 11) Frame-Debug: Verdächtige Frames automatisch wählen

In [43]:
# Work on a copy so frame_df itself stays unchanged.
df = frame_df.copy()
# Change in detection count relative to the previous processed frame
# (the first row has no predecessor and is set to 0).
df["delta_count"] = df["bike_count"].diff().fillna(0)

# Candidate frames: the 5 highest counts, the 5 lowest mean confidences, and
# the 5 largest upward / downward jumps in count. Rows picked by more than one
# criterion are kept once; the result is ordered by frame number.
suspicious = pd.concat([
    df.sort_values("bike_count", ascending=False).head(5),
    df.sort_values("conf_mean").head(5),
    df.sort_values("delta_count", ascending=False).head(5),
    df.sort_values("delta_count", ascending=True).head(5),
]).drop_duplicates().sort_values("frame")

# Last expression of the cell, so the notebook displays the selection.
suspicious[["frame","time_s","bike_count","conf_mean","delta_count"]]

Unnamed: 0,frame,time_s,bike_count,conf_mean,delta_count
44,44,1.466926,6,0.447385,-2.0
49,49,1.633622,10,0.466482,2.0
52,52,1.73364,9,0.439807,1.0
55,55,1.833657,9,0.427938,1.0
56,56,1.866996,6,0.477558,-3.0
57,57,1.900336,10,0.442092,4.0
58,58,1.933675,7,0.413686,-3.0
68,68,2.267067,8,0.459409,3.0
85,85,2.833834,9,0.475482,2.0
87,87,2.900512,9,0.465222,1.0


In [44]:
# Final summary: estimated count plus the locations of all exported artefacts.
if len(track_df) == 0:
    print("Keine Track-Ausgabe vorhanden.")
else:
    # The estimate is the number of distinct track IDs in the track table.
    est_total = int(track_df["track_id"].nunique())
    print("✅ Ergebnis")
    print("---------")
    print(f"Geschätzte Anzahl Velos im Video (Unique Track-IDs): {est_total}")
    print(f"Track-Export: {tracks_csv}")
    print(f"Debug-Frames: {FRAMES_DIR}")
    print(f"Track-Crops: {CROPS_DIR}")
    # The annotated video only exists when the export was enabled.
    if SAVE_ANNOTATED_VIDEO:
        print(f"Annotiertes Video: {OUT_DIR / 'annotated.mp4'}")


✅ Ergebnis
---------
Geschätzte Anzahl Velos im Video (Unique Track-IDs): 39
Track-Export: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/tracks.csv
Debug-Frames: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/frames
Track-Crops: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/crops
Annotiertes Video: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/annotated.mp4


# Artefakt: Annotiertes Video (Detektion + Tracking)

Zur qualitativen Evaluation wird ein annotiertes Video erzeugt, das für jeden
Frame die erkannten Fahrräder, deren Track-IDs sowie die zugehörige
Detektionssicherheit (Confidence) darstellt.

Dieses Artefakt dient sowohl der Ergebnisinterpretation als auch dem Debugging
von Fehlklassifikationen, ID-Switches und kurzlebigen Tracks.


In [45]:
import cv2
from pathlib import Path
from ultralytics import YOLO

# Stand-alone cell that renders the annotated video (detection + tracking).
# NOTE(review): this cell rebinds the notebook-level names VIDEO_PATH,
# CONF_THRES, IOU_THRES, IMG_SIZE, CLASS_IDS and model. CONF_THRES is 0.30
# here but 0.25 in the configuration cell, and the output path resolves to the
# same outputs/annotated.mp4 that track_bicycles_in_video writes when the
# working directory is unchanged - confirm that both are intended.

# ----------------------------
# Paths & parameters
# ----------------------------
VIDEO_PATH = Path("data/SCHOE3.mp4")          # input video
OUTPUT_VIDEO = Path("outputs/annotated.mp4")    # output
OUTPUT_VIDEO.parent.mkdir(exist_ok=True, parents=True)

CONF_THRES = 0.30
IOU_THRES = 0.50
IMG_SIZE = 640

# COCO class: bicycle = 1
CLASS_IDS = [1]

# ----------------------------
# Load model
# ----------------------------
model = YOLO("yolov8n.pt")

# ----------------------------
# Open video
# ----------------------------
cap = cv2.VideoCapture(str(VIDEO_PATH))
if not cap.isOpened():
    raise RuntimeError(f"Video konnte nicht geöffnet werden: {VIDEO_PATH}")

writer = None
frame_idx = 0  # number of frames written so far

# try/finally releases capture and writer even if inference fails mid-video.
try:
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Same fallback as in track_bicycles_in_video: a missing, zero or NaN
    # frame rate would otherwise produce an unusable writer.
    if not fps or not fps > 0:
        fps = 30.0
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # ----------------------------
    # Initialise VideoWriter
    # ----------------------------
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(str(OUTPUT_VIDEO), fourcc, fps, (width, height))

    # ----------------------------
    # Frame-by-frame processing
    # ----------------------------
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # YOLO + ByteTrack
        results = model.track(
            frame,
            persist=True,
            tracker="bytetrack.yaml",
            conf=CONF_THRES,
            iou=IOU_THRES,
            imgsz=IMG_SIZE,
            classes=CLASS_IDS,
            verbose=False,
        )

        annotated = frame.copy()

        boxes = results[0].boxes
        if boxes is not None and len(boxes) > 0:
            xyxy = boxes.xyxy.cpu().numpy()
            confs = boxes.conf.cpu().numpy()
            # Detections without a tracker ID are labelled with -1.
            ids = boxes.id.cpu().numpy().astype(int) if boxes.id is not None else [-1] * len(xyxy)

            for (x1, y1, x2, y2), conf, tid in zip(xyxy, confs, ids):
                x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])

                cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(
                    annotated,
                    f"id={tid} conf={conf:.2f}",
                    (x1, max(20, y1 - 5)),  # keep the label inside the frame
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6,
                    (0, 255, 0),
                    2,
                    cv2.LINE_AA,
                )

        writer.write(annotated)
        frame_idx += 1
finally:
    cap.release()
    if writer is not None:
        writer.release()

print(f"✅ Annotiertes Video erstellt: {OUTPUT_VIDEO.resolve()}")

✅ Annotiertes Video erstellt: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/annotated.mp4
