# YOLO + Tracking: Fahrräder in Vorbeifahr-Videos zählen

Dieses Notebook erkennt Fahrräder in kurzen Vorbeifahr-Videos entlang einer Fahrradabstellanlage.
Da nie alle Fahrräder gleichzeitig sichtbar sind, wird eine Zählung über die Zeit benötigt.

Methodik:
- Detektion: YOLOv8 (COCO)
- Verknüpfung über Frames: ByteTrack (Multi-Object-Tracking)
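- Ergebnis: Anzahl eindeutiger Track-IDs als Schätzung der Fahrradanzahl (Hinweis: fragmentierte Tracks oder False Positives können die Zahl erhöhen, siehe Abschnitt „Debugging“)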

Artefakte:
- outputs/tracks.csv: Track-Tabelle (1 Zeile pro ID)
- outputs/frames/: Debug-Frames mit Bounding Boxes + IDs
- outputs/crops/: Repräsentative Crops pro ID
- outputs/annotated.mp4: Video mit eingeblendeten IDs (optional)

# Vorbereitung

## 1) Imports

In [21]:
from __future__ import annotations

from pathlib import Path
import time

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ultralytics import YOLO

## 2) Pfade und Konfiguration

In [22]:
# Working directory: typically .../eds-ai-git/notebooks
CWD = Path.cwd()

# Input video, expected under <working directory>/data/SCHOE2.mp4
VIDEO_PATH = CWD / "data" / "SCHOE2.mp4"

# All outputs are written below the working directory; the folders are
# created right away so later cells can write into them.
OUT_DIR    = CWD / "outputs"
FRAMES_DIR = OUT_DIR / "frames"
CROPS_DIR  = OUT_DIR / "crops"
OUT_DIR.mkdir(exist_ok=True, parents=True)
FRAMES_DIR.mkdir(exist_ok=True, parents=True)
CROPS_DIR.mkdir(exist_ok=True, parents=True)

# Model: prefer local weights next to the notebook; otherwise pass the bare
# model name (per the original note, Ultralytics then downloads the weights).
MODEL_WEIGHTS = CWD / "yolov8n.pt"
MODEL_NAME = str(MODEL_WEIGHTS) if MODEL_WEIGHTS.exists() else "yolov8n.pt"

# Detection/tracking parameters (forwarded to model.track as conf/iou/imgsz)
CONF_THRES = 0.25
IOU_THRES  = 0.50
IMG_SIZE   = 640

# COCO: bicycle = 1
CLASS_IDS = [1]

# Performance: 1 = process every frame
FRAME_STRIDE = 1

# Debug/exports
SAVE_ANNOTATED_VIDEO = True
SAVE_DEBUG_FRAMES_EVERY_N = 5   # 0 = disabled

print("CWD:", CWD)
print("VIDEO_PATH:", VIDEO_PATH)

CWD: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks
VIDEO_PATH: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/data/SCHOE2.mp4


## 3) Video-Checks

In [23]:
def open_video_or_raise(video_path: Path | str) -> cv2.VideoCapture:
    """Open a video with OpenCV or raise a descriptive error.

    Args:
        video_path: Location of the video file. Plain strings are accepted
            and converted to ``Path``.

    Returns:
        The opened ``cv2.VideoCapture``. The caller is responsible for
        calling ``release()`` on it.

    Raises:
        FileNotFoundError: If ``video_path`` does not exist.
        RuntimeError: If OpenCV reports that the capture is not opened.
    """
    # Coerce first so that a str argument yields the same helpful error
    # instead of an AttributeError on ``.exists()``.
    video_path = Path(video_path)
    if not video_path.exists():
        raise FileNotFoundError(
            f"Video nicht gefunden: {video_path}\n"
            f"Erwartet wird z. B. '{Path.cwd() / 'data' / 'velopark.mp4'}'."
        )

    capture = cv2.VideoCapture(str(video_path))
    if not capture.isOpened():
        raise RuntimeError(
            f"OpenCV konnte das Video nicht öffnen: {video_path}\n"
            "Mögliche Ursachen: Codec nicht unterstützt, Datei beschädigt, Pfad falsch."
        )
    return capture


def get_video_meta(cap: cv2.VideoCapture) -> dict:
    """Collect basic metadata from an opened capture.

    Returns a dict with the keys ``fps``, ``frame_count``, ``width``,
    ``height`` and ``duration_s``. ``duration_s`` is ``None`` when the
    reported frame rate is missing or not positive.
    """
    info = {
        "fps": cap.get(cv2.CAP_PROP_FPS),
        "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
        "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
    }

    rate = info["fps"]
    if rate and rate > 0:
        info["duration_s"] = info["frame_count"] / rate
    else:
        # Without a usable frame rate the duration cannot be derived.
        info["duration_s"] = None
    return info


# Probe the video once up front; the capture is released even if reading
# the metadata fails, so no file handle is left open in the kernel.
cap = open_video_or_raise(VIDEO_PATH)
try:
    meta = get_video_meta(cap)
finally:
    cap.release()
print("Video-Metadaten:", meta)

Video-Metadaten: {'fps': 29.986589181940097, 'frame_count': 559, 'width': 1080, 'height': 1920, 'duration_s': 18.641666666666666}


# Anwendung

## 4) Modell laden

In [24]:
# Instantiate the detector from MODEL_NAME (local weights file if present,
# otherwise the bare model name chosen in the configuration cell).
model = YOLO(MODEL_NAME)
print("Modell geladen:", MODEL_NAME)

Modell geladen: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/yolov8n.pt


## 5) Helper: Boxen/Overlay

In [25]:
def xyxy_to_int(xyxy: np.ndarray) -> tuple[int, int, int, int]:
    """Return the four box coordinates as plain Python ints.

    Each value goes through ``int()``, i.e. it is truncated toward zero.
    The input must hold exactly four values.
    """
    left, top, right, bottom = map(int, xyxy)
    return left, top, right, bottom


def _clamp_span(lo: int, hi: int, size: int) -> tuple[int, int]:
    """Clamp one axis of a box to ``[0, size - 1]`` keeping ``hi > lo`` if possible."""
    last = size - 1
    lo = max(0, min(lo, last))
    hi = max(0, min(hi, last))
    if hi <= lo:
        hi = min(last, lo + 1)
        if hi <= lo:
            # lo already sits on the last pixel, so there is no room to grow
            # upward; move the lower edge inward instead.
            lo = max(0, hi - 1)
    return lo, hi


def clamp_box(x1: int, y1: int, x2: int, y2: int, w: int, h: int) -> tuple[int,int,int,int]:
    """Clamp box coordinates to the image bounds.

    All coordinates are limited to ``[0, w - 1]`` / ``[0, h - 1]``. Whenever
    the image is at least two pixels wide/high, the returned box has
    ``x2 > x1`` and ``y2 > y1``; this also holds for boxes that lie on or
    beyond the right/bottom edge, which previously collapsed to zero size.

    Args:
        x1, y1: Top-left corner.
        x2, y2: Bottom-right corner.
        w, h: Image width and height in pixels.

    Returns:
        The clamped ``(x1, y1, x2, y2)`` tuple.
    """
    x1, x2 = _clamp_span(x1, x2, w)
    y1, y2 = _clamp_span(y1, y2, h)
    return x1, y1, x2, y2


def draw_tracks(frame_bgr: np.ndarray, tracks: list[dict]) -> np.ndarray:
    """Return a copy of the frame with one box and one ID label per track.

    Each entry of ``tracks`` must provide the keys ``x1``, ``y1``, ``x2``,
    ``y2``, ``track_id`` and ``conf``. The input frame is not modified.
    """
    green = (0, 255, 0)
    canvas = frame_bgr.copy()

    for det in tracks:
        left, top, right, bottom = det["x1"], det["y1"], det["x2"], det["y2"]
        track_id = det["track_id"]
        score = det["conf"]

        cv2.rectangle(canvas, (left, top), (right, bottom), green, 2)

        # Keep the label at least 20 px below the top border so it stays visible.
        label_origin = (left, max(20, top - 5))
        cv2.putText(
            canvas,
            f"id={track_id} conf={score:.2f}",
            label_origin,
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            green,
            2,
            cv2.LINE_AA,
        )

    return canvas

## 6) Tracking-Loop (YOLO + ByteTrack)

In [26]:
def _boxes_to_detections(boxes, frame_idx: int, t_sec: float, w: int, h: int) -> list[dict]:
    """Convert the boxes of one tracking result into a list of detection dicts."""
    if boxes is None or len(boxes) == 0:
        return []

    xyxy = boxes.xyxy.cpu().numpy()
    conf = boxes.conf.cpu().numpy()
    cls = boxes.cls.cpu().numpy().astype(int)

    # boxes.id may be None; such detections are stored with track_id -1.
    if boxes.id is not None:
        ids = boxes.id.cpu().numpy().astype(int)
    else:
        ids = np.full(len(xyxy), -1, dtype=int)

    rows: list[dict] = []
    for box, score, class_id, track_id in zip(xyxy, conf, cls, ids):
        x1, y1, x2, y2 = clamp_box(*xyxy_to_int(box), w, h)
        rows.append({
            "frame": frame_idx,
            "time_s": t_sec,
            "track_id": int(track_id),
            "class_id": int(class_id),
            "conf": float(score),
            "x1": x1, "y1": y1, "x2": x2, "y2": y2,
            "box_area": float((x2 - x1) * (y2 - y1)),
        })
    return rows


def track_bicycles_in_video(
    video_path: Path,
    model: YOLO,
    conf_thres: float,
    iou_thres: float,
    img_size: int,
    class_ids: list[int],
    frame_stride: int,
    save_debug_frames_every_n: int,
    save_annotated_video: bool,
    out_dir: Path,
) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """Run YOLO + ByteTrack over the whole video.

    Args:
        video_path: Video file to process.
        model: Loaded YOLO model; ``model.track`` is called once per
            processed frame with ``persist=True``.
        conf_thres: Forwarded to ``model.track`` as ``conf``.
        iou_thres: Forwarded to ``model.track`` as ``iou``.
        img_size: Forwarded to ``model.track`` as ``imgsz``.
        class_ids: Forwarded to ``model.track`` as ``classes``.
        frame_stride: Process every n-th frame; values <= 1 process all frames.
        save_debug_frames_every_n: Write an annotated JPEG for every n-th
            processed frame into ``out_dir / "frames"``; 0 disables this.
        save_annotated_video: Write ``out_dir / "annotated.mp4"`` when true.
            With ``frame_stride > 1`` the file is written at ``fps / frame_stride``
            so that playback speed matches the source video.
        out_dir: Directory for the optional exports.

    Returns:
        ``(det_df, frame_df, summary)`` where

        - ``det_df`` has one row per detection (frame, time, track ID, class,
          confidence, box, box area),
        - ``frame_df`` has one row per processed frame (counts and mean
          confidence/area),
        - ``summary`` is a dict with the keys ``video``, ``frames_processed``,
          ``unique_track_ids``, ``runtime_s`` and ``fps``.

    Raises:
        FileNotFoundError, RuntimeError: Propagated from ``open_video_or_raise``.
    """
    cap = open_video_or_raise(video_path)
    writer = None

    detections: list[dict] = []
    frame_rows: list[dict] = []
    processed = 0

    # try/finally guarantees that the capture and the writer are released
    # even when tracking or file export fails midway.
    try:
        meta = get_video_meta(cap)
        fps = meta["fps"] if meta["fps"] and meta["fps"] > 0 else 30.0
        w, h = meta["width"], meta["height"]

        if save_annotated_video:
            out_dir.mkdir(exist_ok=True, parents=True)
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            # Only every frame_stride-th frame is written, so the output
            # frame rate is reduced accordingly.
            writer_fps = fps / frame_stride if frame_stride > 1 else fps
            writer = cv2.VideoWriter(str(out_dir / "annotated.mp4"), fourcc, writer_fps, (w, h))

        frames_dir = out_dir / "frames"
        if save_debug_frames_every_n:
            frames_dir.mkdir(exist_ok=True, parents=True)

        frame_idx = -1
        t_start = time.time()

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_idx += 1

            if frame_stride > 1 and (frame_idx % frame_stride != 0):
                continue

            processed += 1
            t_sec = frame_idx / fps

            # persist=True keeps the tracker state across frames.
            # NOTE(review): the recorded run shows IDs starting at 308, which
            # suggests tracker state / the ID counter also carries over between
            # calls that reuse the same model object — confirm whether a reset
            # is wanted before processing a new video.
            results = model.track(
                frame,
                persist=True,
                tracker="bytetrack.yaml",
                conf=conf_thres,
                iou=iou_thres,
                imgsz=img_size,
                verbose=False,
                classes=class_ids,
            )

            tracks_this_frame = _boxes_to_detections(results[0].boxes, frame_idx, t_sec, w, h)
            detections.extend(tracks_this_frame)

            unique_ids = {t["track_id"] for t in tracks_this_frame if t["track_id"] != -1}
            conf_mean = float(np.mean([t["conf"] for t in tracks_this_frame])) if tracks_this_frame else 0.0
            area_mean = float(np.mean([t["box_area"] for t in tracks_this_frame])) if tracks_this_frame else 0.0

            frame_rows.append({
                "frame": frame_idx,
                "time_s": t_sec,
                "bike_count": len(tracks_this_frame),
                "unique_ids_in_frame": len(unique_ids),
                "conf_mean": conf_mean,
                "area_mean": area_mean,
            })

            save_debug = bool(save_debug_frames_every_n) and (
                processed % save_debug_frames_every_n == 0
            )
            if save_debug or writer is not None:
                # Draw once and reuse the overlay for both exports.
                overlay = draw_tracks(frame, tracks_this_frame)
                if save_debug:
                    cv2.imwrite(str(frames_dir / f"frame_{frame_idx:06d}.jpg"), overlay)
                if writer is not None:
                    writer.write(overlay)
    finally:
        cap.release()
        if writer is not None:
            writer.release()

    det_df = pd.DataFrame(detections)
    frame_df = pd.DataFrame(frame_rows)

    unique_track_ids = sorted(i for i in det_df["track_id"].unique().tolist() if i != -1) if len(det_df) else []
    runtime_s = time.time() - t_start

    summary = {
        "video": str(video_path),
        "frames_processed": int(processed),
        "unique_track_ids": int(len(unique_track_ids)),
        "runtime_s": float(runtime_s),
        "fps": float(fps),
    }
    return det_df, frame_df, summary

## 7) Ausführen + Hauptausgabe

In [28]:
# Run detection + tracking over the configured video with the parameters
# from the configuration cell.
det_df, frame_df, summary = track_bicycles_in_video(
    VIDEO_PATH,
    model=model,
    conf_thres=CONF_THRES,
    iou_thres=IOU_THRES,
    img_size=IMG_SIZE,
    class_ids=CLASS_IDS,
    frame_stride=FRAME_STRIDE,
    save_debug_frames_every_n=SAVE_DEBUG_FRAMES_EVERY_N,
    save_annotated_video=SAVE_ANNOTATED_VIDEO,
    out_dir=OUT_DIR,
)

# Bare expression so the notebook displays the summary dict as cell output.
summary

{'video': '/Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/data/SCHOE2.mp4',
 'frames_processed': 559,
 'unique_track_ids': 40,
 'runtime_s': 34.22434115409851,
 'fps': 29.986589181940097}

In [29]:
if len(det_df) == 0:
    print("Keine Fahrräder erkannt. Parameter/Video prüfen.")
else:
    # .tolist() turns the NumPy integers into plain Python ints so the
    # printed list reads [308, 309, ...] instead of [np.int64(308), ...].
    # -1 marks detections without a track ID and is not counted.
    unique_ids = sorted(i for i in det_df["track_id"].unique().tolist() if i != -1)
    print(f"Geschätzte Anzahl Velos im Video (Unique Track-IDs): {len(unique_ids)}")
    print("Beispiel-IDs:", unique_ids[:20])

Geschätzte Anzahl Velos im Video (Unique Track-IDs): 40
Beispiel-IDs: [np.int64(308), np.int64(309), np.int64(310), np.int64(315), np.int64(319), np.int64(327), np.int64(332), np.int64(342), np.int64(343), np.int64(359), np.int64(366), np.int64(370), np.int64(372), np.int64(397), np.int64(399), np.int64(412), np.int64(415), np.int64(419), np.int64(423), np.int64(431)]


## 8) Track-Tabelle (IDs exportieren)

In [30]:
def build_track_table(det_df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the per-detection table into one row per track ID.

    Detections with ``track_id == -1`` are ignored. Each output row holds
    the first/last frame and time, the number of detections, confidence and
    area statistics, plus the ``rep_*`` columns describing the detection
    with the highest confidence of that track. An input without rows
    yields an empty DataFrame.
    """
    if not len(det_df):
        return pd.DataFrame()

    tracked = det_df.copy()
    tracked = tracked[tracked["track_id"] != -1]
    per_track = tracked.groupby("track_id")

    # Representative detection: the row with the highest confidence per ID.
    rep_source_cols = ["track_id", "frame", "time_s", "conf", "x1", "y1", "x2", "y2"]
    rep_names = {
        "frame": "rep_frame",
        "time_s": "rep_time_s",
        "conf": "rep_conf",
        "x1": "rep_x1",
        "y1": "rep_y1",
        "x2": "rep_x2",
        "y2": "rep_y2",
    }
    best_rows = per_track["conf"].idxmax()
    representative = tracked.loc[best_rows, rep_source_cols].rename(columns=rep_names)

    stats = per_track.agg(
        first_frame=("frame", "min"),
        last_frame=("frame", "max"),
        first_time_s=("time_s", "min"),
        last_time_s=("time_s", "max"),
        detections=("frame", "count"),
        conf_mean=("conf", "mean"),
        conf_max=("conf", "max"),
        area_mean=("box_area", "mean"),
    ).reset_index()

    table = stats.merge(representative, on="track_id", how="left")
    return table.sort_values("track_id")


# Aggregate all detections into one row per track ID and preview the result.
track_df = build_track_table(det_df)
track_df.head(10)

Unnamed: 0,track_id,first_frame,last_frame,first_time_s,last_time_s,detections,conf_mean,conf_max,area_mean,rep_frame,rep_time_s,rep_conf,rep_x1,rep_y1,rep_x2,rep_y2
0,308,1,113,0.033348,3.768351,111,0.727516,0.952615,119610.144144,72,2.401073,0.952615,234,902,664,1311
1,309,1,53,0.033348,1.767457,44,0.634166,0.921226,134535.75,49,1.634064,0.921226,628,853,1079,1396
2,310,3,6,0.100045,0.200089,4,0.426387,0.465396,123268.0,4,0.133393,0.465396,146,733,462,1140
3,315,10,78,0.333482,2.601163,62,0.608599,0.900862,68035.919355,67,2.234332,0.900862,833,931,1079,1262
4,319,29,51,0.967099,1.70076,14,0.435158,0.681209,104846.928571,34,1.13384,0.681209,628,907,915,1302
5,327,50,111,1.667412,3.701655,59,0.554446,0.790033,149589.491525,73,2.434422,0.790033,80,797,486,1196
6,332,77,206,2.567815,6.869738,119,0.621619,0.849698,94809.747899,135,4.502013,0.849698,366,876,655,1223
7,342,92,125,3.068038,4.16853,28,0.494796,0.744221,113405.25,99,3.301476,0.744221,131,433,526,718
8,343,92,218,3.068038,7.269917,106,0.644385,0.879434,84392.632075,152,5.068933,0.879434,217,894,510,1216
9,359,122,220,4.068485,7.336613,97,0.714267,0.906141,95326.927835,156,5.202326,0.906141,0,800,264,1131


In [31]:
# Persist the track table (one row per track ID) without the DataFrame index.
tracks_csv = OUT_DIR / "tracks.csv"
track_df.to_csv(tracks_csv, index=False)
print("Exportiert:", tracks_csv)

Exportiert: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/tracks.csv


## 9) Pro ID: Frames/Crops exportieren

In [32]:
def read_frame_at(cap: cv2.VideoCapture, frame_idx: int) -> np.ndarray:
    """Seek the capture to ``frame_idx`` and return the frame read there.

    Raises:
        RuntimeError: If the read after seeking does not succeed.
    """
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
    success, image = cap.read()
    if success:
        return image
    raise RuntimeError(f"Frame {frame_idx} konnte nicht gelesen werden.")


def export_track_crops(video_path: Path, track_df: pd.DataFrame, crops_dir: Path, max_tracks: int | None = None) -> None:
    """Save one representative crop per track ID (its max-confidence detection).

    Args:
        video_path: Video the tracks were computed from.
        track_df: Track table providing ``track_id``, ``rep_frame`` and the
            ``rep_x1``/``rep_y1``/``rep_x2``/``rep_y2`` box columns.
        crops_dir: Target directory; created if missing. Files are named
            ``track_<id>.jpg`` with the ID zero-padded to five digits.
        max_tracks: If given, only the first ``max_tracks`` rows are exported.

    Raises:
        FileNotFoundError, RuntimeError: Propagated from ``open_video_or_raise``
            and ``read_frame_at``.
    """
    crops_dir.mkdir(exist_ok=True, parents=True)

    df = track_df.copy()
    if max_tracks is not None:
        df = df.head(max_tracks)

    saved = 0
    cap = open_video_or_raise(video_path)
    # try/finally releases the capture even if a frame cannot be read.
    try:
        for _, row in df.iterrows():
            tid = int(row["track_id"])
            frame_idx = int(row["rep_frame"])
            x1, y1, x2, y2 = int(row["rep_x1"]), int(row["rep_y1"]), int(row["rep_x2"]), int(row["rep_y2"])

            frame = read_frame_at(cap, frame_idx)
            h, w = frame.shape[:2]
            x1, y1, x2, y2 = clamp_box(x1, y1, x2, y2, w, h)

            crop = frame[y1:y2, x1:x2]
            if crop.size == 0:
                # A degenerate box leaves nothing to write; skip it rather
                # than handing an empty image to cv2.imwrite.
                continue

            out_path = crops_dir / f"track_{tid:05d}.jpg"
            # cv2.imwrite's return value is used so that only crops that
            # were actually written are counted.
            if cv2.imwrite(str(out_path), crop):
                saved += 1
    finally:
        cap.release()

    print(f"{saved} Crops gespeichert in {crops_dir}")


# Write one crop per track into CROPS_DIR (no limit on the number of tracks).
export_track_crops(VIDEO_PATH, track_df, CROPS_DIR)

40 Crops gespeichert in /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/crops


# Debugging

## 10) Track-Debug: Verdächtige IDs finden

In [34]:
if len(track_df) > 0:
    _debug_cols = ["track_id","detections","conf_mean","conf_max","first_time_s","last_time_s"]

    # Short-lived tracks: often fragmentation or false positives.
    short_tracks = track_df.sort_values("detections").head(12)[_debug_cols]

    # Low confidence: often hard cases (blur/angle/occlusion).
    low_conf_tracks = track_df.sort_values("conf_max").head(12)[_debug_cols]

    print("Kurzlebige Tracks:")
    display(short_tracks)

    print("Niedrige Max-Confidence:")
    display(low_conf_tracks)
else:
    print("Keine Tracks vorhanden.")

Kurzlebige Tracks:


Unnamed: 0,track_id,detections,conf_mean,conf_max,first_time_s,last_time_s
13,397,1,0.530295,0.530295,6.469559,6.469559
33,542,1,0.445665,0.445665,15.006708,15.006708
24,473,2,0.368931,0.375512,11.271705,11.305054
16,415,2,0.4991,0.545095,7.903533,7.936881
12,372,3,0.41116,0.456978,4.968888,5.035584
10,366,3,0.411472,0.466888,4.868843,5.068933
27,501,4,0.42527,0.468757,12.50559,12.605635
2,310,4,0.426387,0.465396,0.100045,0.200089
39,587,5,0.381391,0.482043,16.574076,16.87421
11,370,6,0.384254,0.563752,4.93554,5.102281


Niedrige Max-Confidence:


Unnamed: 0,track_id,detections,conf_mean,conf_max,first_time_s,last_time_s
24,473,2,0.368931,0.375512,11.271705,11.305054
33,542,1,0.445665,0.445665,15.006708,15.006708
12,372,3,0.41116,0.456978,4.968888,5.035584
2,310,4,0.426387,0.465396,0.100045,0.200089
10,366,3,0.411472,0.466888,4.868843,5.068933
27,501,4,0.42527,0.468757,12.50559,12.605635
18,423,11,0.367057,0.477685,8.303712,9.604293
39,587,5,0.381391,0.482043,16.574076,16.87421
36,559,11,0.427329,0.511723,15.506932,15.973807
13,397,1,0.530295,0.530295,6.469559,6.469559


## 11) Frame-Debug: Verdächtige Frames automatisch wählen

In [36]:
df = frame_df.copy()
# Change of the per-frame count relative to the previous processed frame.
df["delta_count"] = df["bike_count"].diff().fillna(0)

# (sort column, ascending) pairs: most bikes, lowest mean confidence,
# largest increase and largest decrease of the count.
_rankings = [
    ("bike_count", False),
    ("conf_mean", True),
    ("delta_count", False),
    ("delta_count", True),
]
_top_frames = [df.sort_values(col, ascending=asc).head(5) for col, asc in _rankings]
suspicious = pd.concat(_top_frames).drop_duplicates().sort_values("frame")

suspicious[["frame","time_s","bike_count","conf_mean","delta_count"]]

Unnamed: 0,frame,time_s,bike_count,conf_mean,delta_count
49,49,1.634064,2,0.907867,-2.0
274,274,9.137418,5,0.498223,3.0
367,367,12.238804,2,0.741627,-3.0
385,385,12.839073,6,0.431339,2.0
386,386,12.872421,6,0.441022,0.0
387,387,12.905769,2,0.558077,-4.0
388,388,12.939117,5,0.448446,3.0
470,470,15.673673,7,0.503125,2.0
473,473,15.773718,6,0.498568,3.0
474,474,15.807066,6,0.438351,0.0


In [37]:
if len(track_df) > 0:
    # Final estimate: number of distinct track IDs in the track table.
    est_total = int(track_df["track_id"].nunique())
    print("✅ Ergebnis")
    print("---------")
    print(f"Geschätzte Anzahl Velos im Video (Unique Track-IDs): {est_total}")
    print(f"Track-Export: {tracks_csv}")
    print(f"Debug-Frames: {FRAMES_DIR}")
    print(f"Track-Crops: {CROPS_DIR}")
    if SAVE_ANNOTATED_VIDEO:
        print(f"Annotiertes Video: {OUT_DIR / 'annotated.mp4'}")
else:
    print("Keine Track-Ausgabe vorhanden.")


✅ Ergebnis
---------
Geschätzte Anzahl Velos im Video (Unique Track-IDs): 40
Track-Export: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/tracks.csv
Debug-Frames: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/frames
Track-Crops: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/crops
Annotiertes Video: /Users/dominicwagenhofer/Desktop/Studium/7. Semester/Artificial Intelligence/Einzelarbeit/Git Repo/eds-ai-git/notebooks/outputs/annotated.mp4
