# Environment check and imports


In [None]:
# Core imports and environment check
import torch
import cv2
import numpy as np
from collections import defaultdict
from ultralytics import YOLO

# Report the library versions in use and whether PyTorch can reach CUDA.
print(f"PyTorch version: {torch.__version__}")
print(f"OpenCV version: {cv2.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Single status line: the name of GPU 0 when CUDA is usable, otherwise a
# notice that inference will run on the CPU. Only the selected branch of the
# conditional expression is evaluated, so the device name is never queried
# without CUDA.
print(
    f"GPU device: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "Running on CPU. For real-time performance, a GPU is recommended."
)


PyTorch version: 2.7.1+cu118
OpenCV version: 4.12.0
CUDA available: True
GPU device: NVIDIA GeForce RTX 4050 Laptop GPU


# Model loading and configuration

In [None]:
# Detector settings, followed by model construction.

# Weights file handed to YOLO (yolov8n.pt is faster but less accurate).
MODEL_PATH = "yolov8m.pt"
# Detections scoring below this value are skipped by the inference cells.
CONFIDENCE_THRESHOLD = 0.5
# Use the GPU whenever PyTorch reports CUDA as available.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

# Build the detector, then move it to the chosen device. The two steps stay
# separate statements; the return value of ``to`` is not relied upon.
model = YOLO(MODEL_PATH)
model.to(DEVICE)

print(f"Model loaded on: {DEVICE}")
print(f"Number of classes: {len(model.names)}")


Model loaded on: cuda
Number of classes: 80


# Utility functions (centroid tracking)

In [None]:
# Module-level state shared by the centroid tracker and the inference cells.

# ID that the next newly created track receives; incremented by
# associate_detections_to_tracks each time it opens a new track.
next_track_id = 0

# track_id -> centroid recorded for that track on the latest processed frame.
object_positions = {}

# track_id -> list of centroids, appended to frame by frame.
track_history = defaultdict(list)


def compute_centroid(box):
    """
    Return the midpoint of a bounding box.

    Parameters
    ----------
    box : tuple or list (x1, y1, x2, y2)
        Two opposite corners of the box in pixel coordinates. Must unpack
        into exactly four values.

    Returns
    -------
    np.ndarray
        float32 array ``[cx, cy]`` with the average of the two x values and
        the average of the two y values.
    """
    x_start, y_start, x_end, y_end = box
    midpoint = ((x_start + x_end) / 2.0, (y_start + y_end) / 2.0)
    return np.array(midpoint, dtype=np.float32)


def associate_detections_to_tracks(detections, prev_positions, distance_threshold=50.0):
    """
    Associate current detections with existing tracks using Euclidean distance.

    Matching is greedy over *all* detection/track pairs ordered by distance:
    the closest pair is fixed first, then the next closest pair whose
    detection and track are both still free, and so on. This keeps the result
    independent of the order in which the detector reports its boxes, so an
    early detection can no longer claim a track that a later detection is
    much closer to.

    Detections left without a track open a new one. New IDs are taken from
    the module-level ``next_track_id`` counter, which this function advances.

    Parameters
    ----------
    detections : list
        List of tuples (box, confidence, class_name).
    prev_positions : dict
        Mapping track_id -> last centroid position.
    distance_threshold : float
        Maximum allowed distance to keep a track assignment. A pair is only
        eligible when its distance is strictly below this value.

    Returns
    -------
    dict
        Mapping track_id -> (box, confidence, class_name, centroid), inserted
        in the order of ``detections``.
    """
    global next_track_id

    centroids = [compute_centroid(box) for box, _, _ in detections]

    # Collect every eligible pair. The detection index and the track's
    # position in prev_positions act as deterministic tie-breakers and keep
    # the sort from ever comparing track IDs themselves.
    candidates = []
    for det_index, centroid in enumerate(centroids):
        for track_order, (track_id, prev_centroid) in enumerate(prev_positions.items()):
            distance = float(np.linalg.norm(centroid - prev_centroid))
            if distance < distance_threshold:
                candidates.append((distance, det_index, track_order, track_id))
    candidates.sort(key=lambda candidate: candidate[:3])

    # Fix pairs from closest to farthest; each detection and each track is
    # used at most once.
    matched = {}  # detection index -> track_id
    used_tracks = set()
    for _, det_index, _, track_id in candidates:
        if det_index in matched or track_id in used_tracks:
            continue
        matched[det_index] = track_id
        used_tracks.add(track_id)

    assignments = {}
    for det_index, ((box, confidence, class_name), centroid) in enumerate(
        zip(detections, centroids)
    ):
        if det_index in matched:
            track_id = matched[det_index]
        else:
            # Never hand out an ID that prev_positions already uses (possible
            # if the counter was reset while positions were kept); otherwise
            # two detections would silently share one dictionary key.
            while next_track_id in prev_positions:
                next_track_id += 1
            track_id = next_track_id
            next_track_id += 1

        assignments[track_id] = (box, confidence, class_name, centroid)

    return assignments


# Webcam detection and tracking

In [6]:
# Real-time object detection and tracking from webcam
WEB_CAM_INDEX = 0  # Default webcam device
MAX_TRAIL_POINTS = 64  # Longest trajectory tail kept (and drawn) per track

cap = cv2.VideoCapture(WEB_CAM_INDEX)
if not cap.isOpened():
    cap.release()
    raise RuntimeError("Failed to open webcam. Check camera index or permissions.")

# Requested capture format; the driver may ignore values it does not support.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
cap.set(cv2.CAP_PROP_FPS, 30)

# Start from a clean tracker state, including the ID counter, so every run of
# this cell numbers its tracks from 0.
track_history.clear()
object_positions.clear()
next_track_id = 0

print("Press ESC to exit the webcam window.")

# try/finally guarantees the camera and the window are released even when the
# loop is left through an exception or a kernel interrupt.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("No frame received from webcam. Exiting.")
            break

        # YOLO inference
        results = model(frame, device=DEVICE, verbose=False)

        detections = []
        for result in results:
            boxes = result.boxes
            # One device-to-host copy per attribute instead of three per box.
            corner_rows = boxes.xyxy.cpu().numpy()
            scores = boxes.conf.cpu().numpy()
            class_ids = boxes.cls.cpu().numpy()

            for (x1, y1, x2, y2), score, class_id in zip(corner_rows, scores, class_ids):
                confidence = float(score)
                if confidence < CONFIDENCE_THRESHOLD:
                    continue

                detections.append(
                    (
                        [int(x1), int(y1), int(x2), int(y2)],
                        confidence,
                        model.names[int(class_id)],
                    )
                )

        # Tracking
        assignments = associate_detections_to_tracks(detections, object_positions)
        object_positions = {}

        # Only tracks present in object_positions can be matched again, and
        # that dict is rebuilt from this frame's assignments. A track missing
        # from the assignments is therefore finished; drop its trail so a
        # long session does not accumulate dead histories.
        for stale_id in [tid for tid in track_history if tid not in assignments]:
            del track_history[stale_id]

        # Drawing
        for track_id, (box, confidence, class_name, centroid) in assignments.items():
            object_positions[track_id] = centroid

            trail = track_history[track_id]
            trail.append(centroid)
            # Keep only the newest points so memory and polyline cost stay bounded.
            del trail[:-MAX_TRAIL_POINTS]

            x1, y1, x2, y2 = box
            color = (0, 255, 0)

            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

            label = f"ID {track_id} | {class_name} {confidence:.2f}"
            cv2.putText(
                frame,
                label,
                (x1, max(0, y1 - 10)),  # keep the label inside the frame
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                color,
                2,
                lineType=cv2.LINE_AA,
            )

            # Plain Python ints for the pixel position of the centroid marker.
            center = (int(centroid[0]), int(centroid[1]))
            cv2.circle(frame, center, 4, (0, 0, 255), -1)

            if len(trail) > 1:
                points = np.array(trail, dtype=np.int32)
                cv2.polylines(frame, [points], isClosed=False, color=(0, 255, 255), thickness=2)

        cv2.imshow("Webcam - Object Detection and Tracking", frame)

        key = cv2.waitKey(1) & 0xFF
        if key == 27:  # ESC
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


Press ESC to exit the webcam window.


# Video file detection and tracking

In [7]:
# Object detection and tracking from a recorded video
from pathlib import Path

VIDEO_PATH = "D:\\ai\\CodeAlpha\\Task_3_Object_Detection\\Test\\طلب_فيديو_بطيء_للكشف_عن_الوجوه.mp4"
OUTPUT_PATH = "outputs/sample_01_tracked.mp4"

cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise RuntimeError(f"Failed to open video file: {VIDEO_PATH}")

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Some containers report 0 FPS; fall back to 30 so the writer gets a valid rate.
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0

# cv2.VideoWriter does not create missing directories; without this the
# writer fails to open and every write() is silently discarded.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(OUTPUT_PATH, fourcc, fps, (frame_width, frame_height))
if not writer.isOpened():
    # Fail loudly instead of reporting a saved file that was never written.
    cap.release()
    writer.release()
    raise RuntimeError(f"Failed to open video writer for: {OUTPUT_PATH}")

# Start from a clean tracker state, including the ID counter, so every run of
# this cell numbers its tracks from 0.
track_history.clear()
object_positions.clear()
next_track_id = 0

print("Processing video. Press ESC to stop early.")

# try/finally guarantees the reader, the writer and the window are released
# even when processing is aborted by an exception or a kernel interrupt, so
# the frames written so far are finalised in the output file.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("End of video stream.")
            break

        results = model(frame, device=DEVICE, verbose=False)

        detections = []
        for result in results:
            boxes = result.boxes
            # One device-to-host copy per attribute instead of three per box.
            corner_rows = boxes.xyxy.cpu().numpy()
            scores = boxes.conf.cpu().numpy()
            class_ids = boxes.cls.cpu().numpy()

            for (x1, y1, x2, y2), score, class_id in zip(corner_rows, scores, class_ids):
                confidence = float(score)
                if confidence < CONFIDENCE_THRESHOLD:
                    continue

                detections.append(
                    (
                        [int(x1), int(y1), int(x2), int(y2)],
                        confidence,
                        model.names[int(class_id)],
                    )
                )

        assignments = associate_detections_to_tracks(detections, object_positions)
        object_positions = {}

        for track_id, (box, confidence, class_name, centroid) in assignments.items():
            object_positions[track_id] = centroid
            track_history[track_id].append(centroid)

            x1, y1, x2, y2 = box
            color = (255, 0, 0)

            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

            label = f"ID {track_id} | {class_name} {confidence:.2f}"
            cv2.putText(
                frame,
                label,
                (x1, max(0, y1 - 10)),  # keep the label inside the frame
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                color,
                2,
                lineType=cv2.LINE_AA,
            )

            # Plain Python ints for the pixel position of the centroid marker.
            center = (int(centroid[0]), int(centroid[1]))
            cv2.circle(frame, center, 4, (0, 255, 255), -1)

        writer.write(frame)
        cv2.imshow("Video - Object Detection and Tracking", frame)

        key = cv2.waitKey(1) & 0xFF
        if key == 27:  # ESC
            print("Interrupted by user.")
            break
finally:
    cap.release()
    writer.release()
    cv2.destroyAllWindows()

print(f"Tracked video saved to: {OUTPUT_PATH}")


Processing video. Press ESC to stop early.
End of video stream.
Tracked video saved to: outputs/sample_01_tracked.mp4


# Real-Time Object Detection and Tracking with YOLO and OpenCV

This notebook implements a real-time object detection and multi-object tracking system
using a pretrained YOLO model and OpenCV. The application supports both live webcam
input and recorded video files.

The main objectives are:
- Detect multiple objects in each video frame using a pretrained YOLO model.
- Track detected objects across consecutive frames with a lightweight centroid-based tracker.
- Visualize bounding boxes, labels, confidence scores, and track IDs on the video stream.

---

## 1. Project Overview

The goal of this project is to build a practical vision pipeline that can be used
for tasks such as basic surveillance, traffic analysis, or activity monitoring in a
controlled environment. The focus is on clarity, real-time performance, and a clean
implementation that can be extended later if needed.

---

## 2. Environment Setup

This notebook assumes a Python environment with:
- PyTorch (with CUDA support if a GPU is available)
- OpenCV for video capture and visualization
- Ultralytics YOLO for object detection

A GPU is strongly recommended to maintain real-time performance, particularly when
processing high-resolution video streams.

---

## 3. Model Loading and Configuration

A pretrained YOLO model is used for object detection, loaded through the Ultralytics
Python API. This interface simplifies running inference on images and video frames.

Key configuration parameters:
- `MODEL_PATH`: path to the pretrained YOLO weights (e.g., `yolov8n.pt` or `yolov8m.pt`).
- `DEVICE`: computation device (`cuda` if available, otherwise `cpu`).
- `CONFIDENCE_THRESHOLD`: minimum confidence required to accept a detection.

---

## 4. Tracking Logic

To maintain object identities across frames, a simple centroid-based tracker is used:
- The centroid of each detected bounding box is computed.
- Current detections are matched to existing tracks based on Euclidean distance.
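- Each object keeps the same track ID for as long as it is detected in consecutive frames; a track that goes unmatched for even one frame is dropped, and the object receives a new ID when it is detected again.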

This approach is lightweight, requires no additional training, and is suitable for
scenes with moderate motion and limited occlusions.

---

## 5. Webcam Inference

The notebook includes a section that:
- Connects to the default webcam.
- Runs YOLO inference on each incoming frame.
- Applies the tracking logic.
- Displays the result in an OpenCV window with bounding boxes, labels, and IDs.

Press the `ESC` key in the OpenCV window to stop the webcam stream.

---

## 6. Video File Inference

The same detection and tracking pipeline can be applied to recorded video files:
- Frames are read sequentially from a video file.
- Detections and tracking are computed for each frame.
- The processed video is written to a new output file for later review.

This mode is useful for offline analysis, demonstrations, or creating sample results.

---

## 7. Notes and Possible Extensions

Notes:
- Tune `CONFIDENCE_THRESHOLD` to balance between detection recall and precision.
- Use lighter model variants or reduced frame resolution to increase throughput.

Possible extensions:
- Integrate a more advanced tracker such as SORT or Deep SORT.
- Restrict processing to specific regions of interest (ROIs) in the frame.
- Log and analyze trajectories, object counts, and dwell times for further analytics.
