<a href="https://colab.research.google.com/github/freida20git/bird-detection-tracking/blob/main/tuning_tracker_params.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Exploring different hyperparameters of deepSORT tracking algorithm**

In [None]:
!pip install ultralytics

In [None]:
!pip install deep_sort_realtime

In [None]:
!gdown 'https://drive.google.com/uc?id=1x-A9WOyZtZrOqlgM-EXg5g4ek6YgmCYO'

Downloading...
From: https://drive.google.com/uc?id=1x-A9WOyZtZrOqlgM-EXg5g4ek6YgmCYO
To: /content/bestbirdsonly.pt
100% 5.47M/5.47M [00:00<00:00, 27.0MB/s]


In [None]:
!gdown "https://drive.google.com/uc?id=1S9jEgd9m6O6O9srOapjyFG0tUU0IzfXM"

Downloading...
From: https://drive.google.com/uc?id=1S9jEgd9m6O6O9srOapjyFG0tUU0IzfXM
To: /content/birds-flying-in-blue-sky-preview.mp4
100% 5.23M/5.23M [00:00<00:00, 30.7MB/s]


In [None]:
from google.colab.patches import cv2_imshow
from deep_sort_realtime.deepsort_tracker import DeepSort
import cv2
import json  # Added for JSON support
from ultralytics import YOLO
import numpy as np
import json

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [None]:
# input video and detection model
video_path = '/content/birds-flying-in-blue-sky-preview.mp4'
model_path = '/content/bestbirdsonly.pt'

# DeepSORT parameter sets to test
experiments = [
    {"max_cosine_distance": 0.2, "max_iou_distance": 0.7, "max_overlap": 0.8},  # strict appearance
    {"max_cosine_distance": 0.3, "max_iou_distance": 0.5, "max_overlap": 0.7},  # relaxed appearance
    {"max_cosine_distance": 0.25, "max_iou_distance": 0.6, "max_overlap": 0.5}, # balanced
    {"max_cosine_distance": 0.15, "max_iou_distance": 0.5, "max_overlap": 0.9}, # super strict appearance
    {"max_cosine_distance": 0.35, "max_iou_distance": 0.4, "max_overlap": 0.6}, # feature noise tolerant
    {"max_cosine_distance": 0.2, "max_iou_distance": 0.8, "max_overlap": 0.9},  # relaxed IoU
    {"max_cosine_distance": 0.3, "max_iou_distance": 0.3, "max_overlap": 0.5},  # very tight matching
    {"max_cosine_distance": 0.25, "max_iou_distance": 0.75, "max_overlap": 0.8},# fast movement tolerant
    {"max_cosine_distance": 0.4, "max_iou_distance": 0.6, "max_overlap": 0.7},  # relaxed both
    {"max_cosine_distance": 0.2, "max_iou_distance": 0.6, "max_overlap": 0.6},  # general-purpose
]

model = YOLO(model_path)

# Loop through each experiment
for exp_num, params in enumerate(experiments):
    print(f"Running experiment {exp_num+1} with params: {params}")

    # Set up video capture
    cap = cv2.VideoCapture(video_path)

    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Set up video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(f'output_hyp{exp_num+1}.mp4', fourcc, fps, (frame_width, frame_height))

    # Initialize tracker with params
    tracker = DeepSort(
        max_cosine_distance=params["max_cosine_distance"],
        max_iou_distance=params["max_iou_distance"],
        nms_max_overlap=params["max_overlap"] )

    frame_number = 0
    all_annotations = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_number += 1
        results = model(frame, conf=0.3, classes=0)

        frame_annotations = {
            "frame_number": frame_number,
            "objects": []
        }

        detections = []
        for *xyxy, conf, cls in results[0].boxes.data:
            x1, y1, x2, y2 = map(int, xyxy)
            detections.append([[x1, y1, x2 - x1, y2 - y1], conf.item(), int(cls.item())])

        tracks = tracker.update_tracks(detections, frame=frame)

        for track in tracks:
            if not track.is_confirmed():
                continue
            track_id = track.track_id
            ltrb = track.to_ltrb()
            class_id = track.get_det_class()
            x1, y1, x2, y2 = map(int, ltrb)
            confidence = track.det_conf

            if confidence is not None:
                frame_annotations["objects"].append({
                    "track_id": track_id,
                    "class_id": class_id,
                    "class_name": model.names[class_id],
                    "confidence": confidence,
                    "bbox": {
                        "x1": x1,
                        "y1": y1,
                        "x2": x2,
                        "y2": y2
                    }
                })

                text = f"{track_id} - {model.names[class_id]}"
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, text, (x1, y1 - 10), cv2.FONT_HERSHEY_DUPLEX, 0.9, (0, 255, 0), 2)

        all_annotations.append(frame_annotations)


        out.write(frame)

    # After the video is processed:
    cap.release()
    out.release()
    cv2.destroyAllWindows()

    # Save annotations JSON
    with open(f'annotations_hyp{exp_num+1}.json', 'w') as f:
        json.dump(all_annotations, f, indent=2)

    print(f"Saved output_hyp{exp_num+1}.mp4 and annotations_hyp{exp_num+1}.json")


check metrics for all sets of parameters: (compered to ground truth annotations)

In [None]:
#yolo11x annotations:
!gdown 'https://drive.google.com/uc?id=1VEjnRrq5Sxl9tYPirGeVNpNRNlieUSOQ'

Downloading...
From: https://drive.google.com/uc?id=1VEjnRrq5Sxl9tYPirGeVNpNRNlieUSOQ
To: /content/annotations11x.json
  0% 0.00/356k [00:00<?, ?B/s]100% 356k/356k [00:00<00:00, 125MB/s]


In [None]:
!pip install motmetrics

In [None]:
import random
import torch
import ultralytics
ultralytics.checks()
from IPython.display import Image
from collections import defaultdict
import numpy as np
import motmetrics as mm
import pandas as pd
import os

# Function to convert JSON annotations to MOT format
def convert_json_to_mot(json_path, output_path):
    with open(json_path, "r") as f:
        data = json.load(f)

    with open(output_path, "w") as f_out:
        for frame in data:
            frame_number = frame["frame_number"]
            for obj in frame["objects"]:
                track_id = obj["track_id"]
                confidence = obj["confidence"]
                class_id = obj["class_id"]
                x1, y1 = obj["bbox"]["x1"], obj["bbox"]["y1"]
                x2, y2 = obj["bbox"]["x2"], obj["bbox"]["y2"]
                width, height = x2 - x1, y2 - y1
                f_out.write(f"{frame_number},{track_id},{x1},{y1},{width},{height},{confidence},{class_id},1\n")

# Function to compute tracking metrics
def compute_tracking_metrics(gt_file, pred_file):
    acc = mm.MOTAccumulator(auto_id=True)
    gt_data = pd.read_csv(gt_file, header=None)
    pred_data = pd.read_csv(pred_file, header=None)
    frames = sorted(set(gt_data[0]) | set(pred_data[0]))

    for frame in frames:
        gt_frame = gt_data[gt_data[0] == frame]
        pred_frame = pred_data[pred_data[0] == frame]
        gt_ids = gt_frame[1].tolist()
        pred_ids = pred_frame[1].tolist()

        def iou(boxA, boxB):
            xA, yA, wA, hA = boxA
            xB, yB, wB, hB = boxB
            x1, y1 = max(xA, xB), max(yA, yB)
            x2, y2 = min(xA + wA, xB + wB), min(yA + hA, yB + hB)
            interArea = max(0, x2 - x1) * max(0, y2 - y1)
            boxAArea = wA * hA
            boxBArea = wB * hB
            return 1 - (interArea / float(boxAArea + boxBArea - interArea))

        gt_boxes = gt_frame.iloc[:, 2:6].values.tolist()
        pred_boxes = pred_frame.iloc[:, 2:6].values.tolist()
        distance_matrix = [[iou(gt, pred) for pred in pred_boxes] for gt in gt_boxes]
        acc.update(gt_ids, pred_ids, distance_matrix)

    mh = mm.metrics.create()
    summary = mh.compute(acc, metrics=['idf1', 'mota', 'motp', 'num_switches'], name="Tracking")
    return summary

# LIST OF YOUR GENERATED JSONS FROM DIFFERENT HYPERPARAMETERS
annotation_files = [
    '/content/annotations_hyp1.json',
    '/content/annotations_hyp2.json',
    '/content/annotations_hyp3.json',
    '/content/annotations_hyp4.json',
    '/content/annotations_hyp5.json',
    '/content/annotations_hyp6.json',
    '/content/annotations_hyp7.json',
    '/content/annotations_hyp8.json',
    '/content/annotations_hyp9.json',
    '/content/annotations_hyp10.json'
]

# Your ground truth MOT file
convert_json_to_mot('/content/annotations11x.json', '/content/gt_mot.txt')
gt_file=  '/content/gt_mot.txt'
# Make a folder to save intermediate MOT prediction files
os.makedirs('/content/mot_preds', exist_ok=True)

# Loop through each annotation file
for annotation_path in annotation_files:
    model_name = os.path.basename(annotation_path).replace('.json', '')

    pred_mot_path = f'/content/mot_preds/{model_name}_mot.txt'
    convert_json_to_mot(annotation_path, pred_mot_path)

    print(f"\n===== Metrics for {model_name} =====")
    summary = compute_tracking_metrics(gt_file, pred_mot_path)
    print(summary)
    print()

Ultralytics 8.3.119 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
Setup complete ✅ (2 CPUs, 12.7 GB RAM, 41.2/112.6 GB disk)

===== Metrics for annotations_hyp0 =====
              idf1      mota      motp  num_switches
Tracking  0.830922  0.959337  0.138539            14


===== Metrics for annotations_hyp1 =====
              idf1      mota      motp  num_switches
Tracking  0.694615  0.923946  0.159816            31


===== Metrics for annotations_hyp2 =====
              idf1      mota      motp  num_switches
Tracking  0.974476  0.968373  0.134729             3


===== Metrics for annotations_hyp3 =====
              idf1      mota      motp  num_switches
Tracking  0.654335  0.925452  0.153961            28


===== Metrics for annotations_hyp4 =====
              idf1      mota      motp  num_switches
Tracking  0.470773  0.856175  0.214011            78


===== Metrics for annotations_hyp5 =====
              idf1      mota      motp  num_switches
Tracking  0.812024

set 3 gives best results (on this video and others).

Now will try to find the specific values for the parameters given this range of numbers.


In [None]:
from sklearn.model_selection import ParameterGrid

class LocationBasedDeepSORT_Tuner:
    def __init__(self, video_path, model_path, gt_annotations_path):
        """
        Initialize the location-focused DeepSORT tuner

        Args:
            video_path: Path to input video
            model_path: Path to YOLO detection model
            gt_annotations_path: Path to ground truth annotations JSON
        """
        self.video_path = video_path
        self.model_path = model_path
        self.gt_path = gt_annotations_path

        # Load models and video properties once
        self.model = YOLO(model_path)
        self.cap = cv2.VideoCapture(video_path)
        self.frame_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.frame_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
        self.cap.release()

        # Metrics to optimize for (focusing on location-based metrics)
        self.target_metrics = ['mota', 'motp', 'num_switches']
        self.metric_weights = {
            'mota': 0.6,       # Overall tracking accuracy
            'motp': 0.3,       # Precision of bounding box overlap
            'num_switches': -0.1  # Penalize ID switches (negative weight)
        }

        # Create output directories
        os.makedirs('output_videos', exist_ok=True)
        os.makedirs('output_annotations', exist_ok=True)
        os.makedirs('mot_files', exist_ok=True)

        # Convert ground truth to MOT format for the first 150 frames
        self.gt_mot_path = os.path.join('mot_files', 'gt_mot_150.txt')
        self._convert_json_to_mot(gt_annotations_path, self.gt_mot_path, max_frames=150)

    def _convert_json_to_mot(self, json_path, output_path, max_frames=None):
        """Helper function to convert JSON annotations to MOT format for a limited number of frames"""
        with open(json_path, "r") as f:
            data = json.load(f)

        with open(output_path, "w") as f_out:
            for frame in data:
                frame_number = frame["frame_number"]
                if max_frames is not None and frame_number > max_frames:
                    break
                for obj in frame["objects"]:
                    track_id = obj["track_id"]
                    confidence = obj["confidence"]
                    class_id = obj["class_id"]
                    x1, y1 = obj["bbox"]["x1"], obj["bbox"]["y1"]
                    x2, y2 = obj["bbox"]["x2"], obj["bbox"]["y2"]
                    width, height = x2 - x1, y2 - y1
                    f_out.write(f"{frame_number},{track_id},{x1},{y1},{width},{height},{confidence},{class_id},1\n")

    def _compute_tracking_metrics(self, pred_file):
        """Compute tracking metrics between ground truth (first 150 frames) and predictions"""
        acc = mm.MOTAccumulator(auto_id=True)
        gt_data = pd.read_csv(self.gt_mot_path, header=None)
        pred_data = pd.read_csv(pred_file, header=None)
        gt_frames = sorted(gt_data[0].unique())
        pred_frames = sorted(pred_data[0].unique())
        all_frames = sorted(list(set(gt_frames) | set(pred_frames)))

        for frame in all_frames:
            if frame > 150:  # Limit evaluation to the first 150 frames
                continue
            gt_frame = gt_data[gt_data[0] == frame]
            pred_frame = pred_data[pred_data[0] == frame]
            gt_ids = gt_frame[1].tolist()
            pred_ids = pred_frame[1].tolist()

            def iou(boxA, boxB):
                xA, yA, wA, hA = boxA
                xB, yB, wB, hB = boxB
                x1, y1 = max(xA, xB), max(yA, yB)
                x2, y2 = min(xA + wA, xB + wB), min(yA + hA, yB + hB)
                interArea = max(0, x2 - x1) * max(0, y2 - y1)
                boxAArea = wA * hA
                boxBArea = wB * hB
                return 1 - (interArea / float(boxAArea + boxBArea - interArea))

            gt_boxes = gt_frame.iloc[:, 2:6].values.tolist()
            pred_boxes = pred_frame.iloc[:, 2:6].values.tolist()
            distance_matrix = [[iou(gt, pred) for pred in pred_boxes] for gt in gt_boxes]
            acc.update(gt_ids, pred_ids, distance_matrix)

        mh = mm.metrics.create()
        summary = mh.compute(acc, metrics=['idf1', 'mota', 'motp', 'num_switches', 'mostly_tracked', 'mostly_lost', 'num_fragmentations'], name="Tracking")
        return summary

    def _run_experiment(self, params, experiment_id):
        """
        Run tracking with location-focused parameters for the first 150 frames

        Args:
            params: Dictionary of DeepSORT parameters
            experiment_id: Unique ID for this experiment

        Returns:
            Dictionary containing metrics and output paths
        """
        # Initialize paths
        output_video_path = os.path.join('output_videos', f'output_loc{experiment_id}.mp4')
        output_annot_path = os.path.join('output_annotations', f'annotations_loc{experiment_id}.json')
        output_mot_path = os.path.join('mot_files', f'pred_loc{experiment_id}.txt')

        # Initialize tracker with current parameters
        tracker = DeepSort(
            nn_budget=0,  # Disable appearance features
            max_iou_distance=params["max_iou_distance"],
            max_cosine_distance=params["max_cosine_distance"], # Effectively disable appearance matching
            nms_max_overlap=params["nms_max_overlap"]
        )

        cap = cv2.VideoCapture(self.video_path)
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))

        # Set up video writer
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

        frame_number = 0
        all_annotations = []

        while True:
            ret, frame = cap.read()
            if not ret or frame_number >= 150:
                break

            results = self.model(frame, conf=0.3, classes=0)

            if results[0].boxes.data.shape[0] == 0:  # No detections in this frame
                frame_number += 1
                continue

            frame_annotations = {
                "frame_number": frame_number + 1,
                "objects": []
            }

            detections = []
            for *xyxy, conf, cls in results[0].boxes.data:
                x1, y1, x2, y2 = map(int, xyxy)
                detections.append([[x1, y1, x2 - x1, y2 - y1], conf.item(), int(cls.item())])

            tracks = tracker.update_tracks(detections, frame=frame)

            for track in tracks:
                if not track.is_confirmed():
                    continue
                track_id = track.track_id
                ltrb = track.to_ltrb()
                class_id = track.get_det_class()
                x1, y1, x2, y2 = map(int, ltrb)
                confidence = track.det_conf

                if confidence is not None:
                    frame_annotations["objects"].append({
                        "track_id": track_id,
                        "class_id": class_id,
                        "class_name": self.model.names[class_id],
                        "confidence": confidence,
                        "bbox": {
                            "x1": x1,
                            "y1": y1,
                            "x2": x2,
                            "y2": y2
                        }
                    })

                    text = f"{track_id} - {self.model.names[class_id]}"
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(frame, text, (x1, y1 - 10), cv2.FONT_HERSHEY_DUPLEX, 0.9, (0, 255, 0), 2)

            all_annotations.append(frame_annotations)
            out.write(frame)
            frame_number += 1

        cap.release()
        out.release()

        # Save annotations for the first 150 frames
        with open(output_annot_path, 'w') as f:
            json.dump(all_annotations, f, indent=2)

        # Convert predictions to MOT format for the first 150 frames
        self._convert_json_to_mot(output_annot_path, output_mot_path, max_frames=150)

        # Compute metrics
        metrics = self._compute_tracking_metrics(output_mot_path)

        return {
            "params": params,
            "metrics": metrics,
            "video_path": output_video_path,
            "annotations_path": output_annot_path,
            "mot_path": output_mot_path
        }

    def _calculate_score(self, metrics_df):
        """Calculate a weighted score based on location-focused metrics"""
        score = 0
        for metric, weight in self.metric_weights.items():
            if metric in ['mota', 'motp']:
                # Higher is better
                score += metrics_df[metric].iloc[0] * weight
            elif metric in ['num_switches']:
                # Lower is better, so we subtract
                max_val = max(metrics_df[metric].max(), 1)  # Avoid division by zero
                normalized = metrics_df[metric].iloc[0] / max_val
                score += (1 - normalized) * abs(weight)  # Use absolute value since weight is negative
        return score

    def optimize_location_params(self, num_iterations=10):
        """
        Optimize location-focused parameters using a more controlled search with a step of 0.05

        Args:
            num_iterations: Number of optimization iterations (will be less if all combinations are tested)

        Returns:
            Best parameters found and full history of evaluations
        """
        best_score = -np.inf
        best_params = None
        history = []
        tested_params = set()

        param_ranges = {
            "max_iou_distance": np.arange(0.6, 0.7, 0.05),
            "nms_max_overlap": np.arange(0.5, 0.7, 0.05),
            "max_cosine_distance": np.arange(0.15, 0.3, 0.05)
        }

        # Generate all combinations of parameters
        param_combinations = list(ParameterGrid(param_ranges))
        total_combinations = len(param_combinations)
        print(f"Total parameter combinations to test: {total_combinations}")

        for i, params in enumerate(param_combinations):
            params_tuple = tuple(sorted(params.items())) # For checking if params were already tested
            if params_tuple in tested_params:
                continue
            tested_params.add(params_tuple)

            experiment_result = self._run_experiment(params, i)
            score = self._calculate_score(experiment_result["metrics"])

            history.append({
                "params": params,
                "score": score,
                "mota": experiment_result["metrics"]['mota'].iloc[0],
                "motp": experiment_result["metrics"]['motp'].iloc[0],
                "num_switches": experiment_result["metrics"]['num_switches'].iloc[0]
            })

            if score > best_score:
                best_score = score
                best_params = params.copy()
                print(f"New best score: {best_score:.4f} with params: {best_params}")

        history_df = pd.DataFrame(history).sort_values('score', ascending=False)
        return best_params, history_df

# Example usage
if __name__ == "__main__":
    # Initialize tuner with your paths
    tuner = LocationBasedDeepSORT_Tuner(
        video_path='/content/birds-flying-in-blue-sky-preview.mp4',
        model_path='/content/bestbirdsonly.pt',
        gt_annotations_path='/content/annotations11x.json'
    )

    # Run optimization
    best_params, history = tuner.optimize_location_params(num_iterations=None) # Will test all combinations


    # Save full history
    history.to_csv('location_based_optimization_history_150frames.csv', index=False)

In [None]:
print("\nBest parameters found for location-based tracking:")
print(best_params)

print("\nTop 5 configurations:")
print(history.to_string())


Best parameters found for location-based tracking:
{'max_cosine_distance': np.float64(0.2), 'max_iou_distance': np.float64(0.65), 'nms_max_overlap': np.float64(0.5)}

Top 5 configurations:
                                                                                            params     score      mota      motp  num_switches
21                {'max_cosine_distance': 0.25, 'max_iou_distance': 0.65, 'nms_max_overlap': 0.55}  0.625984  0.975452  0.135709             1
12                  {'max_cosine_distance': 0.2, 'max_iou_distance': 0.65, 'nms_max_overlap': 0.5}  0.625984  0.975452  0.135709             1
20                 {'max_cosine_distance': 0.25, 'max_iou_distance': 0.65, 'nms_max_overlap': 0.5}  0.625984  0.975452  0.135709             1
22  {'max_cosine_distance': 0.25, 'max_iou_distance': 0.65, 'nms_max_overlap': 0.6000000000000001}  0.625984  0.975452  0.135709             1
14   {'max_cosine_distance': 0.2, 'max_iou_distance': 0.65, 'nms_max_overlap': 0.60000000000000

**Best parameters we got:**

max_cosine_distance: 0.25

max_iou_distance': 0.65

nms_max_overlap': 0.5