In [None]:
import cv2
import json
import os
import numpy as np
import subprocess

In [None]:
# Paths to the input video, its detection/annotation JSON, and the annotated output
video_path = '/otif-dataset/dataset/amsterdam/train/video/0.mp4'
annotation_path = '/otif-dataset/dataset/amsterdam/train/yolov3-1280x736/0.json'
output_video_path = './0_annotated.mp4'

# Load annotation file
with open(annotation_path, 'r') as f:
    annotations = json.load(f)

# Print the annotation structure to understand the format.
# Each container type gets its own branch so the summary line is accurate:
# a single print with an inline conditional would apply the condition to only
# one argument and still append the list-style "N items" suffix for dicts.
if isinstance(annotations, dict):
    print("Annotation keys:", annotations.keys())
    print("Sample keys:", list(annotations.keys())[:5])
    if 'annotations' in annotations:
        print("Sample annotation:", annotations['annotations'][0] if annotations['annotations'] else "Empty")
elif isinstance(annotations, list):
    print("Annotation keys:", "List with", len(annotations), "items")
    print("Sample item:", annotations[0] if annotations else "Empty")
else:
    # JSON scalars (number, string, null) have no structure to summarise
    print("Unexpected top-level annotation type:", type(annotations).__name__)

In [None]:
# Open the video once, only to read its properties
cap = cv2.VideoCapture(video_path)
assert cap.isOpened(), f"Could not open video {video_path}"

# Get video properties
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

# Release this probe handle right away: the frame-processing cell opens its
# own capture, so keeping this one around would only hold the file open twice.
cap.release()

print(f"Video properties: {width}x{height}, {fps} FPS, {frame_count} frames")

# Create the writer with the same frame rate and frame size as the input
fourcc = cv2.VideoWriter.fourcc('m', 'p', '4', 'v')
writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
assert writer.isOpened(), f"Could not create video writer for {output_video_path}"


In [None]:
# Parse annotations into a dictionary keyed by frame index.
# Several layouts are accepted because the on-disk format is not known up front.


def _as_frame_index(key):
    """Return ``key`` as an int when it parses as one, otherwise unchanged.

    JSON object keys are always strings, while frames are looked up by their
    integer position when drawing, so a key such as "12" has to become 12 to
    ever match.
    """
    try:
        return int(key)
    except (TypeError, ValueError):
        return key


frame_annotations = {}

if isinstance(annotations, dict):
    if 'annotations' in annotations and 'images' in annotations:
        # COCO-style layout: map image_id -> frame index, then group boxes by frame
        image_id_to_frame = {}
        for img in annotations['images']:
            if 'frame_index' in img:
                image_id_to_frame[img['id']] = img['frame_index']
            elif 'file_name' in img:
                # Try to read the frame number from a name like "<prefix>_<n>.<ext>"
                filename = img['file_name']
                try:
                    frame_idx = int(filename.split('_')[-1].split('.')[0])
                except (AttributeError, ValueError):
                    # Not a string, or no trailing number: fall back to the image id
                    frame_idx = img['id']
                image_id_to_frame[img['id']] = frame_idx
            else:
                image_id_to_frame[img['id']] = img['id']

        # Group annotations by frame
        for ann in annotations['annotations']:
            image_id = ann['image_id']
            frame_idx = image_id_to_frame.get(image_id, image_id)
            # bbox is read as [x, y, width, height] and stored as [x1, y1, x2, y2]
            x, y, w, h = ann['bbox']
            frame_annotations.setdefault(frame_idx, []).append([x, y, x + w, y + h])
    elif 'frames' in annotations:
        # Custom layout with a 'frames' list. When an entry carries no explicit
        # index, use its position in the list (as the plain-list branch below
        # does) so index-less entries do not all collapse onto frame 0.
        for position, frame_data in enumerate(annotations['frames']):
            frame_idx = frame_data.get('frame_idx', frame_data.get('frame_index', position))
            frame_annotations[frame_idx] = frame_data.get('boxes', frame_data.get('detections', []))
    else:
        # Dictionary with frame indices as keys: convert the string keys to
        # ints and treat a null entry as "no boxes for this frame".
        frame_annotations = {
            _as_frame_index(key): ([] if value is None else value)
            for key, value in annotations.items()
        }
elif isinstance(annotations, list):
    # List layout: one item per frame, in frame order
    for idx, frame_ann in enumerate(annotations):
        if isinstance(frame_ann, dict):
            frame_idx = frame_ann.get('frame_idx', frame_ann.get('frame_index', idx))
            frame_annotations[frame_idx] = frame_ann.get('boxes', frame_ann.get('detections', frame_ann.get('annotations', [])))
        elif isinstance(frame_ann, list):
            # Direct list of boxes
            frame_annotations[idx] = frame_ann

print(f"Loaded annotations for {len(frame_annotations)} frames")
try:
    first_idx, last_idx = min(frame_annotations), max(frame_annotations)
except (TypeError, ValueError):
    # Empty mapping, or keys of mixed types that cannot be ordered
    first_idx = last_idx = 'N/A'
print(f"Frame indices range: {first_idx} to {last_idx}")
if frame_annotations:
    sample_frame = next(iter(frame_annotations))
    print(f"Sample frame {sample_frame} has {len(frame_annotations[sample_frame])} annotations")
    if frame_annotations[sample_frame]:
        print(f"Sample annotation format: {frame_annotations[sample_frame][0]}")


In [None]:
# Draw the per-frame bounding boxes onto the video and write the result.


def _box_to_xyxy(box):
    """Convert one annotation entry to ``(x1, y1, x2, y2)`` corner coordinates.

    Accepted inputs:
      * dict with 'left'/'top'/'right'/'bottom' keys
      * dict with 'x1'/'y1'/'x2'/'y2' keys
      * dict with 'x'/'y'/'width'/'height' keys
      * 4-item list/tuple, read as corners unless the third or fourth value is
        smaller than the first or second, in which case it is read as
        [x, y, w, h]. This is a heuristic: a [x, y, w, h] box whose width and
        height exceed its x and y cannot be told apart from a corner box.
      * longer list/tuple, where the last four items are taken as corners

    Returns None when the entry matches none of these shapes.
    """
    if isinstance(box, dict):
        for keys in (('left', 'top', 'right', 'bottom'), ('x1', 'y1', 'x2', 'y2')):
            if all(key in box for key in keys):
                return tuple(box[key] for key in keys)
        if all(key in box for key in ('x', 'y', 'width', 'height')):
            return box['x'], box['y'], box['x'] + box['width'], box['y'] + box['height']
        return None
    if isinstance(box, (list, tuple)) and len(box) >= 4:
        if len(box) == 4:
            x1, y1, third, fourth = box
            if third < x1 or fourth < y1:
                # Likely [x, y, w, h]
                return x1, y1, x1 + third, y1 + fourth
            return x1, y1, third, fourth
        # e.g. [track_id, x1, y1, x2, y2]
        return tuple(box[-4:])
    return None


# Reopen video for processing (since we already read properties)
cap = cv2.VideoCapture(video_path)

frame_idx = 0
try:
    # Fail loudly instead of silently writing an empty output file
    assert cap.isOpened(), f"Could not open video {video_path}"

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # A missing or null entry for this frame means there is nothing to draw
        for box in frame_annotations.get(frame_idx) or []:
            corners = _box_to_xyxy(box)
            if corners is None:
                continue

            # cv2.rectangle needs integer pixel coordinates
            x1, y1, x2, y2 = (int(value) for value in corners)

            # Draw bounding box in green
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Write frame to output video
        writer.write(frame)

        frame_idx += 1

        if frame_idx % 100 == 0:
            print(f"Processed {frame_idx}/{frame_count} frames")
finally:
    # Always release both handles so the output file is finalised and the
    # input is closed even when a frame raises part-way through.
    cap.release()
    writer.release()

print(f"Video saved to {output_video_path}")


In [None]:
# Convert video to H.264 codec using ffmpeg
# Build the output name from the path stem: str.replace('.mp4', ...) would
# rewrite every ".mp4" in the path (directory names included) and would leave
# the output identical to the input when the extension is something else.
h264_output_path = os.path.splitext(output_video_path)[0] + '.h264.mp4'

# Run FFMPEG command to convert to H.264
cmd = [
    'ffmpeg', '-y',  # -y to overwrite output file
    '-loglevel', 'error',  # print errors only, so a failure can be diagnosed
    '-i', output_video_path,  # input file
    '-c:v', 'libx264',  # H.264 codec
    '-preset', 'fast',  # encoding preset
    '-crf', '28',  # constant rate factor (lower quality, smaller file)
    '-profile:v', 'baseline',  # baseline profile for better compatibility
    # NOTE(review): H.264 level 3.0 caps the frame size at roughly 720x576;
    # confirm the source resolution fits, otherwise drop this flag so the
    # encoder picks a matching level.
    '-level', '3.0',  # H.264 level for broader device support
    '-movflags', '+faststart',  # optimize for streaming/web playback
    '-pix_fmt', 'yuv420p',  # pixel format for maximum compatibility
    '-tune', 'fastdecode',  # optimize for faster decoding
    h264_output_path
]

print(f"Converting {output_video_path} to H.264 format...")
try:
    subprocess.run(cmd, capture_output=True, text=True, check=True)
except subprocess.CalledProcessError as err:
    # stderr is captured, so print it before re-raising; otherwise the only
    # visible information would be the exit code.
    print(f"ffmpeg failed with exit code {err.returncode}:\n{err.stderr}")
    raise
print(f"H.264 video saved to {h264_output_path}")
