<a href="https://colab.research.google.com/github/emanueltay/Optimal_Subtitle_Placement/blob/main/caption_placement_preposition_testing_27_mar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [211]:
# !unzip /content/optimal_subtitle_copied.zip -d /content/

In [212]:
# !pip install ultralytics

### Detect Key Objects in the Frame

In [213]:
# ! pip install ultralytics

from ultralytics import YOLO
import cv2
import torch

# Load the trained YOLO weights from the copied project folder.
model = YOLO("/content/optimal_subtitle_copied/best_100.pt")
# model = YOLO("/content/optimal_subtitle_copied/best.pt")

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.

device = "cuda" if torch.cuda.is_available() else "cpu"

# Move the model to the selected device and keep its weights in FP32.
model.to(device).float()  # Use FP32 (FP16 can cause issues on CPU)

# Process-wide PyTorch settings (they affect everything run in this session).
torch.backends.cudnn.benchmark = True  # Optimize for fixed-size inputs
torch.backends.cudnn.enabled = True
torch.set_num_threads(torch.get_num_threads())  # Re-applies the current thread count, so the value itself is unchanged


def detect_objects(frames):
    """
    Perform batch object detection on multiple frames.

    Parameters:
        frames (list): List of frames (each a NumPy array).

    Returns:
        list: One NumPy array per input frame, taken from the
              ``boxes.data`` tensor of the matching model result.
              NOTE(review): per the Ultralytics docs each row is expected to
              be (x1, y1, x2, y2, confidence, class) — confirm for this model.
              An empty ``frames`` input returns an empty list.
    """
    # Nothing to detect: return early instead of asking the model to run
    # with a batch size of 0.
    if len(frames) == 0:
        return []

    # Run all frames through the model as a single batch.
    results = model(frames, batch=len(frames), verbose=False)

    # Move every result's box tensor to the CPU and convert it to NumPy.
    return [result.boxes.data.cpu().numpy() for result in results]

### Define Safe Zones for Subtitle Placement

In [214]:
import json

# Global in-memory state shared by the safe-zone helpers.
# safe_zone_cache: (frame_width, frame_height, detections) -> (name, zone) for
#                  results produced by the dynamic / top fallbacks.
# used_safe_zones: zone name -> {"coordinates": [x1, y1, x2, y2]} for every
#                  zone handed out so far (latest coordinates win per name).
safe_zone_cache = {}
used_safe_zones = {}  # Dictionary to store all assigned safe zones

def calculate_safe_zone_with_prepositions_test_new(frame_width, frame_height, detections, pre_positions, subtitle_height, margin, shift_x=20):
    """
    Calculate the safe zone for subtitle placement using pre-defined positions.

    Search order:
      1. Each predefined position, highest "priority" first. If a position is
         blocked, up to 10 left shifts and then up to 10 right shifts are
         tried (the shift grows by 1.5x per attempt) before moving on to the
         next position.
      2. A previously cached fallback result for the same frame size and
         detections.
      3. A full-width zone at the bottom of the frame, nudged upwards in
         small steps (at most 8 shifts).
      4. A full-width zone at the top of the frame (not checked for overlap).

    Every zone that is returned from steps 1, 3 and 4 is also recorded in the
    module-level ``used_safe_zones``; steps 3 and 4 additionally populate
    ``safe_zone_cache``.

    Parameters:
        frame_width (int): Width of the frame in pixels.
        frame_height (int): Height of the frame in pixels.
        detections (iterable): Detections whose first four values are
            (x1, y1, x2, y2). Each detection must be convertible to a tuple
            of hashable values because it becomes part of the cache key.
        pre_positions (dict): name -> {"coordinates": [x1, y1, x2, y2],
            "priority": number}. A missing priority counts as 0.
        subtitle_height (int): Height of the fallback subtitle zone.
        margin (int): Distance kept from the frame edge for fallback zones.
        shift_x (int): Initial horizontal shift in pixels.

    Returns:
        tuple: (position_name, coordinates). Shifted coordinates may be
        floats because the shift is multiplied by 1.5 on every attempt.
    """

    def zones_overlap(zone1, zone2):
        """Checks if two zones overlap (touching edges count as overlap)."""
        x1a, y1a, x2a, y2a = zone1
        x1b, y1b, x2b, y2b = zone2
        return not (x2a < x1b or x1a > x2b or y2a < y1b or y1a > y2b)

    def is_clear(zone):
        """True when the zone overlaps none of the detections."""
        return not any(zones_overlap(zone, detection[:4]) for detection in detections)

    def remember(name, zone):
        """Record the zone in used_safe_zones and build the return value."""
        used_safe_zones[name] = {"coordinates": list(zone)}
        return (name, zone)

    # Minimum width enforced on shifted zones; it does not depend on the
    # position being tried, so compute it once.
    min_width = max(0.6 * frame_width, 600)

    # Step 1: predefined positions, highest priority first.
    for position_name, position in sorted(pre_positions.items(), key=lambda x: x[1].get("priority", 0), reverse=True):
        x1, y1, x2, y2 = position["coordinates"]

        if is_clear((x1, y1, x2, y2)):
            return remember(position_name, (x1, y1, x2, y2))

        # Step 2: slide the blocked position left, then right.
        for shift_dir in ("left", "right"):
            shift_value = shift_x
            for _ in range(10):
                if shift_dir == "left":
                    new_x1 = max(0, x1 - shift_value)
                    new_x2 = max(min_width, x2 - shift_value)
                else:
                    new_x1 = min(frame_width - min_width, x1 + shift_value)
                    new_x2 = min(frame_width, x2 + shift_value)

                shifted_zone = (new_x1, y1, new_x2, y2)
                if new_x2 > new_x1 and is_clear(shifted_zone):
                    return remember(f"shifted_{position_name}", shifted_zone)

                shift_value *= 1.5

    # Step 3: reuse a cached fallback for identical input, if any.
    cache_key = (frame_width, frame_height, tuple(tuple(d) for d in detections))
    if cache_key in safe_zone_cache:
        return safe_zone_cache[cache_key]

    # Step 4: dynamic zone, starting at the bottom and moving up gently.
    fallback_position_name = "dynamic_position"
    proposed = (0, frame_height - subtitle_height - margin, frame_width, frame_height - margin)

    shift_step = max(4, int(0.05 * subtitle_height))  # Small step, e.g., 4 px
    max_shifts = 8

    # max_shifts + 1 candidates are evaluated: the starting zone plus the
    # zone produced by each of the max_shifts upward shifts. (Looping only
    # max_shifts times would compute the last shift but never test it.)
    for _ in range(max_shifts + 1):
        if is_clear(proposed):
            safe_zone_cache[cache_key] = (fallback_position_name, proposed)
            return remember(fallback_position_name, proposed)

        x1, y1, x2, y2 = proposed
        if y1 - shift_step < 0:
            break  # Stop if it moves above the screen
        proposed = (x1, y1 - shift_step, x2, y2 - shift_step)

    # Step 5: final fallback to the top of the frame (no overlap check).
    fallback_position_name = "fallback_top"
    final_safe_zone = (0, margin, frame_width, subtitle_height + margin)
    safe_zone_cache[cache_key] = (fallback_position_name, final_safe_zone)
    return remember(fallback_position_name, final_safe_zone)

def get_used_safe_zones():
    """
    Build a fresh dictionary of every safe zone recorded so far.

    Each entry keeps only the "coordinates" value of the recorded zone, so
    the result can be passed on for updating the TTML layout.

    Returns:
        dict: zone name -> {"coordinates": [...]}.
    """
    zones = {}
    for zone_name, zone_info in used_safe_zones.items():
        zones[zone_name] = {"coordinates": zone_info["coordinates"]}
    return zones

In [215]:
## testing

import numpy as np

# Global in-memory cache (re-initialised whenever this cell runs).
# safe_zone_cache: (frame_width, frame_height, detections) -> (name, zone)
# used_safe_zones: zone name -> {"coordinates": [x1, y1, x2, y2]}
safe_zone_cache = {}
used_safe_zones = {}

def calculate_safe_zone_with_prepositions_numpy(frame_width, frame_height, detections, pre_positions, subtitle_height, margin, shift_x=20):
    """
    Optimized subtitle safe zone selector using NumPy for faster batch overlap checks.

    Search order: cached result for the same frame size and detections,
    predefined positions by descending "priority" (each with up to 10 left
    and 10 right shifts), a bottom zone that moves up by half a subtitle
    height per step, and finally a zone at the top of the frame.

    Note that the cache key does not include ``pre_positions``,
    ``subtitle_height`` or ``margin``; a cached answer is returned even if
    those differ between calls.

    Parameters:
        frame_width (int): Width of the frame in pixels.
        frame_height (int): Height of the frame in pixels.
        detections (iterable): Detections whose first four values are
            (x1, y1, x2, y2); they are truncated to integers. May be empty.
        pre_positions (dict): name -> {"coordinates": [x1, y1, x2, y2],
            "priority": number}. A missing priority counts as 0.
        subtitle_height (int): Height of the fallback subtitle zone.
        margin (int): Distance kept from the frame edge for fallback zones.
        shift_x (int): Initial horizontal shift in pixels.

    Returns:
        tuple: (position_name, coordinates)
    """

    def zones_overlap_np(zone, detection_array):
        """Vectorized overlap check: True if the zone overlaps any detection row."""
        x1a, y1a, x2a, y2a = zone
        separated = (
            (detection_array[:, 2] < x1a) |  # detection entirely left of the zone
            (detection_array[:, 0] > x2a) |  # detection entirely right of the zone
            (detection_array[:, 3] < y1a) |  # detection entirely above the zone
            (detection_array[:, 1] > y2a)    # detection entirely below the zone
        )
        return np.any(~separated)

    # reshape(-1, 4) keeps the array 2-D even when there are no detections;
    # without it an empty input becomes 1-D and the column indexing above fails.
    detections_np = np.array([d[:4] for d in detections], dtype=int).reshape(-1, 4)

    # Step 1: Check cache
    cache_key = (frame_width, frame_height, tuple(map(tuple, detections_np.tolist())))
    if cache_key in safe_zone_cache:
        return safe_zone_cache[cache_key]

    def remember(name, zone):
        """Store the result in both module-level dictionaries and return it."""
        safe_zone_cache[cache_key] = (name, zone)
        used_safe_zones[name] = {"coordinates": list(zone)}
        return (name, zone)

    # Minimum width enforced on shifted zones (independent of the position).
    min_width = max(0.6 * frame_width, 600)

    # Step 2: Try predefined positions
    for position_name, position in sorted(pre_positions.items(), key=lambda x: x[1].get("priority", 0), reverse=True):
        x1, y1, x2, y2 = position["coordinates"]

        if not zones_overlap_np((x1, y1, x2, y2), detections_np):
            return remember(position_name, (x1, y1, x2, y2))

        # Try shifting left, then right, with a growing shift.
        for shift_dir in ("left", "right"):
            shift = shift_x
            for _ in range(10):
                if shift_dir == "left":
                    new_x1 = max(0, x1 - shift)
                    new_x2 = max(min_width, x2 - shift)
                else:
                    new_x1 = min(frame_width - min_width, x1 + shift)
                    new_x2 = min(frame_width, x2 + shift)

                shifted_zone = (new_x1, y1, new_x2, y2)
                if new_x2 > new_x1 and not zones_overlap_np(shifted_zone, detections_np):
                    return remember(f"shifted_{position_name}", shifted_zone)

                shift *= 1.5

    # Step 3: Try dynamic fallback bottom zone
    proposed_safe_zone = (0, frame_height - subtitle_height - margin, frame_width, frame_height - margin)

    # Move up by half a subtitle height per attempt. The step is kept at
    # least 1 so the loop always makes progress and terminates.
    step = max(1, int(0.5 * subtitle_height))

    while True:
        if not zones_overlap_np(proposed_safe_zone, detections_np):
            return remember("fallback_bottom", proposed_safe_zone)

        x1, y1, x2, y2 = proposed_safe_zone
        if y1 - step < 0:
            break
        proposed_safe_zone = (x1, y1 - step, x2, y2 - step)

    # Step 4: Final fallback top (not checked for overlap)
    return remember("fallback_top", (0, margin, frame_width, subtitle_height + margin))

### Subtitle size and margin

In [216]:
def get_subtitle_size(frame_height):
    """
    Derive the subtitle band height and edge margin from the frame height.

    The height is 12% of the frame height but never below 30 px; the margin
    is 2% of the frame height but never below 5 px. Both values are
    truncated to integers.

    Parameters:
        frame_height (int): Height of the video frame.

    Returns:
        tuple: (subtitle_height, margin)
    """
    scaled_height = 0.12 * frame_height
    scaled_margin = 0.02 * frame_height

    # Apply the lower bounds before truncating to whole pixels.
    subtitle_height = int(scaled_height) if scaled_height >= 30 else 30
    margin = int(scaled_margin) if scaled_margin >= 5 else 5

    return subtitle_height, margin

### Subtitle character calculation

In [217]:
# !pip install arabic-reshaper

In [218]:
# !pip install python-bidi

In [219]:
import cv2
import re
import textwrap
import numpy as np
import arabic_reshaper
from bidi.algorithm import get_display
from PIL import ImageFont, ImageDraw, Image

## universal font that fits all languages
def get_font(font_size):
    """
    Load the GoNotoKurrent-Regular TrueType font at the requested size.

    Parameters:
        font_size (int): The desired font size.

    Returns:
        PIL.ImageFont: The loaded font.
    """
    return ImageFont.truetype(
        "/content/optimal_subtitle_copied/Font/GoNotoKurrent-Regular.ttf",
        font_size,
    )

# **2️⃣ Function to Detect Language Type**
def detect_language(text):
    """
    Classify the script of a text by looking at its first two words only.

    The script ranges are tested in a fixed order (Arabic, CJK ideographs,
    Japanese kana, Korean Hangul, Devanagari, Thai); the first range that
    contains any character of the sample decides the result.

    Returns:
        str: "arabic", "cjk", "indic", "thai", or "latin" when no range matches.
    """
    # (first code point, last code point, label), in priority order.
    script_ranges = (
        ("\u0600", "\u06FF", "arabic"),  # Arabic
        ("\u4E00", "\u9FFF", "cjk"),     # Chinese ideographs
        ("\u3040", "\u30FF", "cjk"),     # Japanese kana
        ("\uAC00", "\uD7AF", "cjk"),     # Korean Hangul
        ("\u0900", "\u097F", "indic"),   # Devanagari
        ("\u0E00", "\u0E7F", "thai"),    # Thai
    )

    # Only the first two words are inspected.
    leading_words = re.findall(r'\b\w+\b', text)[:2]
    sample = " ".join(leading_words)

    for low, high, label in script_ranges:
        for ch in sample:
            if low <= ch <= high:
                return label

    return "latin"

# **3️⃣ Main Subtitle Rendering Function**
def render_subtitle_multi_new(frame, subtitle_text, safe_zone, frame_width, frame_height, max_chars_per_line=40, opacity=0.8):
    """
    Render multi-line subtitles centered within the safe zone with a semi-transparent background.

    Parameters:
        frame (numpy array): The frame on which to render subtitles. It is
            modified in place when the background box is blended in.
        subtitle_text (str): The text to display.
        safe_zone (tuple): (x1, y1, x2, y2) defining subtitle placement.
            Float coordinates are accepted; drawing positions are truncated
            to integers.
        frame_width (int): Width of the frame.
        frame_height (int): Height of the frame.
        max_chars_per_line (int): Upper bound on characters per wrapped line;
            the effective limit is the smaller of this value and the number
            of average-width characters that fit in the safe zone (at least 1).
        opacity (float): Background opacity (0 = fully transparent, 1 = fully opaque).

    Returns:
        numpy array: The frame with subtitles rendered at an optimal position.
    """
    x1, y1, x2, y2 = safe_zone
    language = detect_language(subtitle_text)
    font_size = 28 if language == "cjk" else 26  # Slightly larger font for CJK characters
    font = get_font(font_size)

    # Right-to-left text: shape the letters now, but postpone the visual
    # reordering until after wrapping. Reordering the whole paragraph first
    # and wrapping afterwards would put the end of the sentence on the
    # first line.
    is_rtl = language == "arabic"
    if is_rtl:
        subtitle_text = arabic_reshaper.reshape(subtitle_text)

    # Width available for text, leaving 30 px of horizontal padding.
    max_text_width = x2 - x1 - 30

    # Average glyph width of the actual subtitle text.
    if len(subtitle_text) > 0:
        glyph_boxes = [font.getbbox(char) for char in subtitle_text]
        char_width = sum(box[2] - box[0] for box in glyph_boxes) / len(subtitle_text)
    else:
        char_width = font_size // 2  # Fallback for empty text
    if char_width <= 0:
        # Zero-width glyphs only: avoid dividing by zero below.
        char_width = font_size // 2

    # Characters that fit in the safe zone, capped by the caller's limit.
    # textwrap.wrap() rejects widths below 1, so clamp for very narrow zones.
    estimated_max_chars = max_text_width // char_width
    final_max_chars_per_line = max(1, int(min(estimated_max_chars, max_chars_per_line)))

    # Wrap each existing line separately so explicit line breaks are kept.
    wrapped_lines = []
    for line in subtitle_text.split("\n"):
        wrapped_lines.extend(textwrap.wrap(line, width=final_max_chars_per_line))

    if is_rtl:
        # Convert every wrapped line from logical to visual order.
        wrapped_lines = [get_display(line) for line in wrapped_lines]

    # Always render at least one (blank) line.
    if not wrapped_lines:
        wrapped_lines = [" "]

    # Measure the wrapped text.
    text_sizes = [font.getbbox(line) for line in wrapped_lines]
    text_width = max(size[2] - size[0] for size in text_sizes)  # Width (right - left)
    text_height = text_sizes[0][3] - text_sizes[0][1]  # Height of the first line
    total_text_height = sum(size[3] - size[1] for size in text_sizes) + (len(wrapped_lines) - 1) * 10  # 10 px line spacing

    # Center the text block inside the safe zone (shifted up by 20 px).
    # int() keeps the OpenCV drawing call valid when the zone has float coordinates.
    text_x = int(x1 + (x2 - x1 - text_width) // 2)
    text_y = int(y1 + (y2 - y1 - total_text_height) // 2 - 20)

    # Background box, clamped to the frame.
    bg_x1 = max(text_x - 15, 0)
    bg_y1 = max(text_y - 5, 0)
    bg_x2 = min(text_x + text_width + 15, frame_width - 1)
    bg_y2 = min(text_y + total_text_height + 15, frame_height - 1)

    # Blend a filled black rectangle into the frame.
    overlay = frame.copy()
    cv2.rectangle(overlay, (bg_x1, bg_y1), (bg_x2, bg_y2), (0, 0, 0), -1)
    cv2.addWeighted(overlay, opacity, frame, 1 - opacity, 0, frame)

    # Draw the text with PIL (for better font handling).
    frame_pil = Image.fromarray(frame)
    draw = ImageDraw.Draw(frame_pil)

    y_offset = text_y
    for line, size in zip(wrapped_lines, text_sizes):
        line_width = size[2] - size[0]
        line_x = x1 + (x2 - x1 - line_width) // 2  # Center each line individually
        draw.text((line_x, y_offset), line, font=font, fill=(255, 255, 255))  # White text
        y_offset += text_height + 10  # Advance by first-line height plus spacing

    return np.array(frame_pil)  # Back to a NumPy array

### Complete Pipeline for frames batch

In [220]:
import json

def get_pixel_pre_positions_from_json(json_path, frame_width, frame_height):
    """
    Load a percentage-based region layout from JSON and scale it to pixels.

    Every region in the file must provide "percentages" (x1, y1, x2, y2 as
    fractions of the frame size) and "priority". The fractions are multiplied
    by the frame width/height and truncated to integers.

    Args:
        json_path (str): Path to the JSON file containing percentages.
        frame_width (int): Width of the video frame.
        frame_height (int): Height of the video frame.

    Returns:
        dict: region name -> {"coordinates": [x1, y1, x2, y2], "priority": ...}.
    """
    with open(json_path, 'r') as f:
        layout = json.load(f)

    def to_pixels(fractions):
        """Scale (x1, y1, x2, y2) fractions to integer pixel coordinates."""
        left, top, right, bottom = fractions
        return [
            int(left * frame_width),
            int(top * frame_height),
            int(right * frame_width),
            int(bottom * frame_height),
        ]

    return {
        region: {
            "coordinates": to_pixels(entry["percentages"]),
            "priority": entry["priority"],
        }
        for region, entry in layout.items()
    }

In [221]:
import json
from collections import Counter, deque
import numpy as np
import cv2

# Rolling history of previously chosen zone names; maxlen=3 means only the
# three most recent entries are kept.
safe_zone_history = deque(maxlen=3)  # Stores past safe zones for consistency (last 3 entries)
# JSON file with the percentage-based subtitle regions.
region_json_path = "/content/optimal_subtitle_copied/subtitle_regions_scaled_test.json"

def process_frames_batch_3fps_processed(frames, subtitles, process_fps=3, video_fps=30):
    """
    Choose one safe-zone name for a batch of frames.

    - Samples the batch at roughly ``process_fps`` frames per second
    - Runs object detection on the sampled frames
    - Computes a safe zone for every sampled frame
    - Returns the zone name that occurs most often among those results and
      the recent history; ties go to the most recent entry

    The chosen name is appended to the module-level ``safe_zone_history``.

    Parameters:
        frames (list): Non-empty list of frames (NumPy arrays).
        subtitles (list): List of subtitles corresponding to each frame
            (currently not used by this function).
        process_fps (int): FPS at which YOLO will run.
        video_fps (int or float): Original FPS of the video.

    Returns:
        str: Name of the selected safe zone.
    """

    # Step 1: pick every `frame_interval`-th frame for detection.
    # int() handles float frame rates (e.g. 29.97) and max(1, ...) covers
    # video_fps < process_fps; range() needs a positive integer step.
    frame_interval = max(1, int(video_fps // process_fps))
    selected_indices = list(range(0, len(frames), frame_interval))

    if not selected_indices:  # Prevent empty selection
        selected_indices = [0]  # Process at least one frame

    selected_frames = [frames[i] for i in selected_indices]

    # Step 2: batch object detection on the sampled frames only.
    batch_detections = detect_objects(selected_frames)
    frame_height, frame_width = frames[0].shape[:2]

    pre_positions = get_pixel_pre_positions_from_json(region_json_path, frame_width, frame_height)

    # Step 3: one safe-zone name per sampled frame.
    subtitle_height, margin = get_subtitle_size(frame_height)

    batch_safe_zones = [
        calculate_safe_zone_with_prepositions_test_new(
            frame_width, frame_height, detections, pre_positions, subtitle_height, margin
        )[0]  # Keep only the position name
        for detections in batch_detections
    ]

    # Step 4: vote across this batch plus the recent history.
    combined_safe_zones = batch_safe_zones + list(safe_zone_history)
    print(combined_safe_zones)
    zone_counts = Counter(combined_safe_zones)

    if zone_counts:
        highest_frequency = max(zone_counts.values())
        top_zones = {zone for zone, count in zone_counts.items() if count == highest_frequency}

        # On a tie, prefer the entry closest to the end of the combined list.
        final_safe_zone = next((zone for zone in reversed(combined_safe_zones) if zone in top_zones), "bottom")
    else:
        final_safe_zone = "bottom"  # Default fallback

    # Remember the decision for the next batches.
    safe_zone_history.append(final_safe_zone)

    return final_safe_zone

In [222]:
import xml.etree.ElementTree as ET

def print_ttml_with_updated_regions(ttml_file_path, subtitle_data):
    """
    Print the TTML document after re-assigning the region of each <p> element.

    For every <p>, the first subtitle whose [start, end] interval contains
    the paragraph's begin time decides the region: a non-None region is
    written to the element, a None region removes any existing attribute.
    Paragraphs without a matching subtitle are left untouched.

    Parameters:
        ttml_file_path (str): Path to the TTML file.
        subtitle_data (list): List of subtitles in the format:
            [{"start": start_time, "end": end_time, "text": text, "region": "region_id"}]
    """
    root = ET.parse(ttml_file_path).getroot()
    namespaces = {'ttml': 'http://www.w3.org/ns/ttml'}

    for paragraph in root.findall('.//ttml:p', namespaces):
        begin_seconds = convert_ttml_time_to_seconds(paragraph.attrib.get("begin", "0.0s"))
        # The end time is converted as well, although its value is not used.
        convert_ttml_time_to_seconds(paragraph.attrib.get("end", "0.0s"))

        # First subtitle whose interval contains the paragraph's begin time.
        matched = None
        for candidate in subtitle_data:
            if candidate["start"] <= begin_seconds <= candidate["end"]:
                matched = candidate
                break

        if not matched:
            continue

        region = matched["region"]
        if region is not None:
            paragraph.attrib["region"] = region
        elif "region" in paragraph.attrib:
            del paragraph.attrib["region"]

    # Output goes to stdout; nothing is written to disk.
    print(ET.tostring(root, encoding="utf-8").decode("utf-8"))

In [223]:
import xml.etree.ElementTree as ET
import json
import math

def generate_updated_ttml(ttml_file_path, output_ttml_path, json_data, subtitle_data, frame_width, frame_height):
    """
    Build an updated TTML document and print it to stdout.

    The document gets a single replacement style ("s0"), a layout whose
    regions are rebuilt from ``json_data``, and a region assignment for every
    <p> element that matches an entry of ``subtitle_data``.

    Parameters:
        ttml_file_path (str): Path to the input TTML file.
        output_ttml_path (str): Intended output path. Currently unused: the
            result is printed instead of being written to disk.
        json_data (dict): region name -> {"coordinates": [x1, y1, x2, y2]}
            in pixels.
        subtitle_data (list): List of subtitles with "start", "end" and
            "region" keys.
        frame_width (int): Width of the video frame.
        frame_height (int): Height of the video frame.

    Returns:
        None
    """

    root = ET.parse(ttml_file_path).getroot()

    # Namespace of the root element ("" when the document is not namespaced).
    if root.tag.startswith("{"):
        namespace_uri = root.tag[1:].split("}")[0]
    else:
        namespace_uri = ""

    def qualified(tag):
        """Return the tag name in the document's own namespace."""
        return f"{{{namespace_uri}}}{tag}" if namespace_uri else tag

    # The new style/region elements below use literal "tts:" attribute names,
    # so the prefix has to be declared on the root element.
    if "xmlns:tts" not in root.attrib:
        root.set("xmlns:tts", "http://www.w3.org/ns/ttml#styling")

    # Find or create <head>.
    head_element = root.find(f".//{qualified('head')}")
    if head_element is None:
        head_element = ET.Element(qualified("head"))
        root.insert(0, head_element)  # Insert <head> as the first child

    # Find or create <styling>.
    styling_element = head_element.find(f".//{qualified('styling')}")
    if styling_element is None:
        styling_element = ET.Element(qualified("styling"))
        head_element.insert(0, styling_element)  # Insert before layout

    # Existing <style> children are always replaced by the single style below.
    for style in styling_element.findall(qualified("style")):
        styling_element.remove(style)

    new_style = ET.Element(qualified("style"), attrib={
        "xml:id": "s0",
        "tts:color": "white",
        "tts:fontSize": "70%",
        "tts:fontFamily": "sansSerif",
        "tts:backgroundColor": "black",
        "tts:displayAlign": "center",
        "tts:wrapOption": "wrap"
    })
    styling_element.append(new_style)

    # Find or create <layout>.
    layout_element = head_element.find(f".//{qualified('layout')}")
    if layout_element is None:
        layout_element = ET.Element(qualified("layout"))
        head_element.append(layout_element)

    # Drop everything currently inside <layout>; regions are rebuilt below.
    for region in list(layout_element):
        layout_element.remove(region)

    # One <region> per entry of json_data.
    for region_name, region_data in json_data.items():
        x1, y1, x2, y2 = region_data["coordinates"]

        # Convert absolute pixel values to TTML percentages.
        origin_x = (x1 / frame_width) * 100
        origin_y = (y1 / frame_height) * 100
        extent_x = ((x2 - x1) / frame_width) * 100
        extent_y = ((y2 - y1) / frame_height) * 100

        # Percentages are rounded up to whole numbers.
        region_element = ET.Element(qualified("region"), attrib={
            "tts:origin": f"{math.ceil(origin_x)}% {math.ceil(origin_y)}%",
            "tts:extent": f"{math.ceil(extent_x)}% {math.ceil(extent_y)}%",
            "tts:displayAlign": "center",
            "tts:textAlign": "center",
            "xml:id": region_name
        })

        layout_element.append(region_element)

    # Assign (or clear) the region of every <p> element.
    for p in root.findall(f".//{qualified('p')}"):
        start_time = convert_ttml_time_to_seconds(p.attrib.get("begin", "0.0s"))
        # The end time is converted too (malformed values still raise), but
        # only the begin time is used for matching.
        convert_ttml_time_to_seconds(p.attrib.get("end", "0.0s"))

        # First subtitle whose interval contains the paragraph's begin time.
        matched_subtitle = next((sub for sub in subtitle_data if sub["start"] <= start_time <= sub["end"]), None)

        if matched_subtitle:
            if matched_subtitle["region"] is not None:
                p.attrib["region"] = matched_subtitle["region"]
            elif "region" in p.attrib:
                del p.attrib["region"]  # A None region removes the attribute

    # Emit the result on stdout (writing to output_ttml_path is disabled).
    # tree.write(output_ttml_path, encoding="utf-8", xml_declaration=True)
    updated_ttml = ET.tostring(root, encoding="utf-8").decode("utf-8")
    print(updated_ttml)
## Testing the Code

### Integrate with srt file and video fps

In [224]:
# !pip install pysrt

In [225]:
def get_subtitles_for_frames(frame_times, subtitle_data):
    """
    Look up the subtitle text shown at each frame timestamp.

    Parameters:
        frame_times (list): List of timestamps (in seconds).
        subtitle_data (list): List of subtitles in the format:
            [{"start": start_time, "end": end_time, "text": text, "region": region}, ...]

    Returns:
        list: One string per timestamp: the text of the first subtitle whose
        [start, end] interval contains it (newlines replaced by spaces), or
        "" when no subtitle matches.
    """
    def text_at(timestamp):
        """Text of the first subtitle active at the timestamp, else ""."""
        for entry in subtitle_data:
            if entry["start"] <= timestamp <= entry["end"]:
                return entry["text"].replace("\n", " ")
        return ""

    return [text_at(timestamp) for timestamp in frame_times]

In [226]:
import subprocess

def get_video_fps(video_path):
    """
    Read the frame rate of a video from the output of ``ffmpeg -i``.

    The first stderr line that mentions "Stream", "Video" and "fps" is used;
    the token right before "fps" is returned as a float. If no such line
    exists, 30 is returned.
    """
    probe = subprocess.run(
        ["ffmpeg", "-i", video_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )

    for line in probe.stderr.split("\n"):
        if not ("Stream" in line and "Video" in line and "fps" in line):
            continue
        # Take the last whitespace-separated token in front of "fps".
        before_fps = line.split("fps")[0]
        return float(before_fps.strip().split()[-1])

    return 30  # Default to 30 FPS if not found

In [227]:
def should_process_frame(frame_time, subtitle_timestamps, fps, tolerance=0.01):
    """
    Tell whether a frame timestamp lies inside any subtitle interval.

    Each (start, end) interval is widened on both sides by a slack equal to
    ``tolerance``, capped at the duration of two frames.

    Returns True if the frame should be processed, False otherwise.
    """
    one_frame = 1 / fps  # Duration of a single frame in seconds
    slack = min(tolerance, 2 * one_frame)

    for start, end in subtitle_timestamps:
        if (start - slack) <= frame_time <= (end + slack):
            return True
    return False

In [228]:
## combined srt and ttml timestamp

import pysrt
import xml.etree.ElementTree as ET
import re

def get_subtitle_timestamps(subtitle_file, file_type="auto"):
    """
    Return the (start, end) timestamps of an SRT or TTML subtitle file.

    Parameters:
        subtitle_file (str): Path to the subtitle file.
        file_type (str): "srt" for SRT, "ttml" for TTML, or "auto" to detect
            from the extension (".srt" -> SRT; ".ttml" or ".xml" -> TTML).

    Returns:
        list of tuples: Each tuple contains (start_time, end_time) in seconds.

    Raises:
        ValueError: If the extension is not recognised in "auto" mode, or if
            ``file_type`` is neither "srt", "ttml" nor "auto".
    """
    # Resolve "auto" from the file extension.
    if file_type == "auto":
        if subtitle_file.endswith(".srt"):
            file_type = "srt"
        elif subtitle_file.endswith((".ttml", ".xml")):
            file_type = "ttml"
        else:
            raise ValueError("Unsupported subtitle file format. Use 'srt' or 'ttml'.")

    # Dispatch to the format-specific reader.
    if file_type == "srt":
        return get_srt_timestamps(subtitle_file)
    if file_type == "ttml":
        return get_ttml_timestamps(subtitle_file)

    raise ValueError("Invalid file type specified. Use 'srt' or 'ttml'.")

def get_srt_timestamps(srt_file):
    """
    Read an SRT file and return its cue times.

    Returns:
        list of tuples: (start_time, end_time) in seconds for each cue, in
        file order.
    """
    def as_seconds(stamp):
        """Combine the hours/minutes/seconds/milliseconds fields into seconds."""
        return stamp.hours * 3600 + stamp.minutes * 60 + stamp.seconds + stamp.milliseconds / 1000

    return [(as_seconds(cue.start), as_seconds(cue.end)) for cue in pysrt.open(srt_file)]

def get_ttml_timestamps(ttml_file):
    """Returns (begin, end) pairs in seconds for every TTML-namespace <p> element in the file."""
    document_root = ET.parse(ttml_file).getroot()
    namespaces = {'ttml': 'http://www.w3.org/ns/ttml'}

    # A <p> without a "begin"/"end" attribute falls back to "0.0s".
    return [
        (
            convert_ttml_time_to_seconds(paragraph.attrib.get("begin", "0.0s")),
            convert_ttml_time_to_seconds(paragraph.attrib.get("end", "0.0s")),
        )
        for paragraph in document_root.findall('.//ttml:p', namespaces)
    ]

# def convert_ttml_time_to_seconds(ttml_time):
#     """
#     Converts TTML time format (HH:MM:SS.mmm or MM:SS.mmm or SS.mmm or SS.mmm's') to seconds.

#     Parameters:
#         ttml_time (str): TTML-formatted time.

#     Returns:
#         float: Time in seconds.
#     """
#     ttml_time = ttml_time.rstrip('s')  # Remove trailing 's' if present
#     parts = ttml_time.split(":")

#     if len(parts) == 3:  # HH:MM:SS.mmm
#         hours, minutes, seconds = map(float, parts)
#     elif len(parts) == 2:  # MM:SS.mmm
#         hours, minutes, seconds = 0, *map(float, parts)
#     else:  # SS.mmm
#         hours, minutes, seconds = 0, 0, float(parts[0])

#     return hours * 3600 + minutes * 60 + seconds

def convert_ttml_time_to_seconds(ttml_time):
    """
    Converts a TTML time expression to seconds.

    Supported forms (',' is accepted as the decimal separator in place of '.'):
        * Clock time:  HH:MM:SS[.fff], MM:SS[.fff] or SS[.fff]
        * Offset time: a number followed by 'h', 'm', 's' or 'ms'
          (e.g. "1.5s", "500ms", "2m")

    Parameters:
        ttml_time (str): TTML-formatted time.

    Returns:
        float: Time in seconds.

    Raises:
        ValueError: If the string matches none of the supported forms. This
            includes frame- or tick-based values ("00:00:01:12", "30f",
            "100t"), which cannot be converted without a frame/tick rate.
    """
    # ✅ Normalise surrounding whitespace and the decimal separator
    normalized = ttml_time.strip().replace(',', '.')

    # ✅ Offset time: <number><metric>. Checked first so that "500ms" is not
    #    mistaken for 500 seconds and "2m" is not mistaken for 2 seconds.
    offset = re.fullmatch(r"(\d+(?:\.\d+)?)(h|ms|m|s)", normalized)
    if offset:
        amount = float(offset.group(1))
        metric = offset.group(2)
        if metric == "ms":
            return amount / 1000.0
        return amount * {"h": 3600.0, "m": 60.0, "s": 1.0}[metric]

    # ✅ Clock time. The optional groups are nested so that a two-field value
    #    ("MM:SS") fills minutes rather than hours, and the whole string must
    #    match so trailing junk is rejected instead of silently ignored.
    clock = re.fullmatch(r"(?:(?:(\d+):)?(\d+):)?(\d+(?:\.\d+)?)", normalized)
    if not clock:
        raise ValueError(f"Invalid TTML time format: {ttml_time}")

    hours = int(clock.group(1) or 0)
    minutes = int(clock.group(2) or 0)
    # The fraction is parsed as part of the seconds value, so ".5" means half
    # a second regardless of how many digits are written.
    seconds = float(clock.group(3))

    return hours * 3600 + minutes * 60 + seconds

In [229]:
import xml.etree.ElementTree as ET
import pysrt
import os

def parse_subtitle_file(file_path):
    """
    Parses either an SRT or TTML subtitle file and extracts subtitles.

    The format is chosen from the file extension (case-insensitive):
    ".srt" is read with pysrt; ".ttml" and ".xml" are read as TTML, matching
    the extensions that get_subtitle_timestamps treats as TTML.

    Parameters:
        file_path (str): Path to the subtitle file.

    Returns:
        list: List of subtitles in the format:
            [
                {"start": start_time, "end": end_time, "text": "subtitle text", "region": "region_id"}
            ]
        "start"/"end" are in seconds; "region" is None for SRT entries and
        for TTML <p> elements without a "region" attribute.

    Raises:
        ValueError: If the extension is not .srt, .ttml or .xml.
    """
    extension = os.path.splitext(file_path)[-1].lower()
    subtitle_data = []

    if extension == ".srt":
        subs = pysrt.open(file_path)
        for sub in subs:
            start_time = (
                sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds + sub.start.milliseconds / 1000
            )
            end_time = (
                sub.end.hours * 3600 + sub.end.minutes * 60 + sub.end.seconds + sub.end.milliseconds / 1000
            )
            text = sub.text.replace("\n", " ")  # Convert newlines to spaces

            subtitle_data.append({
                "start": start_time,
                "end": end_time,
                "text": text,
                "region": None  # SRT doesn't support regions
            })

    elif extension in (".ttml", ".xml"):
        # ✅ Register TTML Namespaces (process-wide ElementTree prefix mapping)
        ET.register_namespace('', "http://www.w3.org/ns/ttml")  # Default TTML namespace
        ET.register_namespace('ttp', "http://www.w3.org/ns/ttml#parameter")
        ET.register_namespace('tts', "http://www.w3.org/ns/ttml#styling")
        ET.register_namespace('ttm', "http://www.w3.org/ns/ttml#metadata")

        # ✅ Parse TTML File
        tree = ET.parse(file_path)
        root = tree.getroot()
        ns = {'ttml': 'http://www.w3.org/ns/ttml'}

        # ✅ Extract Subtitle Data
        for p in root.findall('.//ttml:p', ns):
            # Missing "begin"/"end" attributes fall back to "0.0s".
            start_time = convert_ttml_time_to_seconds(p.attrib.get("begin", "0.0s"))
            end_time = convert_ttml_time_to_seconds(p.attrib.get("end", "0.0s"))
            # Text of the <p> and all nested elements, joined with spaces.
            text = " ".join(p.itertext()).strip()
            region = p.attrib.get("region", None)

            subtitle_data.append({
                "start": start_time,
                "end": end_time,
                "text": text,
                "region": region
            })

    else:
        raise ValueError("Unsupported subtitle format. Only SRT and TTML are supported.")

    return subtitle_data

In [230]:
# import os
# import cv2
# import time

# # ✅ Start Load Timer
# start_load_time = time.time()

# # ✅ Define file paths
# video_input_path = "/content/optimal_subtitle_copied/test_video_4.mp4"
# file_path = "/content/optimal_subtitle_copied/TTML_file/test_video_4_eng.ttml"
# output_path = "/content/output.ttml"

# # ✅ Load Video Metadata
# fps = get_video_fps(video_input_path)
# print(f"✅ Corrected FPS: {fps}")

# subtitle_data = parse_subtitle_file(file_path)
# subtitle_timestamps = get_subtitle_timestamps(file_path)

# cap = cv2.VideoCapture(video_input_path)
# frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
# frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# print(f"🎞 Frame Dimensions: {frame_width}x{frame_height}")

# total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# video_duration = total_frames / fps

# # ✅ End Load Timer
# end_load_time = time.time()
# load_duration = end_load_time - start_load_time
# print(f"📦 Total Load Time: {load_duration:.2f} seconds")

# # ✅ Start Run Timer
# start_run_time = time.time()

# frame_buffer = []
# timestamp_buffer = []
# subtitle_index = 0
# frame_number = 0
# total_video_read_time = 0
# total_yolo_time = 0
# total_region_assign_time = 0

# while cap.isOpened():
#     read_start = time.time()
#     ret, frame = cap.read()
#     read_end = time.time()

#     total_video_read_time += (read_end - read_start)
#     if not ret:
#         break

#     frame_time = frame_number / fps

#     if subtitle_index < len(subtitle_data):
#         current_subtitle = subtitle_data[subtitle_index]
#         current_start = current_subtitle["start"]
#         current_end = current_subtitle["end"]

#         if current_start <= frame_time <= current_end:
#             frame_buffer.append(frame)
#             timestamp_buffer.append(frame_time)

#         if frame_time > current_end:
#             if frame_buffer:
#                 subtitles = [current_subtitle]

#                 detect_start = time.time()
#                 processed_frames = process_frames_batch_3fps_processed(frame_buffer, subtitles)
#                 detect_end = time.time()
#                 total_yolo_time += (detect_end - detect_start)

#                 region_start = time.time()
#                 current_subtitle["region"] = processed_frames
#                 region_end = time.time()
#                 total_region_assign_time += (region_end - region_start)

#                 # print(f"⏱ Processed '{current_subtitle['text']}' in {detect_end - detect_start:.2f}s")

#                 frame_buffer.clear()
#                 timestamp_buffer.clear()
#                 subtitle_index += 1

#     frame_number += 1

# # ✅ Final batch
# if frame_buffer and subtitle_index < len(subtitle_data):
#     subtitles = [subtitle_data[subtitle_index]]
#     detect_start = time.time()
#     processed_frames = process_frames_batch_3fps_processed(frame_buffer, subtitles)
#     detect_end = time.time()
#     total_yolo_time += (detect_end - detect_start)
#     subtitle_data[subtitle_index]["region"] = processed_frames

# cap.release()

# # ✅ End Run Timer
# end_run_time = time.time()
# run_duration = end_run_time - start_run_time

# # ✅ Generate TTML Layout
# ttml_gen_start = time.time()
# layout = get_used_safe_zones()
# generate_updated_ttml(file_path, output_path, layout, subtitle_data, frame_width, frame_height)
# ttml_gen_end = time.time()
# ttml_generation_time = ttml_gen_end - ttml_gen_start

# minutes, seconds = divmod(video_duration, 60)

# # ✅ Final Timing Summary
# print("\n📊 PROFILING SUMMARY")
# print(f"🎬 Video Duration: {int(minutes)}m {int(seconds)}s")
# print(f"📦 Load Time: {load_duration:.2f}s")
# print(f"🚀 Run Time: {run_duration:.2f}s")
# print(f"📥 Video Read Time: {total_video_read_time:.2f}s")
# print(f"🔍 YOLO Detection Time: {total_yolo_time:.2f}s")
# print(f"📐 Region Assignment Time: {total_region_assign_time:.2f}s")
# print(f"📝 TTML Generation Time: {ttml_generation_time:.2f}s")
# print(f"✅ Output TTML saved to: {output_path}")

In [232]:
import os
import cv2
import time

# Pipeline driver: reads the video frame by frame, buffers the frames that
# fall inside each subtitle's [start, end] window, hands each buffer to
# process_frames_batch_3fps_processed, stores the result as the subtitle's
# "region", then writes an updated TTML file and prints a timing summary.

# ✅ Optional Resize Parameter (None = no resize)
# resize_resolution = (640, 360)  # Example: downscale to 640x360; set to None to disable resizing
# resize_resolution = (1280, 720)
# resize_resolution = (3840, 2160)
resize_resolution = None

# ✅ Start Load Timer
start_load_time = time.time()

# ✅ Define file paths
video_input_path = "/content/optimal_subtitle_copied/test_video_4.mp4"
file_path = "/content/optimal_subtitle_copied/TTML_file/test_video_4_eng.ttml"
output_path = "/content/output.ttml"

# ✅ Load Video Metadata
fps = get_video_fps(video_input_path)
print(f"✅ Corrected FPS: {fps}")

subtitle_data = parse_subtitle_file(file_path)
subtitle_timestamps = get_subtitle_timestamps(file_path)

cap = cv2.VideoCapture(video_input_path)
if not cap.isOpened():
    # Fail loudly rather than generating a layout from zero frames.
    raise IOError(f"Could not open video file: {video_input_path}")

# ✅ Resolution reported by the container (overwritten below when resizing is enabled)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(f"🎞 Frame Dimensions: {frame_width}x{frame_height}")

total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
video_duration = total_frames / fps

# ✅ End Load Timer
end_load_time = time.time()
load_duration = end_load_time - start_load_time
print(f"📦 Total Load Time: {load_duration:.2f} seconds")

# ✅ Start Run Timer
start_run_time = time.time()

frame_buffer = []
timestamp_buffer = []
subtitle_index = 0
frame_number = 0
total_video_read_time = 0
total_yolo_time = 0
total_region_assign_time = 0

try:
    while cap.isOpened():
        read_start = time.time()
        ret, frame = cap.read()
        read_end = time.time()

        total_video_read_time += (read_end - read_start)
        if not ret:
            break

        # ✅ Resize frame for faster processing
        if resize_resolution:
            frame = cv2.resize(frame, resize_resolution)
            # NOTE(review): this replaces the original dimensions, so the TTML
            # is generated with the resized width/height — confirm that is
            # what generate_updated_ttml expects.
            frame_width, frame_height = resize_resolution

        # Timestamp derived from the frame counter, not from the container.
        frame_time = frame_number / fps

        # ✅ Close out every subtitle that ended before this frame. The index
        #    advances even when no frame was captured for a subtitle, so one
        #    very short subtitle can no longer stall all the ones after it.
        while subtitle_index < len(subtitle_data) and frame_time > subtitle_data[subtitle_index]["end"]:
            current_subtitle = subtitle_data[subtitle_index]

            if frame_buffer:
                subtitles = [current_subtitle]

                detect_start = time.time()
                processed_frames = process_frames_batch_3fps_processed(frame_buffer, subtitles)
                detect_end = time.time()
                total_yolo_time += (detect_end - detect_start)

                region_start = time.time()
                current_subtitle["region"] = processed_frames
                region_end = time.time()
                total_region_assign_time += (region_end - region_start)

                frame_buffer.clear()
                timestamp_buffer.clear()

            subtitle_index += 1

        # ✅ Buffer the frame if it lies inside the current subtitle's window.
        #    Done after the flush above so the frame that closes one subtitle
        #    can still be the first frame of the next.
        if subtitle_index < len(subtitle_data):
            current_subtitle = subtitle_data[subtitle_index]
            current_start = current_subtitle["start"]
            current_end = current_subtitle["end"]

            if current_start <= frame_time <= current_end:
                frame_buffer.append(frame)
                timestamp_buffer.append(frame_time)

        frame_number += 1

    # ✅ Final batch (video ended while a subtitle was still on screen)
    if frame_buffer and subtitle_index < len(subtitle_data):
        subtitles = [subtitle_data[subtitle_index]]
        detect_start = time.time()
        processed_frames = process_frames_batch_3fps_processed(frame_buffer, subtitles)
        detect_end = time.time()
        total_yolo_time += (detect_end - detect_start)
        subtitle_data[subtitle_index]["region"] = processed_frames
finally:
    # Release the capture even if detection raises part-way through.
    cap.release()

# ✅ End Run Timer
end_run_time = time.time()
run_duration = end_run_time - start_run_time

# ✅ Generate TTML Layout (frame_width/frame_height are the resize target when resizing is enabled)
ttml_gen_start = time.time()
layout = get_used_safe_zones()
generate_updated_ttml(file_path, output_path, layout, subtitle_data, frame_width, frame_height)
ttml_gen_end = time.time()
ttml_generation_time = ttml_gen_end - ttml_gen_start

minutes, seconds = divmod(video_duration, 60)

# ✅ Final Timing Summary
print("\n📊 PROFILING SUMMARY")
print(f"🎬 Video Duration: {int(minutes)}m {int(seconds)}s")
print(f"📦 Load Time: {load_duration:.2f}s")
print(f"🚀 Run Time: {run_duration:.2f}s")
print(f"📥 Video Read Time: {total_video_read_time:.2f}s")
print(f"🔍 YOLO Detection Time: {total_yolo_time:.2f}s")
print(f"📐 Region Assignment Time: {total_region_assign_time:.2f}s")
print(f"📝 TTML Generation Time: {ttml_generation_time:.2f}s")
print(f"✅ Output TTML saved to: {output_path}")

✅ Corrected FPS: 29.97
🎞 Frame Dimensions: 1280x720
📦 Total Load Time: 0.12 seconds
['above_3', 'above_3', 'above_3', 'above_3', 'above_3', 'above_3', 'above_3', 'shifted_middle3', 'shifted_middle3', 'shifted_middle3']
['above_3', 'above_3', 'above_3', 'middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'shifted_middle3', 'shifted_middle3', 'above_3']
['middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'shifted_middle3', 'above_3', 'middle3']
['middle3', 'middle3', 'above_3', 'middle3', 'middle3', 'middle3', 'middle3', 'above_3', 'middle3', 'middle3']
['middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'middle3', 'shifted_middle3', 'shifted_middle3', 'middle3', 'middle3', 'middle3']
['shifted_middle3', 'shifted_middle3', 'shifted_middle3', 'shifted_middle3', 'shifted_middle3', 'shifted_middle3', 'shifted_middle3', 'shifted_middle3', 'shifted_middle3', 'middle3', 'middle3', 'middle3']
['shifted_middl

In [481]:
# ! pip install av

In [482]:
# import av
# import time
# import numpy as np

# # ✅ Start Load Timer
# start_load_time = time.time()

# # ✅ Define file paths
# video_input_path = "/content/optimal_subtitle_copied/test_video_4.mp4"
# file_path = "/content/optimal_subtitle_copied/TTML_file/test_video_4_eng.ttml"
# output_path = "/content/output.ttml"

# # ✅ Load Metadata
# fps = get_video_fps(video_input_path)
# print(f"✅ Corrected FPS: {fps}")

# subtitle_data = parse_subtitle_file(file_path)

# container = av.open(video_input_path)
# video_stream = container.streams.video[0]
# frame_width = video_stream.codec_context.width
# frame_height = video_stream.codec_context.height
# print(f"🎞 Frame Dimensions: {frame_width}x{frame_height}")

# total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# video_duration = total_frames / fps

# # ✅ End Load Timer
# end_load_time = time.time()
# load_duration = end_load_time - start_load_time
# print(f"📦 Total Load Time: {load_duration:.2f} seconds")

# # ✅ Start Run Timer
# start_run_time = time.time()

# # ✅ Initialize frame buffers for each subtitle
# for subtitle in subtitle_data:
#     subtitle["frames"] = []
#     subtitle["timestamps"] = []

# total_video_read_time = 0
# total_yolo_time = 0
# total_region_assign_time = 0

# # ✅ Decode frames linearly
# for frame in container.decode(video=0):
#     frame_pts = float(frame.pts * frame.time_base)

#     read_start = time.time()
#     img = frame.to_ndarray(format="bgr24")
#     read_end = time.time()
#     total_video_read_time += (read_end - read_start)

#     # ✅ Append to all matching subtitle intervals
#     for subtitle in subtitle_data:
#         if subtitle["start"] <= frame_pts <= subtitle["end"]:
#             subtitle["frames"].append(img)
#             subtitle["timestamps"].append(frame_pts)

# container.close()

# # ✅ Process each subtitle's collected frames
# for subtitle in subtitle_data:
#     if subtitle["frames"]:
#         detect_start = time.time()
#         processed_frames = process_frames_batch_3fps_processed(subtitle["frames"], [subtitle])
#         detect_end = time.time()
#         total_yolo_time += (detect_end - detect_start)

#         region_start = time.time()
#         subtitle["region"] = processed_frames
#         region_end = time.time()
#         total_region_assign_time += (region_end - region_start)

# # ✅ End Run Timer
# end_run_time = time.time()
# run_duration = end_run_time - start_run_time

# # ✅ Generate TTML Layout
# ttml_gen_start = time.time()
# layout = get_used_safe_zones()
# generate_updated_ttml(file_path, output_path, layout, subtitle_data, frame_width, frame_height)
# ttml_gen_end = time.time()
# ttml_generation_time = ttml_gen_end - ttml_gen_start

# minutes, seconds = divmod(video_duration, 60)

# # ✅ Final Timing Summary
# print("\n📊 PROFILING SUMMARY")
# print(f"🎬 Video Duration: {int(minutes)}m {int(seconds)}s")
# print(f"🚀 Run Time: {run_duration:.2f}s")
# print(f"📥 Video Read Time: {total_video_read_time:.2f}s")
# print(f"🔍 YOLO Detection Time: {total_yolo_time:.2f}s")
# print(f"📐 Region Assignment Time: {total_region_assign_time:.2f}s")
# print(f"📝 TTML Generation Time: {ttml_generation_time:.2f}s")
# print(f"✅ Output TTML saved to: {output_path}")