<a href="https://colab.research.google.com/github/emanueltay/Optimal_Subtitle_Placement/blob/main/caption_placement_preposition_testing_18_mar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!unzip /content/optimal_subtitle_copied.zip -d /content/

# !unzip /content/test_long_video.zip -d /content/

Archive:  /content/optimal_subtitle_copied.zip
   creating: /content/optimal_subtitle_copied/
   creating: /content/optimal_subtitle_copied/TTML_file/
  inflating: /content/optimal_subtitle_copied/TTML_file/test_video_3_eng.ttml  
  inflating: /content/optimal_subtitle_copied/subtitle_positions_ttml.json  
   creating: /content/optimal_subtitle_copied/Font/
  inflating: /content/optimal_subtitle_copied/Font/GoNotoKurrent-Regular.ttf  
  inflating: /content/optimal_subtitle_copied/Font/OpenSans-Italic-VariableFont_wdth,wght.ttf  
  inflating: /content/optimal_subtitle_copied/Font/OpenSans-VariableFont_wdth,wght.ttf  
  inflating: /content/optimal_subtitle_copied/Font/DejaVuSans-Bold.ttf  
  inflating: /content/optimal_subtitle_copied/Font/NotoNaskhArabic-Regular.ttf  
  inflating: /content/optimal_subtitle_copied/Font/NotoSansDevanagari-Regular.ttf  
  inflating: /content/optimal_subtitle_copied/Font/NotoSansCJK-Regular.ttc  
  inflating: /content/optimal_subtitle_copied/Font/NotoSansTh

In [2]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.93-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading nv

### Detect Key Objects in the Frame

In [3]:
# ! pip install ultralytics

from ultralytics import YOLO
import cv2
import torch

# Load the trained detection weights used by detect_objects() below.
model = YOLO("/content/optimal_subtitle_copied/best_100.pt")
# Alternative weights file, kept for reference:
# model = YOLO("/content/optimal_subtitle_copied/best.pt")

# Pick the GPU when PyTorch reports CUDA as available, otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Move the model to the selected device and keep its weights in FP32.
model.to(device).float()  # Use FP32 (FP16 can cause issues on CPU)

# cuDNN flags: enable cuDNN and its benchmark mode.
torch.backends.cudnn.benchmark = True  # Optimize for fixed-size inputs
torch.backends.cudnn.enabled = True
# Passes the current thread count straight back in, so the setting keeps its current value.
torch.set_num_threads(torch.get_num_threads())  # Use optimal number of CPU threads


def detect_objects(frames):
    """
    Perform batch object detection on multiple frames.

    Parameters:
        frames (list): List of frames (each a NumPy array).

    Returns:
        list: One entry per input frame, in input order. Each entry is the
              NumPy array taken from that result's `boxes.data`.
              An empty list is returned when `frames` is empty.
    """
    # Nothing to detect: skip inference instead of requesting a batch of size 0.
    if len(frames) == 0:
        return []

    # Run the whole list through the model as a single batch.
    results = model(frames, batch=len(frames), imgsz=640, verbose=True)

    # Bring each frame's detections back to the CPU as a NumPy array.
    return [result.boxes.data.cpu().numpy() for result in results]

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


### Define Safe Zones for Subtitle Placement

In [41]:
import json

# ✅ Global in-memory cache for dynamic & shifted safe zones
safe_zone_cache = {}
used_safe_zones = {}  # Dictionary to store all assigned safe zones

def calculate_safe_zone_with_prepositions_test_new(frame_width, frame_height, detections, pre_positions, subtitle_height=30, margin=10, shift_x=20):
    """
    Calculate the safe zone for subtitle placement using pre-defined positions.

    Search order:
      1. Each pre-defined position, highest "priority" first. If the position
         itself is blocked, it is shifted left and then right (up to 10
         attempts per direction, growing the shift by 1.5x each attempt).
      2. A full-width strip starting at the bottom of the frame and moving up.
      3. A full-width strip at the top of the frame (returned even if blocked).

    Every result is stored in `safe_zone_cache` and recorded in `used_safe_zones`.

    Parameters:
        frame_width, frame_height: Frame size in pixels.
        detections: Iterable of detections; the first four values of each are
            used as the box (x1, y1, x2, y2).
        pre_positions (dict): name -> {"coordinates": [x1, y1, x2, y2], "priority": n}.
        subtitle_height, margin: Size of the fallback strip and its spacing.
        shift_x: Initial horizontal shift, restarted for every position/direction.

    Returns:
        tuple: (position_name, coordinates)
    """

    def zones_overlap(zone1, zone2):
        """Checks if two zones overlap (touching edges count as overlap)."""
        x1a, y1a, x2a, y2a = zone1
        x1b, y1b, x2b, y2b = zone2
        return not (x2a < x1b or x1a > x2b or y2a < y1b or y1a > y2b)

    def remember(name, zone):
        """Cache the result for this input and record the zone as used."""
        outcome = (name, zone)
        safe_zone_cache[cache_key] = outcome
        used_safe_zones[name] = {"coordinates": list(zone)}
        return outcome

    # ✅ Step 1: Reuse a previously computed result for identical inputs.
    # The key covers every argument that influences the result, so a call with
    # different pre-positions or sizes never receives a stale answer.
    cache_key = (
        frame_width,
        frame_height,
        tuple(tuple(d) for d in detections),
        tuple(sorted(
            (name, tuple(pos["coordinates"]), pos.get("priority", 0))
            for name, pos in pre_positions.items()
        )),
        subtitle_height,
        margin,
        shift_x,
    )
    if cache_key in safe_zone_cache:
        return safe_zone_cache[cache_key]

    # Narrowest allowed zone: 60% of frame width, but at least 600px.
    min_width = max(0.6 * frame_width, 600)

    # ✅ Step 2: Try Predefined Positions
    for position_name, position in sorted(pre_positions.items(), key=lambda x: x[1].get("priority", 0), reverse=True):
        x1, y1, x2, y2 = position["coordinates"]

        # Use the original pre-position when nothing overlaps it.
        if not any(zones_overlap((x1, y1, x2, y2), detection[:4]) for detection in detections):
            return remember(position_name, (x1, y1, x2, y2))

        # ✅ Try shifting left and right before moving to the next position.
        for shift_dir in ("left", "right"):
            # Restart from the requested shift for every direction so that a
            # failed search on one side does not inflate the step on the other.
            step = shift_x
            for _ in range(10):
                if shift_dir == "left":
                    new_x1, new_x2 = max(0, x1 - step), max(min_width, x2 - step)
                else:
                    new_x1, new_x2 = min(frame_width - min_width, x1 + step), min(frame_width, x2 + step)

                shifted_zone = (new_x1, y1, new_x2, y2)

                if new_x2 > new_x1 and not any(zones_overlap(shifted_zone, detection[:4]) for detection in detections):
                    return remember(f"shifted_{position_name}", shifted_zone)

                step *= 1.5  # Widen the shift for the next attempt

    # ✅ Step 3: No Predefined Positions Worked, Try Dynamic Safe Zone
    proposed_safe_zone = (0, frame_height - subtitle_height - margin, frame_width, frame_height - margin)

    while True:
        if all(not zones_overlap(proposed_safe_zone, (int(d[0]), int(d[1]), int(d[2]), int(d[3]))) for d in detections):
            return remember("fallback_bottom", proposed_safe_zone)

        # Try shifting up by one strip height plus margin.
        x1, y1, x2, y2 = proposed_safe_zone
        new_y1 = y1 - subtitle_height - margin
        new_y2 = y2 - subtitle_height - margin

        # Stop at the top of the frame, or when the strip cannot move at all
        # (a non-positive step would otherwise loop forever).
        if new_y1 < 0 or new_y1 >= y1:
            break

        proposed_safe_zone = (x1, new_y1, x2, new_y2)

    # ✅ Step 4: Final Fallback to Top and Cache It
    return remember("fallback_top", (0, margin, frame_width, subtitle_height + margin))

def get_used_safe_zones():
    """
    Build a fresh dictionary of every safe zone recorded in `used_safe_zones`.

    Only the "coordinates" entry of each recorded zone is carried over, so the
    result holds no other fields (such as "priority").

    Returns:
        dict: zone name -> {"coordinates": [...]}.
    """
    zones_for_layout = {}
    for zone_name, zone_info in used_safe_zones.items():
        zones_for_layout[zone_name] = {"coordinates": zone_info["coordinates"]}
    return zones_for_layout

In [5]:
import cv2
import textwrap

def calculate_safe_zone_with_prepositions_test(frame_width, frame_height, detections, pre_positions, subtitle_height=30, margin=10, shift_x=20):
    """
    Calculate the safe zone for subtitle placement using pre-defined positions.

    Search order:
      1. Each pre-defined position, highest "priority" first. If the position
         itself is blocked, it is shifted left and then right (up to 10
         attempts per direction, growing the shift by 1.5x each attempt).
      2. A full-width strip starting at the bottom of the frame and moving up.
      3. A full-width strip at the top of the frame (returned even if blocked).

    Parameters:
        frame_width, frame_height: Frame size in pixels.
        detections: Iterable of detections; the first four values of each are
            used as the box (x1, y1, x2, y2).
        pre_positions (dict): name -> {"coordinates": [x1, y1, x2, y2], "priority": n}.
        subtitle_height, margin: Size of the fallback strip and its spacing.
        shift_x: Initial horizontal shift, restarted for every position/direction.

    Returns:
        tuple: Coordinates of the safe zone (x1, y1, x2, y2).
    """

    def zones_overlap(zone1, zone2):
        """Checks if two zones overlap (touching edges count as overlap)."""
        x1a, y1a, x2a, y2a = zone1
        x1b, y1b, x2b, y2b = zone2
        return not (x2a < x1b or x1a > x2b or y2a < y1b or y1a > y2b)

    # Narrowest allowed zone: 60% of frame width, but at least 600px.
    min_width = max(0.6 * frame_width, 600)

    # Step 1: Try Predefined Positions
    for position_name, position in sorted(pre_positions.items(), key=lambda x: x[1].get("priority", 0), reverse=True):
        x1, y1, x2, y2 = position["coordinates"]

        # Use the original pre-position when nothing overlaps it.
        if not any(zones_overlap((x1, y1, x2, y2), detection[:4]) for detection in detections):
            return (x1, y1, x2, y2)

        for shift_dir in ("left", "right"):
            # Restart from the requested shift for every direction so that a
            # failed search on one side does not inflate the step on the other.
            step = shift_x
            for _ in range(10):
                if shift_dir == "left":
                    # Clamp at the left frame edge (0), mirroring the right-hand clamp.
                    new_x1, new_x2 = max(0, x1 - step), max(min_width, x2 - step)
                else:
                    new_x1, new_x2 = min(frame_width - min_width, x1 + step), min(frame_width, x2 + step)

                shifted_zone = (new_x1, y1, new_x2, y2)

                if new_x2 > new_x1 and not any(zones_overlap(shifted_zone, detection[:4]) for detection in detections):
                    print(f"✅ Shifted {position_name} {shift_dir} and found a free spot.")
                    return shifted_zone

                step *= 1.5  # Widen the shift for the next attempt

    # Step 2: Fallback to Dynamic Safe Zone Calculation (Starting from Bottom)
    proposed_safe_zone = (0, frame_height - subtitle_height - margin, frame_width, frame_height - margin)

    while True:
        if all(not zones_overlap(proposed_safe_zone, (int(d[0]), int(d[1]), int(d[2]), int(d[3]))) for d in detections):
            return proposed_safe_zone

        # Try shifting up by one strip height plus margin.
        x1, y1, x2, y2 = proposed_safe_zone
        new_y1 = y1 - subtitle_height - margin
        new_y2 = y2 - subtitle_height - margin

        # Stop at the top of the frame, or when the strip cannot move at all
        # (a non-positive step would otherwise loop forever).
        if new_y1 < 0 or new_y1 >= y1:
            break

        proposed_safe_zone = (x1, new_y1, x2, new_y2)

    # Step 3: Final fallback to the top of the frame
    return (0, margin, frame_width, subtitle_height + margin)

### Subtitle size and margin

In [6]:
def get_subtitle_size(frame_height):
    """
    Derive the subtitle strip height and its margin from the frame height.

    The height is 5% of the frame height (never below 18px) and the margin is
    2% of the frame height (never below 5px). Both are truncated to integers.

    Parameters:
        frame_height (int): Height of the video frame.

    Returns:
        tuple: (subtitle_height, margin)
    """
    # (fraction of frame height, minimum in pixels) for height and margin.
    sizing_rules = ((0.05, 18), (0.02, 5))
    scaled = [max(fraction * frame_height, floor_px) for fraction, floor_px in sizing_rules]
    height_px, margin_px = (int(value) for value in scaled)
    return height_px, margin_px

### Subtitle character calculation

In [7]:
!pip install arabic-reshaper

Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: arabic-reshaper
Successfully installed arabic-reshaper-3.0.0


In [8]:
!pip install python-bidi

Collecting python-bidi
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (292 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.9/292.9 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-bidi
Successfully installed python-bidi-0.6.6


In [9]:
import cv2
import re
import textwrap
import numpy as np
import arabic_reshaper
from bidi.algorithm import get_display
from PIL import ImageFont, ImageDraw, Image

## universal font that fits all languages
def get_font(font_size):
    """
    Load the bundled GoNotoKurrent-Regular TrueType font at the given size.

    Parameters:
        font_size (int): The desired font size.

    Returns:
        PIL.ImageFont: The loaded font.
    """
    return ImageFont.truetype(
        "/content/optimal_subtitle_copied/Font/GoNotoKurrent-Regular.ttf",
        font_size,
    )

# **2️⃣ Function to Detect Language Type**
def detect_language(text):
    """
    Classify the script of a subtitle from its first two words.

    Scripts are checked in a fixed order (Arabic, Chinese, Japanese kana,
    Korean, Devanagari, Thai); the first script with at least one matching
    character in the sample wins.

    Returns:
        str: "arabic", "cjk", "indic" or "thai"; "latin" when none match.
    """
    # Only the first two words are inspected.
    leading_words = re.findall(r'\b\w+\b', text)[:2]
    sample = " ".join(leading_words)

    # (first code point, last code point, label), in checking order.
    script_ranges = (
        ("\u0600", "\u06FF", "arabic"),  # Arabic
        ("\u4E00", "\u9FFF", "cjk"),     # Chinese
        ("\u3040", "\u30FF", "cjk"),     # Japanese
        ("\uAC00", "\uD7AF", "cjk"),     # Korean
        ("\u0900", "\u097F", "indic"),   # Devanagari
        ("\u0E00", "\u0E7F", "thai"),    # Thai
    )

    for low, high, label in script_ranges:
        for ch in sample:
            if low <= ch <= high:
                return label

    return "latin"

# **3️⃣ Main Subtitle Rendering Function**
def render_subtitle_multi_new(frame, subtitle_text, safe_zone, frame_width, frame_height, max_chars_per_line=40, opacity=0.8):
    """
    Render multi-line subtitles centered within the safe zone with a semi-transparent background.

    Parameters:
        frame (numpy array): The frame on which to render subtitles. The
            background box is blended into this array in place.
        subtitle_text (str): The text to display.
        safe_zone (tuple): (x1, y1, x2, y2) defining subtitle placement.
            Values are truncated to integers before drawing.
        frame_width (int): Width of the frame.
        frame_height (int): Height of the frame.
        max_chars_per_line (int): Upper bound on characters per wrapped line.
        opacity (float): Weight given to the black background box when it is
            blended over the frame (0 = box invisible, 1 = solid black box).

    Returns:
        numpy array: The frame with subtitles rendered inside the safe zone.
    """
    # The safe zone may contain floats; drawing coordinates must be integers.
    x1, y1, x2, y2 = (int(value) for value in safe_zone)

    language = detect_language(subtitle_text)  # **Detect language**
    font_size = 28 if language == "cjk" else 26  # Adjust font size for CJK characters
    font = get_font(font_size)

    # **Handle Right-to-Left (RTL) text (e.g., Arabic)**
    if language == "arabic":
        subtitle_text = get_display(arabic_reshaper.reshape(subtitle_text))

    # **Calculate max width available for text**
    max_text_width = x2 - x1 - 30  # Ensure padding (30)

    # **Estimate Average Character Width Dynamically Using Subtitle Text**
    if len(subtitle_text) > 0:
        char_width = sum(font.getbbox(char)[2] - font.getbbox(char)[0] for char in subtitle_text) / len(subtitle_text)
    else:
        char_width = font_size // 2  # Fallback for empty text
    if char_width <= 0:
        # Text made only of zero-width glyphs would otherwise divide by zero below.
        char_width = font_size // 2

    # **Determine Maximum Characters That Fit in Safe Zone**
    estimated_max_chars = max_text_width // char_width

    # **Use the Minimum of User-Defined or Estimated Max Chars**
    # textwrap.wrap() rejects widths below 1, so a zone narrower than one
    # character still wraps at one character per line instead of raising.
    final_max_chars_per_line = max(1, int(min(estimated_max_chars, max_chars_per_line)))

    # **Dynamically wrap text based on max character limit**
    wrapped_lines = []
    for line in subtitle_text.split("\n"):  # Handle existing line breaks
        new_lines = textwrap.wrap(line, width=final_max_chars_per_line)
        if new_lines:  # Only extend if wrapping produced text
            wrapped_lines.extend(new_lines)

    # **Fallback to prevent empty wrapped_lines**
    if not wrapped_lines:
        wrapped_lines = [" "]  # Ensures at least one blank line

    # **Measure Text Size**
    text_sizes = [font.getbbox(line) for line in wrapped_lines]
    text_width = max(size[2] - size[0] for size in text_sizes)  # Width (right - left)
    text_height = text_sizes[0][3] - text_sizes[0][1]  # Height of the first line (bottom - top)
    total_text_height = sum(size[3] - size[1] for size in text_sizes) + (len(wrapped_lines) - 1) * 10  # Extra spacing

    # **Center Text Within Safe Zone**
    text_x = x1 + (x2 - x1 - text_width) // 2  # Horizontally centered
    text_y = y1 + (y2 - y1 - total_text_height) // 2 - 20  # Vertically centered, then raised by 20px

    # **Define Background Box** (clamped to the frame, integer pixel coordinates)
    bg_x1 = int(max(text_x - 15, 0))
    bg_y1 = int(max(text_y - 5, 0))
    bg_x2 = int(min(text_x + text_width + 15, frame_width - 1))
    bg_y2 = int(min(text_y + total_text_height + 15, frame_height - 1))

    # **Create Semi-Transparent Background**
    overlay = frame.copy()
    cv2.rectangle(overlay, (bg_x1, bg_y1), (bg_x2, bg_y2), (0, 0, 0), -1)  # Black background
    cv2.addWeighted(overlay, opacity, frame, 1 - opacity, 0, frame)  # Blend overlay with frame

    # **Render Text Using PIL (for better font handling)**
    frame_pil = Image.fromarray(frame)
    draw = ImageDraw.Draw(frame_pil)

    y_offset = text_y
    for line in wrapped_lines:
        line_width = font.getbbox(line)[2] - font.getbbox(line)[0]  # Measure width
        line_x = x1 + (x2 - x1 - line_width) // 2  # Center per line
        draw.text((line_x, y_offset), line, font=font, fill=(255, 255, 255))  # White text
        y_offset += text_height + 10  # Advance by the first line's height plus spacing

    return np.array(frame_pil)  # Convert back to OpenCV format

### Complete Pipeline for frames batch

In [46]:
import json
from collections import Counter, deque
import numpy as np
import cv2

safe_zone_history = deque(maxlen=10)  # Most recent final safe zones (at most 10), reused to keep placement consistent

def process_frames_batch_3fps_processed(frames, subtitles, process_fps=3, video_fps=30):
    """
    Choose one safe-zone name for a batch of frames:
    - Samples frames at roughly `process_fps` frames per second
    - Runs object detection on the sampled frames
    - Computes a safe zone per sampled frame and picks the most frequent name,
      counting the names stored in `safe_zone_history` as well

    Parameters:
        frames (list): List of frames (NumPy arrays). Must not be empty.
        subtitles (list): List of subtitles corresponding to each frame.
            Currently unused by this function.
        process_fps (int): FPS at which detection will run.
        video_fps (int): Original FPS of the video.

    Returns:
        str: Name of the safe zone chosen for the batch. The name is also
             printed and appended to `safe_zone_history`.
    """

    # ✅ Step 1: Select Frames for Detection
    # The step must be a positive integer: a float FPS or process_fps > video_fps
    # would otherwise hand range() a float or zero step.
    frame_interval = max(1, int(video_fps // process_fps))
    selected_indices = list(range(0, len(frames), frame_interval))

    if not selected_indices:  # Prevent empty selection
        selected_indices = [0]  # Process at least one frame

    selected_frames = [frames[i] for i in selected_indices]

    # ✅ Step 2: Batch Object Detection on Selected Frames
    batch_detections = detect_objects(selected_frames)
    frame_height, frame_width = frames[0].shape[:2]

    # ✅ Load the predefined positions for this resolution (read on every call)
    with open("/content/optimal_subtitle_copied/news_video_subtitle_positions.json", "r") as file:
        pre_positions = json.load(file).get(f"{frame_width}x{frame_height}", {})

    # ✅ Step 3: Collect the Safe Zone Name for Each Sampled Frame
    subtitle_height, margin = get_subtitle_size(frame_height)
    batch_safe_zones = [
        calculate_safe_zone_with_prepositions_test_new(
            frame_width, frame_height, batch_detections[i], pre_positions, subtitle_height, margin
        )[0]  # Keep only the position name
        for i in range(len(selected_frames))
    ]

    # ✅ Step 4: Determine the Most Used Safe Zone
    # Only names take part in the vote; anything else that ended up in the
    # shared history (e.g. coordinate tuples) is ignored.
    history_names = [zone for zone in safe_zone_history if isinstance(zone, str)]
    zone_counts = Counter(batch_safe_zones + history_names)

    if zone_counts:
        final_safe_zone = max(zone_counts, key=zone_counts.get)
    else:
        final_safe_zone = "bottomCenter"  # Default fallback

    print(f"✅ Final Safe Zone: {final_safe_zone}")

    # ✅ Store the final safe zone for future batches
    safe_zone_history.append(final_safe_zone)

    return final_safe_zone

In [54]:
import xml.etree.ElementTree as ET

def print_ttml_with_updated_regions(ttml_file_path, subtitle_data):
    """
    Prints the TTML document with the `region` attribute of each <p> updated.

    For every <p> whose begin time falls inside a subtitle's [start, end]
    interval, the first such subtitle decides the region: a non-None region is
    assigned, and a None region removes any existing `region` attribute.
    <p> elements without a matching subtitle are left untouched. The file on
    disk is not modified.

    Parameters:
        ttml_file_path (str): Path to the TTML file.
        subtitle_data (list): List of subtitles in the format:
            [{"start": start_time, "end": end_time, "text": text, "region": "region_id"}]
    """
    ns = {'ttml': 'http://www.w3.org/ns/ttml'}

    # Register the TTML namespaces so the printed document keeps unprefixed
    # element names and `tts:` attributes regardless of which other cells ran.
    ET.register_namespace("", ns["ttml"])
    ET.register_namespace("tts", "http://www.w3.org/ns/ttml#styling")

    # ✅ Load TTML File
    tree = ET.parse(ttml_file_path)
    root = tree.getroot()

    # ✅ Find All <p> Elements (Subtitles) and Update Regions
    for p in root.findall('.//ttml:p', ns):
        start_time = convert_ttml_time_to_seconds(p.attrib.get("begin", "0.0s"))

        # ✅ Find the first subtitle whose interval contains this begin time
        matched_subtitle = next((sub for sub in subtitle_data if sub["start"] <= start_time <= sub["end"]), None)

        if matched_subtitle:
            if matched_subtitle["region"] is not None:
                p.attrib["region"] = matched_subtitle["region"]
            elif "region" in p.attrib:
                del p.attrib["region"]  # Remove `region` if it's None

    # ✅ Print Updated TTML Content (nothing is written to disk)
    updated_ttml = ET.tostring(root, encoding="utf-8").decode("utf-8")
    print(updated_ttml)

In [56]:
import xml.etree.ElementTree as ET
import json

def generate_ttml_with_styling(ttml_file_path, json_data, frame_width, frame_height):
    """
    Ensures the TTML document has <styling> and <layout> sections, rebuilds the
    layout from the given regions, and prints the result.

    - <head>, <styling> and <layout> are created when missing.
    - A default <style> is added when <styling> contains no <style>.
    - Every existing child of <layout> is removed and one <region> per entry
      of `json_data` is added, positioned in percentages of the frame size.

    The file on disk is not modified.

    Parameters:
        ttml_file_path (str): Path to the TTML file.
        json_data (dict): region name -> {"coordinates": [x1, y1, x2, y2]} in pixels.
        frame_width (int): Width of the video frame.
        frame_height (int): Height of the video frame.

    Returns:
        None (Prints updated TTML with styling and layout)
    """
    ttml_ns = "http://www.w3.org/ns/ttml"
    tts_ns = "http://www.w3.org/ns/ttml#styling"
    xml_ns = "http://www.w3.org/XML/1998/namespace"
    ns = {'ttml': ttml_ns}

    # Register prefixes used when serialising: TTML as the default namespace
    # and `tts` for styling attributes.
    ET.register_namespace("", ttml_ns)
    ET.register_namespace("tts", tts_ns)

    def tts(local_name):
        """Namespace-qualified name of a TTML styling attribute."""
        return f"{{{tts_ns}}}{local_name}"

    # Attributes must be namespace-qualified: a literal "tts:..." name would be
    # written verbatim and refer to a prefix the serialiser never declares.
    xml_id = f"{{{xml_ns}}}id"

    # ✅ Load TTML File
    tree = ET.parse(ttml_file_path)
    root = tree.getroot()

    # ✅ Find or Create the <head> Element
    head_element = root.find('.//ttml:head', ns)
    if head_element is None:
        head_element = ET.Element(f"{{{ttml_ns}}}head")
        root.insert(0, head_element)  # Insert <head> at the top

    # ✅ Find or Create the <styling> Element (for subtitle styles)
    styling_element = head_element.find('.//ttml:styling', ns)
    if styling_element is None:
        styling_element = ET.Element(f"{{{ttml_ns}}}styling")
        head_element.insert(0, styling_element)  # Insert styling before layout

    # ✅ Define Default Subtitle Styling (If Missing)
    default_style = styling_element.find('.//ttml:style', ns)
    if default_style is None:
        default_style = ET.Element(f"{{{ttml_ns}}}style", attrib={
            xml_id: "default",
            tts("color"): "white",
            tts("fontSize"): "36px",
            tts("fontFamily"): "sansSerif",
            tts("backgroundColor"): "black",
            tts("opacity"): "0.8",
            tts("textOutline"): "1px black",
            tts("displayAlign"): "center"
        })
        styling_element.append(default_style)

    # ✅ Find or Create the <layout> Element (for subtitle positions)
    layout_element = head_element.find('.//ttml:layout', ns)
    if layout_element is None:
        layout_element = ET.Element(f"{{{ttml_ns}}}layout")
        head_element.append(layout_element)  # Append layout after styling

    # ✅ Remove every existing child of <layout> before inserting the new regions
    for region in list(layout_element):
        layout_element.remove(region)

    # ✅ Insert Subtitle Regions from JSON (Predefined & Dynamic)
    for region_name, region_data in json_data.items():
        x1, y1, x2, y2 = region_data["coordinates"]

        # Convert absolute pixel values to TTML percentages
        origin_x = (x1 / frame_width) * 100
        origin_y = (y1 / frame_height) * 100
        extent_x = ((x2 - x1) / frame_width) * 100
        extent_y = ((y2 - y1) / frame_height) * 100

        region_element = ET.Element(f"{{{ttml_ns}}}region", attrib={
            tts("origin"): f"{origin_x:.2f}% {origin_y:.2f}%",
            tts("extent"): f"{extent_x:.2f}% {extent_y:.2f}%",
            tts("displayAlign"): "center",
            tts("textAlign"): "center",
            xml_id: region_name
        })

        layout_element.append(region_element)

    # ✅ Print Updated TTML with Styling & Layout
    updated_ttml = ET.tostring(root, encoding="utf-8").decode("utf-8")
    print(updated_ttml)

In [58]:
import xml.etree.ElementTree as ET
import json
from collections import Counter, deque
import numpy as np

safe_zone_history = deque(maxlen=10)  # Stores past safe zones for consistency

def process_frames_batch_3fps(frames, subtitles, ttml_file_path, json_file_path, video_fps=30, process_fps=3):
    """
    Processes a batch of frames:
    - Samples frames at roughly `process_fps` frames per second
    - Runs object detection on the sampled frames
    - Computes one safe zone (x1, y1, x2, y2) for the batch
    - Passes that zone to the TTML update helpers

    Parameters:
        frames (list): List of frames (NumPy arrays). Must not be empty.
        subtitles (list): List of subtitles corresponding to each frame.
            Currently unused by this function.
        ttml_file_path (str): Path to the TTML file.
        json_file_path (str): Path to the JSON file with predefined subtitle positions.
        video_fps (int): Original FPS of the video.
        process_fps (int): FPS at which detection will run.

    Returns:
        None. The TTML update is delegated to `update_ttml_layout` and
        `update_ttml_subtitle_regions`.
    """

    # ✅ Step 1: Select Frames for Detection
    # The step must be a positive integer: a float FPS or process_fps > video_fps
    # would otherwise hand range() a float or zero step.
    frame_interval = max(1, int(video_fps // process_fps))
    selected_indices = list(range(0, len(frames), frame_interval))

    if not selected_indices:  # Prevent empty selection
        selected_indices = [0]  # Process at least one frame

    selected_frames = [frames[i] for i in selected_indices]

    # ✅ Step 2: Batch Object Detection on Selected Frames
    batch_detections = detect_objects(selected_frames)
    frame_height, frame_width = frames[0].shape[:2]

    # ✅ Load the predefined positions for this resolution
    with open(json_file_path, "r") as file:
        pre_positions = json.load(file).get(f"{frame_width}x{frame_height}", {})

    # ✅ Step 3: Compute Safe Zone for Each Sampled Frame (integer coordinates)
    subtitle_height, margin = get_subtitle_size(frame_height)
    batch_safe_zones = [
        tuple(map(int, calculate_safe_zone_with_prepositions_test(
            frame_width, frame_height, batch_detections[i], pre_positions, subtitle_height, margin
        ))) for i in range(len(selected_frames))
    ]

    # ✅ Step 4: Determine a Common Safe Zone for the Batch
    # Only coordinate tuples take part in the vote; anything else that ended up
    # in the shared history (e.g. zone names) is ignored.
    history_zones = [zone for zone in safe_zone_history if isinstance(zone, tuple)]
    combined_safe_zones = batch_safe_zones + history_zones
    most_common_zone, count = Counter(combined_safe_zones).most_common(1)[0]

    # ✅ Choose the Final Safe Zone: the most common zone when it covers at
    # least 60% of the votes, otherwise the per-coordinate mean of this batch.
    final_safe_zone = (
        most_common_zone if count >= len(combined_safe_zones) * 0.6
        else tuple(map(int, np.mean(batch_safe_zones, axis=0)))
    )

    # ✅ Regions computed by this call (a local dict, rebuilt on every call)
    all_dynamic_regions = {}

    # ✅ Store the final safe zone for future batches
    safe_zone_history.append(final_safe_zone)

    # ✅ Name the region after its top-left corner
    dynamic_region_name = f"dynamic_{int(final_safe_zone[0])}_{int(final_safe_zone[1])}"
    all_dynamic_regions[dynamic_region_name] = final_safe_zone

    # ✅ Update TTML Layout to Include the Dynamic Region
    update_ttml_layout(ttml_file_path, json_file_path, frame_width, frame_height, all_dynamic_regions)

    # ✅ Update Subtitle `<p>` Elements
    # NOTE(review): `subtitle_data` is neither a parameter nor a local here, so
    # this line relies on a global of that name existing when the function is
    # called - confirm, or pass the data in explicitly.
    update_ttml_subtitle_regions(ttml_file_path, subtitle_data)

    print(f"Updated TTML with region {dynamic_region_name}")

## Testing the Code

### Integrate with srt file and video fps

In [11]:
!pip install pysrt

Collecting pysrt
  Downloading pysrt-1.1.2.tar.gz (104 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/104.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.4/104.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pysrt
  Building wheel for pysrt (setup.py) ... [?25l[?25hdone
  Created wheel for pysrt: filename=pysrt-1.1.2-py3-none-any.whl size=13443 sha256=cb1897037971453d015b82c6464bc9752a79bfdf150ac9c4a1ba5c2e5f1312ba
  Stored in directory: /root/.cache/pip/wheels/2d/b2/df/ea10959920533975b4a74a25a35e6d79655b63f3006611a99f
Successfully built pysrt
Installing collected packages: pysrt
Successfully installed pysrt-1.1.2


In [7]:
import pysrt
from collections import defaultdict

def parse_srt_file(srt_file):
    """
    Reads and parses an SRT file, pre-indexing subtitles for fast lookup.

    Every whole second from a cue's start second through its end second
    (both inclusive, sub-second parts dropped) is mapped to the cue text.
    When two cues cover the same second, the cue that comes later in the
    file overwrites the earlier one.

    Parameters:
        srt_file (str): Path to the .srt file.

    Returns:
        defaultdict: Keys are integer timestamps (whole seconds, hours
            included); values are subtitle texts with newlines replaced by
            spaces. Missing seconds yield None. Note that indexing a missing
            key with [] inserts it into the dictionary; use .get() to avoid that.
    """
    subs = pysrt.open(srt_file)
    subtitle_dict = defaultdict(lambda: None)  # Default to None for missing seconds

    for sub in subs:
        # Hours must be part of the key; without them a cue at 01:00:05
        # would be stored under second 5 and clash with the cue at 00:00:05.
        start_time = int(sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds)
        end_time = int(sub.end.hours * 3600 + sub.end.minutes * 60 + sub.end.seconds)
        subtitle_text = sub.text.replace("\n", " ")  # Convert newlines to spaces

        # ✅ Store the subtitle under every whole second in the cue's range
        for t in range(start_time, end_time + 1):
            subtitle_dict[t] = subtitle_text

    return subtitle_dict

In [12]:
def get_subtitles_for_frames(frame_times, subtitle_data):
    """
    Looks up the subtitle text shown at each frame timestamp.

    Parameters:
        frame_times (list): Timestamps (in seconds), one per frame.
        subtitle_data (list): Subtitle entries shaped like
            {"start": start_time, "end": end_time, "text": text, "region": region}.

    Returns:
        list: One string per timestamp, in the same order. Each is the text
            of the first entry whose [start, end] range (inclusive) contains
            the timestamp, with newlines replaced by spaces, or "" when no
            entry covers it.
    """
    def _text_at(moment):
        # Lazily scan the entries; next() stops at the first one that matches.
        candidates = (
            entry["text"].replace("\n", " ")
            for entry in subtitle_data
            if entry["start"] <= moment <= entry["end"]
        )
        return next(candidates, "")

    return [_text_at(moment) for moment in frame_times]

In [13]:
def get_video_fps(video_path):
    """
    Extracts FPS from a video using FFmpeg.

    Runs ``ffmpeg -i <video_path>`` and scans its stderr for the first video
    stream line that carries a numeric ``<number> fps`` token.

    Parameters:
        video_path (str): Path to the video file.

    Returns:
        float | int: The parsed FPS, or 30 when no video stream line contains
            a parsable numeric FPS value.
    """
    import re  # local import: this cell does not import `re` itself

    cmd = ["ffmpeg", "-i", video_path]

    # ✅ Capture stdout and stderr explicitly; errors="replace" keeps
    # undecodable bytes in the captured output from raising during decoding.
    result = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",
    )

    # ✅ Parse FPS from FFmpeg output
    for line in result.stderr.split("\n"):
        if "Stream" in line and "Video" in line:
            # Only a plain decimal number directly before "fps" is accepted;
            # anything else (e.g. "1k fps") is skipped instead of crashing float().
            fps_match = re.search(r"(\d+(?:\.\d+)?)\s*fps\b", line)
            if fps_match:
                return float(fps_match.group(1))

    return 30  # Default to 30 FPS if not found

In [14]:
def should_process_frame(frame_time, subtitle_timestamps, fps, tolerance=0.01):
    """
    Tells whether a frame timestamp lies inside any subtitle interval.

    Each (start, end) interval is widened on both sides by a slack equal to
    the smaller of `tolerance` and the duration of two frames at `fps`.

    Returns True if the frame should be processed, False otherwise.
    """
    one_frame = 1 / fps  # duration of a single frame in seconds
    slack = min(tolerance, 2 * one_frame)  # never wider than two frames

    for cue_start, cue_end in subtitle_timestamps:
        if (cue_start - slack) <= frame_time <= (cue_end + slack):
            return True

    return False

In [58]:
## combined srt and ttml timestamp

import pysrt
import xml.etree.ElementTree as ET
import re

def get_subtitle_timestamps(subtitle_file, file_type="auto"):
    """
    Extracts subtitle timestamps from an SRT or TTML file.

    Parameters:
        subtitle_file (str): Path to the subtitle file.
        file_type (str): "srt" for SRT, "ttml" for TTML, or "auto" to detect
            from the file extension (".srt" -> SRT; ".ttml" / ".xml" -> TTML).
            Matching of both the type name and the extension is case-insensitive.

    Returns:
        list of tuples: Each tuple contains (start_time, end_time) in seconds.

    Raises:
        ValueError: If the extension cannot be mapped to a format in "auto"
            mode, or if `file_type` is not one of the supported values.
    """
    file_type = file_type.lower()

    # Auto-detect file type from the extension, ignoring case so that
    # "MOVIE.SRT" is handled the same way as "movie.srt".
    if file_type == "auto":
        lowered_name = str(subtitle_file).lower()
        if lowered_name.endswith(".srt"):
            file_type = "srt"
        elif lowered_name.endswith((".ttml", ".xml")):
            file_type = "ttml"
        else:
            raise ValueError("Unsupported subtitle file format. Use 'srt' or 'ttml'.")

    # Dispatch to the format-specific reader
    if file_type == "srt":
        return get_srt_timestamps(subtitle_file)

    if file_type == "ttml":
        return get_ttml_timestamps(subtitle_file)

    raise ValueError("Invalid file type specified. Use 'srt' or 'ttml'.")

def get_srt_timestamps(srt_file):
    """
    Reads an SRT file and returns the time span of every cue.

    Returns a list of (start_time, end_time) tuples in seconds, in file
    order, with milliseconds kept as the fractional part.
    """
    def _as_seconds(stamp):
        # Flatten the hours / minutes / seconds / milliseconds fields into seconds.
        return stamp.hours * 3600 + stamp.minutes * 60 + stamp.seconds + stamp.milliseconds / 1000

    return [(_as_seconds(cue.start), _as_seconds(cue.end)) for cue in pysrt.open(srt_file)]

def get_ttml_timestamps(ttml_file):
    """
    Reads a TTML file and returns the time span of every <p> element.

    Returns a list of (start_seconds, end_seconds) tuples in document order.
    A missing "begin" or "end" attribute is treated as "0.0s" before conversion.
    """
    namespaces = {'ttml': 'http://www.w3.org/ns/ttml'}
    paragraphs = ET.parse(ttml_file).getroot().findall('.//ttml:p', namespaces)

    return [
        (
            convert_ttml_time_to_seconds(node.attrib.get("begin", "0.0s")),
            convert_ttml_time_to_seconds(node.attrib.get("end", "0.0s")),
        )
        for node in paragraphs
    ]

# def convert_ttml_time_to_seconds(ttml_time):
#     """
#     Converts TTML time format (HH:MM:SS.mmm or MM:SS.mmm or SS.mmm or SS.mmm's') to seconds.

#     Parameters:
#         ttml_time (str): TTML-formatted time.

#     Returns:
#         float: Time in seconds.
#     """
#     ttml_time = ttml_time.rstrip('s')  # Remove trailing 's' if present
#     parts = ttml_time.split(":")

#     if len(parts) == 3:  # HH:MM:SS.mmm
#         hours, minutes, seconds = map(float, parts)
#     elif len(parts) == 2:  # MM:SS.mmm
#         hours, minutes, seconds = 0, *map(float, parts)
#     else:  # SS.mmm
#         hours, minutes, seconds = 0, 0, float(parts[0])

#     return hours * 3600 + minutes * 60 + seconds

def convert_ttml_time_to_seconds(ttml_time):
    """
    Converts a TTML time expression to seconds.

    Supported forms (a ',' decimal separator is accepted in place of '.'):
        * clock time:  HH:MM:SS[.fff] or MM:SS[.fff]
        * clock time with a trailing frame count, HH:MM:SS:FF - the frame
          count is ignored because the frame rate is not known here
        * offset time: <number>h, <number>m, <number>s, <number>ms
        * a bare number, interpreted as seconds

    Parameters:
        ttml_time (str): TTML-formatted time.

    Returns:
        float: Time in seconds.

    Raises:
        ValueError: If the expression matches none of the supported forms
            (including frame- or tick-based offsets such as "10f" / "10t",
            which cannot be converted without a frame/tick rate).
    """

    def _fraction(digits):
        # "5" -> 0.5, "066" -> 0.066: the digits are a decimal fraction,
        # not a count of milliseconds.
        return int(digits) / 10 ** len(digits) if digits else 0.0

    text = ttml_time.strip().replace(',', '.')

    # ✅ Offset time or bare number: "12.5s", "500ms", "2h", "90m", "12.5".
    # "ms" is listed before "m" so that it is not read as minutes.
    offset = re.fullmatch(r"(\d+)(?:\.(\d+))?(h|ms|m|s)?", text)
    if offset:
        whole, fraction, metric = offset.groups()
        value = int(whole) + _fraction(fraction)
        if metric == "h":
            return value * 3600.0
        if metric == "m":
            return value * 60.0
        if metric == "ms":
            return value / 1000.0
        return float(value)

    # ✅ Clock time: the minutes field is mandatory here, so a two-part value
    # is MM:SS (not HH:SS). An optional trailing ":FF" frame count is dropped.
    clock = re.fullmatch(r"(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?(?::\d+)?", text)
    if clock:
        hours, minutes, seconds, fraction = clock.groups()
        return int(hours or 0) * 3600 + int(minutes) * 60 + int(seconds) + _fraction(fraction)

    raise ValueError(f"Invalid TTML time format: {ttml_time}")

In [16]:
import xml.etree.ElementTree as ET
import pysrt
import os

def parse_subtitle_file(file_path):
    """
    Parses either an SRT or TTML subtitle file and extracts subtitles.

    The format is chosen from the file extension (case-insensitive):
    ".srt" is read with pysrt; ".ttml" and ".xml" are read as TTML, matching
    the extensions that get_subtitle_timestamps treats as TTML.

    Parameters:
        file_path (str): Path to the subtitle file.

    Returns:
        list: List of subtitles in the format:
            [
                {"start": start_time, "end": end_time, "text": "subtitle text", "region": "region_id"}
            ]
            Times are in seconds. "region" is always None for SRT input and
            is the <p> element's "region" attribute (or None) for TTML input.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    extension = os.path.splitext(file_path)[-1].lower()
    subtitle_data = []

    if extension == ".srt":
        for sub in pysrt.open(file_path):
            start_time = (
                sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds + sub.start.milliseconds / 1000
            )
            end_time = (
                sub.end.hours * 3600 + sub.end.minutes * 60 + sub.end.seconds + sub.end.milliseconds / 1000
            )

            subtitle_data.append({
                "start": start_time,
                "end": end_time,
                "text": sub.text.replace("\n", " "),  # Convert newlines to spaces
                "region": None  # SRT doesn't support regions
            })

    elif extension in (".ttml", ".xml"):
        # ✅ Register TTML namespace prefixes. This is a process-wide
        # ElementTree setting that affects how these namespaces are
        # serialized by any later ElementTree write.
        ET.register_namespace('', "http://www.w3.org/ns/ttml")  # Default TTML namespace
        ET.register_namespace('ttp', "http://www.w3.org/ns/ttml#parameter")
        ET.register_namespace('tts', "http://www.w3.org/ns/ttml#styling")
        ET.register_namespace('ttm', "http://www.w3.org/ns/ttml#metadata")

        # ✅ Parse TTML File
        root = ET.parse(file_path).getroot()
        ns = {'ttml': 'http://www.w3.org/ns/ttml'}

        # ✅ Extract Subtitle Data
        for p in root.findall('.//ttml:p', ns):
            subtitle_data.append({
                "start": convert_ttml_time_to_seconds(p.attrib.get("begin", "0.0s")),
                "end": convert_ttml_time_to_seconds(p.attrib.get("end", "0.0s")),
                # Text fragments (e.g. on either side of <br/>) are joined with a space.
                "text": " ".join(p.itertext()).strip(),
                "region": p.attrib.get("region", None)
            })

    else:
        raise ValueError("Unsupported subtitle format. Only SRT and TTML are supported.")

    return subtitle_data

In [59]:
## current main test

import os
import cv2
import subprocess

# ✅ Define file paths in /content/
video_input_path = "/content/optimal_subtitle_copied/test_video_3.mp4"
final_video_path = "/content/optimal_subtitle_copied/test_video_with_audio.mp4"
file_path = "/content/audio1_2.ttml"
# json_path = "/content/optimal_subtitle_copied/subtitle_positions_ttml.json"

subtitle_data_updated = []

# ✅ Extract FPS dynamically
fps = get_video_fps(video_input_path)
print(f"✅ Corrected FPS: {fps}")

# ✅ Load subtitle cues; each cue's "region" entry is filled in below
subtitle_data = parse_subtitle_file(file_path)

# ✅ Load Subtitle Timestamps
subtitle_timestamps = get_subtitle_timestamps(file_path)

# ✅ Open Video File
cap = cv2.VideoCapture(video_input_path)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(f"Frame Width: {frame_width}, Frame Height: {frame_height}")

# ✅ Initialize Video Writer
# fourcc = cv2.VideoWriter_fourcc(*"mp4v")
# out = cv2.VideoWriter(final_video_path, fourcc, fps, (frame_width, frame_height))

# ✅ Process Video Based on Subtitle Timeframes
# The loop walks the cues in list order, so it relies on subtitle_data being
# in chronological order - TODO confirm for the subtitle files in use.
# NOTE(review): the recorded output of this cell shows every region as None,
# with cue times starting near 9789 s; presumably the cue times carry an
# offset relative to the video timeline - confirm.
frame_buffer = []
timestamp_buffer = []
subtitle_index = 0  # Index of the cue currently being collected
frame_number = 0

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # Exit if no more frames

        frame_time = frame_number / fps  # Convert frame number to timestamp
        frame_number += 1

        # ✅ Close out every cue that ended before this frame. The index
        # advances even when no frame was collected for a cue (for example a
        # cue shorter than one frame); otherwise the loop would stay stuck on
        # that cue and never reach any later one.
        while subtitle_index < len(subtitle_data) and frame_time > subtitle_data[subtitle_index]["end"]:
            current_subtitle = subtitle_data[subtitle_index]
            if frame_buffer:
                # The batch function's return value is stored as the cue's region.
                processed_frames = process_frames_batch_3fps_processed(frame_buffer, [current_subtitle])
                current_subtitle["region"] = processed_frames
                frame_buffer.clear()
                timestamp_buffer.clear()
            subtitle_index += 1

        if subtitle_index >= len(subtitle_data):
            break  # Every cue has been handled; the remaining frames are not needed

        # ✅ Collect frames only within the current subtitle's timestamp range.
        # Because this runs after the close-out above, the frame that ended
        # the previous cue can still be collected for the next one.
        current_subtitle = subtitle_data[subtitle_index]
        if current_subtitle["start"] <= frame_time <= current_subtitle["end"]:
            frame_buffer.append(frame)
            timestamp_buffer.append(frame_time)
finally:
    # Release the capture even if batch processing raised.
    cap.release()

# ✅ The video ended while a cue was still being collected: process that
# batch and store its region too, so the last cue is not left without one.
if frame_buffer and subtitle_index < len(subtitle_data):
    current_subtitle = subtitle_data[subtitle_index]
    processed_frames = process_frames_batch_3fps_processed(frame_buffer, [current_subtitle])
    current_subtitle["region"] = processed_frames
    print(processed_frames)
    frame_buffer.clear()
    timestamp_buffer.clear()

print(subtitle_data)
print_ttml_with_updated_regions(file_path, subtitle_data)
layout = get_used_safe_zones()
generate_ttml_with_styling(file_path, layout, frame_width, frame_height)
# out.release()
# print(f"✅ Video processing completed: {final_video_path}")

✅ Corrected FPS: 29.97
Frame Width: 1280, Frame Height: 720
[{'start': 9789.066, 'end': 9789.906, 'text': 'Hi. My name is.', 'region': None}, {'start': 9789.99, 'end': 9792.384, 'text': "I'm, uh, hyper polyglot and whether you mean to...", 'region': None}, {'start': 9792.468, 'end': 9794.316, 'text': '...speak, like, ten or more languages.', 'region': None}, {'start': 9794.484, 'end': 9796.836, 'text': 'I think that the question like, oh, how many years do you speak?', 'region': None}, {'start': 9796.92, 'end': 9799.692, 'text': "It's a very difficult one to answer because I don't know how...", 'region': None}, {'start': 9799.776, 'end': 9802.8, 'text': '...to define the difference between, uh, you can speak in a language...', 'region': None}, {'start': 9802.884, 'end': 9805.53, 'text': '...or you really understand the communicate in the language.', 'region': None}, {'start': 9805.614, 'end': 9809.1, 'text': 'So, for example, if you ask me, how many languages can I count from...', 'reg

In [19]:
## process frame in time frame only
import os
import cv2
import pysrt
import torch
import subprocess

# ✅ Define file paths in /content/
video_input_path = "/content/optimal_subtitle_copied/test_video_3.mp4"
final_video_path = "/content/optimal_subtitle_copied/test_video_with_audio.mp4"
# file_path = "/content/optimal_subtitle_copied/SRT_file/test_long_video_17_eng.srt"
file_path = "/content/optimal_subtitle_copied/TTML_file/test_video_3_eng.ttml"
json_path = "/content/optimal_subtitle_copied/subtitle_positions_ttml.json"

# ✅ Create temporary paths inside /content/
audio_path = "/content/temp_audio.aac"
output_video_path = "/content/temp_video_no_audio.mp4"

# ✅ Extract FPS dynamically
fps = get_video_fps(video_input_path)
print(f"✅ Corrected FPS: {fps}")

# ✅ Extract audio from the original video
# os.system(f"ffmpeg -i {video_input_path} -q:a 0 -map a {audio_path}")

# ✅ Load Pre-indexed Subtitles
subtitle_data = parse_subtitle_file(file_path)

# ✅ Load Subtitle Timestamps
subtitle_timestamps = get_subtitle_timestamps(file_path)

# ✅ Open Video File
cap = cv2.VideoCapture(video_input_path)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(f"Frame Width: {frame_width}, Frame Height: {frame_height}")

# ✅ Initialize Video Writer
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

# ✅ Process Video with Batching
# Frames that fall inside a subtitle interval are buffered and handed to the
# batch function either when the buffer holds batch_size frames or when the
# video ends, whichever comes first.
# NOTE(review): here the batch function receives a list of subtitle strings
# and its result is iterated and written frame by frame, whereas the other
# driver cell passes subtitle dicts and stores the result as a region; the
# recorded output also shows tuples such as (14, 522, 1266, 558) being
# printed. Confirm which contract process_frames_batch_3fps_processed
# currently has.
batch_size = 128
frame_buffer = []
timestamp_buffer = []
frame_number = 0

try:
    stream_open = cap.isOpened()
    while stream_open:
        ret, frame = cap.read()
        stream_open = ret  # a failed read means there are no more frames

        if ret:
            frame_time = frame_number / fps  # Convert frame number to timestamp

            # ✅ Only buffer frames that fall within subtitle timestamps
            if should_process_frame(frame_time, subtitle_timestamps, fps):
                frame_buffer.append(frame)
                timestamp_buffer.append(frame_time)

            frame_number += 1

        # ✅ Single flush point: a full batch, or the leftover frames at end of video
        if frame_buffer and (len(frame_buffer) == batch_size or not ret):
            subtitles = get_subtitles_for_frames(timestamp_buffer, subtitle_data)
            processed_frames = process_frames_batch_3fps_processed(frame_buffer, subtitles)

            for processed_frame in processed_frames:
                out.write(processed_frame)

            frame_buffer.clear()
            timestamp_buffer.clear()
finally:
    # Release both handles even if processing raised, so the capture is
    # closed and the writer is not left open on the output file.
    cap.release()
    out.release()

# print(f"✅ Video without audio saved at: {output_video_path}")

# # ✅ Check if final_video_path exists
# if os.path.exists(final_video_path):
#     os.remove(final_video_path)  # ✅ Delete existing file

# # ✅ Merge Video & Audio with Correct FPS & Sync Fixes
# os.system(f"ffmpeg -i {output_video_path} -i {audio_path} -map 0:v:0 -map 1:a:0 -c:v libx264 -preset fast -crf 23 -c:a copy -vsync vfr {final_video_path}")

# # ✅ Clean up the temporary files
# os.remove(audio_path)
# os.remove(output_video_path)

# print(f"✅ Final video with audio saved at: {final_video_path}")

✅ Corrected FPS: 29.97
Frame Width: 1280, Frame Height: 720

0: 384x640 2 faces, 42.2ms
1: 384x640 2 faces, 42.2ms
2: 384x640 2 faces, 42.2ms
3: 384x640 2 faces, 42.2ms
4: 384x640 2 faces, 1 text, 42.2ms
5: 384x640 2 faces, 1 text, 42.2ms
6: 384x640 2 faces, 1 text, 42.2ms
7: 384x640 2 faces, 1 text, 42.2ms
8: 384x640 2 faces, 42.2ms
9: 384x640 2 faces, 1 logo, 1 news-ticker, 1 text, 42.2ms
10: 384x640 2 faces, 2 logos, 2 news-tickers, 2 texts, 42.2ms
11: 384x640 2 faces, 2 logos, 1 news-ticker, 2 texts, 42.2ms
12: 384x640 2 faces, 2 logos, 1 news-ticker, 2 texts, 42.2ms
Speed: 5.1ms preprocess, 42.2ms inference, 43.6ms postprocess per image at shape (13, 3, 384, 640)
(14, 522, 1266, 558)

0: 384x640 2 faces, 2 logos, 1 news-ticker, 2 texts, 3.9ms
1: 384x640 2 faces, 2 logos, 1 news-ticker, 1 text, 3.9ms
2: 384x640 2 faces, 2 logos, 1 news-ticker, 3 texts, 3.9ms
3: 384x640 2 faces, 2 logos, 1 news-ticker, 2 texts, 3.9ms
4: 384x640 2 faces, 2 logos, 1 news-ticker, 1 text, 3.9ms
5: 384x6