### Descriptive Captions

In [None]:
#!pip install scenedetect
#!pip install opencv-python

In [None]:
from scenedetect import open_video, SceneManager
from scenedetect.detectors import ContentDetector

# Video analysed by the cells of this notebook.
path = "src/input.mp4"

# Detect shot boundaries with PySceneDetect's content-aware detector.
video = open_video(path)
manager = SceneManager()
manager.add_detector(ContentDetector(threshold=30.0))
manager.detect_scenes(video)

# start_in_scene=True: when no cut is detected, return one scene spanning the
# whole video instead of an empty list, so a single-shot clip still yields a
# scene to work with.
scenes = manager.get_scene_list(start_in_scene=True)

In [None]:
from transformers import pipeline

# Hugging Face "image-to-text" pipeline backed by the BLIP base captioning
# checkpoint; called later with one image to produce its caption.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

In [None]:
import cv2
from PIL import Image

def extract_frame(input_path, frame_number):
    """Decode a single frame of a video and return it as an RGB PIL image.

    Args:
        input_path: Path of the video file, handed to ``cv2.VideoCapture``.
        frame_number: Index of the frame to decode, used as the value of
            ``cv2.CAP_PROP_POS_FRAMES``.

    Returns:
        The decoded frame as a ``PIL.Image.Image`` in RGB channel order.

    Raises:
        RuntimeError: If the video cannot be opened or the frame cannot be
            read.
    """
    cap = cv2.VideoCapture(input_path)
    try:
        # Fail early with a precise message instead of a misleading
        # "cannot read frame" error when the file itself is unusable.
        if not cap.isOpened():
            raise RuntimeError(f"Impossible d'ouvrir la vidéo {input_path}.")

        # Jump straight to the requested frame.
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ok, frame_bgr = cap.read()
    finally:
        # Always release the capture handle, even if seeking/decoding raises.
        cap.release()

    if not ok:
        raise RuntimeError(f"Impossible de lire la frame {frame_number}.")

    # OpenCV decodes to BGR; convert to RGB before building the PIL image.
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

    # PIL image, the form passed to the captioning pipeline in this notebook.
    return Image.fromarray(frame_rgb)

In [None]:
import re

def dedupe_tail(text: str) -> str:
    """Collapse a phrase that is repeated at the very end of ``text``.

    Two passes are applied on the stripped text:
      1. a trailing phrase repeated with single spaces ("x y x y" -> "x y");
      2. a trailing phrase repeated around dashes ("x - x" -> "x").

    Returns the cleaned string; text without a repeated tail is only stripped.
    """
    space_repeated_tail = r'(\b.+?\b)( \1)+$'
    dash_repeated_tail = r'(\b[\w ]+\b)(\s*-\s*\1)+$'

    without_space_repeat = re.sub(space_repeated_tail, r'\1', text.strip())
    return re.sub(dash_repeated_tail, r'\1', without_space_repeat.strip())


In [None]:
out = []

for i, (start, end) in enumerate(scenes, start=1):
    # Caption the frame located halfway between the scene's boundaries.
    middle = int((start.frame_num + end.frame_num) / 2)
    caption = dedupe_tail(
        captioner(extract_frame(path, middle))[0]['generated_text']
    )

    # One subtitle-style block: index, time range, parenthesised caption.
    out.extend([
        str(i),
        f"{start.get_timecode()} --> {end.get_timecode()}",
        f"({caption})",
        "",  # blank line between blocks
    ])

descriptive_captions = "\n".join(out)

### Subtitles

In [None]:
import whisper

# Load the Whisper "turbo" model and transcribe the input video file.
# `result` is later read through its "segments" entry to build the subtitles.
model = whisper.load_model("turbo")
result = model.transcribe(path)

In [None]:
import math
import textwrap

def _format_ts_srt(t: float) -> str:
    """Formate un timestamp en HH:MM:SS,mmm (SRT)."""
    ms = int(round((t % 1) * 1000))
    secs = int(math.floor(t))
    if ms == 1000:  # évite 99,1000 -> 100,000
        secs += 1
        ms = 0
    h = secs // 3600
    m = (secs % 3600) // 60
    s = secs % 60
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

def to_srt(segments, max_line_length: int | None = None) -> str:
    """Convertit une liste de segments Whisper en sous-titres SRT."""
    out, idx = [], 1
    for seg in segments:
        text = (seg.get("text") or "").strip()
        if not text:
            continue
        if max_line_length:
            text = "\n".join(textwrap.wrap(text, width=max_line_length))
        out.append(str(idx))
        out.append(f"{_format_ts_srt(seg['start'])} --> {_format_ts_srt(seg['end'])}")
        out.append(text)
        out.append("")  # ligne vide entre blocs
        idx += 1
    return "\n".join(out)

# Render the transcription's "segments" entry as SRT text (no line wrapping).
srt_text = to_srt(result.get("segments"))

In [None]:
# !pip install srt

In [None]:
import srt
from datetime import timedelta

# >>> Source priority used when concatenating texts (lower value comes first)
SOURCE_ORDER = {"image": 0, "son": 1}

def parse_srt_text(text: str, source: str):
    """Parse SRT text into a list of subtitles tagged with their origin.

    Every parsed subtitle receives a ``_source`` attribute set to ``source``.
    A falsy ``text`` (``None`` or ``""``) is parsed as an empty document.
    """
    tagged = []
    for subtitle in srt.parse(text or ""):
        subtitle._source = source
        tagged.append(subtitle)
    return tagged

def active_on_segment(sub, seg_start, seg_end):
    """Tell whether ``sub`` overlaps the interval ``(seg_start, seg_end)``.

    The overlap is strict: a subtitle that merely touches one of the
    boundaries is not considered active (no tolerance).
    """
    starts_too_late = sub.start >= seg_end
    ends_too_early = sub.end <= seg_start
    return not (starts_too_late or ends_too_early)

def fuse_srt_strings(descriptive_captions: str, srt_text: str) -> str:
    """Merge image captions and speech subtitles into one SRT document.

    The timeline is cut at every start and end time found in either input.
    Each resulting interval that overlaps at least one subtitle becomes one
    output cue whose text is the overlapping subtitles' texts joined by
    newlines, ordered by source priority (``SOURCE_ORDER``) and then by start
    time.

    Args:
        descriptive_captions: SRT text of the image captions (tagged
            ``"image"``).
        srt_text: SRT text of the speech subtitles (tagged ``"son"``).

    Returns:
        The fused SRT text produced by ``srt.compose``, or ``""`` when
        neither input contains a subtitle.
    """
    all_subs = (
        parse_srt_text(descriptive_captions, "image")
        + parse_srt_text(srt_text, "son")
    )
    if not all_subs:
        return ""

    # 1) Break points: every distinct start/end time, in ascending order.
    cuts = sorted({t for it in all_subs for t in (it.start, it.end)})

    # 2) Consecutive intervals [t_i, t_{i+1}). The values are distinct and
    #    sorted, so every pair is a non-empty interval.
    fused = []
    for seg_start, seg_end in zip(cuts, cuts[1:]):
        # 3) Collect the subtitles overlapping this interval.
        actives = [
            it for it in all_subs
            if active_on_segment(it, seg_start, seg_end)
        ]

        # Source priority first, then start time. list.sort is stable, so
        # subtitles that tie on both keep their input order.
        actives.sort(
            key=lambda it: (
                SOURCE_ORDER.get(getattr(it, "_source", ""), 99),
                it.start,
            )
        )

        texts = [
            it.content.strip()
            for it in actives
            if it.content and it.content.strip()
        ]
        if not texts:
            continue  # nothing active, or only blank texts, on this interval

        fused.append(
            srt.Subtitle(
                index=len(fused) + 1,
                start=seg_start,
                end=seg_end,
                content="\n".join(texts),
            )
        )

    return srt.compose(fused)

In [None]:
from pathlib import Path

# Merge the caption track and the speech track into one SRT document.
fusion = fuse_srt_strings(descriptive_captions, srt_text)

# The output sits next to the input video and is named "<stem>_cc.srt".
OUT_FILE = Path(path).with_name(f"{Path(path).stem}_cc.srt")

OUT_FILE.write_text(fusion, encoding="utf-8")