In [1]:
#!pip install scenedetect
#!pip install opencv-python

In [2]:
from scenedetect import open_video, SceneManager
from scenedetect.detectors import ContentDetector

# Video file to analyse; later cells reuse `path` to grab frames and to name the SRT output.
path = "input.mp4"

video = open_video(path)
manager = SceneManager()
# Content-based cut detection; 30.0 is the threshold passed to ContentDetector.
manager.add_detector(ContentDetector(threshold=30.0))
manager.detect_scenes(video)

# start_in_scene=True: when no cut is detected, the whole video is reported as a
# single scene instead of an empty list, so a one-shot video still gets a caption.
scenes = manager.get_scene_list(start_in_scene=True)

In [3]:
from transformers import pipeline

# Image-captioning pipeline (BLIP base checkpoint); called later with one PIL image per scene.
captioner = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use mps:0


In [4]:
import cv2
from PIL import Image

def extract_frame(input_path, frame_number):
    """Read a single frame from a video and return it as an RGB PIL image.

    Args:
        input_path: Path of the video file, passed to ``cv2.VideoCapture``.
        frame_number: Index of the frame to seek to before reading.

    Returns:
        The decoded frame as a ``PIL.Image`` in RGB channel order.

    Raises:
        RuntimeError: If the video cannot be opened or the frame cannot be read.
    """
    cap = cv2.VideoCapture(input_path)
    try:
        # Fail early with a specific message instead of a misleading "cannot read frame".
        if not cap.isOpened():
            raise RuntimeError(f"Impossible d'ouvrir la vidéo {input_path!r}.")

        # Jump directly to the requested frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ok, frame_bgr = cap.read()
    finally:
        # Always free the capture handle, even if seeking or decoding raises.
        cap.release()

    if not ok:
        raise RuntimeError(f"Impossible de lire la frame {frame_number}.")

    # OpenCV reads frames as BGR -> convert to RGB
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

    # Wrap in a PIL image, which is what the captioning pipeline is fed downstream
    return Image.fromarray(frame_rgb)

In [5]:
import re

def dedupe_tail(text: str) -> str:
    """Collapse a phrase that is repeated back-to-back at the end of ``text``.

    Two passes are applied in order, each on the stripped string:

    1. ``X X ... X`` at the end (copies separated by a single space) -> ``X``
    2. ``X - X ... - X`` at the end (copies separated by a hyphen with
       optional surrounding whitespace) -> ``X``

    Text without such a trailing repetition is returned stripped but otherwise unchanged.
    """
    # Pass 1: trailing copies separated by a single space.
    space_repeat = re.compile(r'(\b.+?\b)( \1)+$')
    # Pass 2: trailing copies separated by " - " (whitespace around the hyphen is optional).
    dash_repeat = re.compile(r'(\b[\w ]+\b)(\s*-\s*\1)+$')

    collapsed = space_repeat.sub(r'\1', text.strip())
    return dash_repeat.sub(r'\1', collapsed.strip())


In [6]:
out = []

# Build one SRT cue per detected scene; cue numbers start at 1.
for i, (start, end) in enumerate(scenes, 1):
    # Caption the frame halfway through the scene (integer division keeps an exact int index).
    middle = (start.frame_num + end.frame_num) // 2

    f_image = extract_frame(path, middle)

    text = captioner(f_image)[0]['generated_text']

    # Drop phrases the model repeats at the end of the caption.
    text = dedupe_tail(text)

    # SRT timestamps use a comma before the milliseconds (HH:MM:SS,mmm),
    # whereas get_timecode() formats them with a dot.
    start_tc = start.get_timecode().replace(".", ",")
    end_tc = end.get_timecode().replace(".", ",")

    out.append(str(i))
    out.append(f"{start_tc} --> {end_tc}")
    out.append(f"({text})")
    out.append("")  # blank line between cues

descriptive_captions = "\n".join(out)

In [7]:
# Subtitle file sits next to the video: the video path with "_dc.srt" appended.
srt_path = f"{path}_dc.srt"

# Write the assembled captions to the SRT file
with open(srt_path, mode="w", encoding="utf-8") as srt_file:
    srt_file.write(descriptive_captions)

### TODO: Closed Captions (CC)