In [None]:
# transcribe.py
from faster_whisper import WhisperModel
from pathlib import Path
import torch, argparse, math

def ts(sec: float) -> str:
    """Format a duration in seconds as an SRT timestamp (HH:MM:SS,mmm).

    The value is rounded to the nearest millisecond *before* being split
    into fields, so binary floating-point error cannot drop a millisecond
    (e.g. ``1.001`` renders as ``,001``) and a value such as ``59.9996``
    carries over cleanly to ``00:01:00,000``.

    Args:
        sec: Offset in seconds. Negative values are clamped to zero because
            the HH:MM:SS,mmm layout has no representation for them.

    Returns:
        The timestamp string with zero-padded fields.
    """
    # Do all field arithmetic on an integer millisecond count.
    total_ms = max(0, int(round(float(sec) * 1000)))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

def transcribe(audio, model_size="large-v3", lang="fr"):
    """Transcribe *audio* and write a .txt and a .srt file beside it.

    The output files share the input's directory and name, with only the
    final extension replaced (``talk.v2.mp4`` -> ``talk.v2.txt`` /
    ``talk.v2.srt``).

    Args:
        audio: Path of the media file; passed to ``WhisperModel.transcribe``
            and used to derive the output file names.
        model_size: Model identifier passed to ``WhisperModel``.
        lang: Language code passed as ``language=`` to the transcription.
    """
    model = WhisperModel(
        model_size,
        device="cuda" if torch.cuda.is_available() else "cpu",
        # "auto" leaves the choice of compute type to the backend.
        compute_type="auto",
    )
    segments, _ = model.transcribe(audio, language=lang)

    # Replace only the last suffix. Stripping the suffix first and then
    # calling with_suffix() again would eat a second dotted component.
    source = Path(audio)
    txt_path = source.with_suffix(".txt")
    srt_path = source.with_suffix(".srt")

    text_out, srt_out = [], []
    for index, segment in enumerate(segments, start=1):
        text_out.append(segment.text)
        # SRT cue: 1-based counter, time range, text; the trailing newline
        # plus the join below leaves a blank line between cues.
        srt_out.append(
            f"{index}\n{ts(segment.start)} --> {ts(segment.end)}\n"
            f"{segment.text.strip()}\n"
        )

    txt_path.write_text("\n".join(text_out), encoding="utf-8")
    srt_path.write_text("\n".join(srt_out), encoding="utf-8")
    # Report the paths that were actually written.
    print(f"✓ Wrote {txt_path} and {srt_path}")

# Transcribe data/sample.mp4 with the large-v3 model and language code "fr".
transcribe("data/sample.mp4", model_size="large-v3", lang="fr")

# if __name__ == "__main__":
#     ap = argparse.ArgumentParser()
#     ap.add_argument("audio", help="audio/video file (wav, mp3, mp4, etc.)")
#     ap.add_argument("--model", default="large-v3", help="tiny|base|small|..."
#                     " or large-v3 for best French accuracy")
#     args = ap.parse_args()
#     transcribe(args.audio, args.model)

In [None]:
# transcribe.py
from faster_whisper import WhisperModel
from pathlib import Path

def ts(sec: float) -> str:
    """Format a duration in seconds as an SRT timestamp (HH:MM:SS,mmm).

    The value is rounded to the nearest millisecond *before* being split
    into fields, so binary floating-point error cannot drop a millisecond
    (e.g. ``1.001`` renders as ``,001``) and a value such as ``59.9996``
    carries over cleanly to ``00:01:00,000``.

    Args:
        sec: Offset in seconds. Negative values are clamped to zero because
            the HH:MM:SS,mmm layout has no representation for them.

    Returns:
        The timestamp string with zero-padded fields.
    """
    # Do all field arithmetic on an integer millisecond count.
    total_ms = max(0, int(round(float(sec) * 1000)))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

def transcribe(audio, model_size="large-v3", lang="fr"):
    """Transcribe *audio* and write a .txt and a .srt file beside it.

    The output files share the input's directory and name, with only the
    final extension replaced (``talk.v2.mp4`` -> ``talk.v2.txt`` /
    ``talk.v2.srt``).

    Args:
        audio: Path of the media file; passed to ``WhisperModel.transcribe``
            and used to derive the output file names.
        model_size: Model identifier passed to ``WhisperModel``.
        lang: Language code passed as ``language=`` to the transcription.
    """
    # The imports above this definition do not include torch, so import it
    # here to keep the function usable without relying on earlier state.
    import torch

    model = WhisperModel(
        model_size,
        device="cuda" if torch.cuda.is_available() else "cpu",
        # "auto" leaves the choice of compute type to the backend.
        compute_type="auto",
    )
    segments, _ = model.transcribe(audio, language=lang)

    # Replace only the last suffix. Stripping the suffix first and then
    # calling with_suffix() again would eat a second dotted component.
    source = Path(audio)
    txt_path = source.with_suffix(".txt")
    srt_path = source.with_suffix(".srt")

    text_out, srt_out = [], []
    for index, segment in enumerate(segments, start=1):
        text_out.append(segment.text)
        # SRT cue: 1-based counter, time range, text; the trailing newline
        # plus the join below leaves a blank line between cues.
        srt_out.append(
            f"{index}\n{ts(segment.start)} --> {ts(segment.end)}\n"
            f"{segment.text.strip()}\n"
        )

    txt_path.write_text("\n".join(text_out), encoding="utf-8")
    srt_path.write_text("\n".join(srt_out), encoding="utf-8")
    # Report the paths that were actually written.
    print(f"✓ Wrote {txt_path} and {srt_path}")

# Transcribe data/sample.mp4 with the large-v3 model and language code "fr".
transcribe("data/sample.mp4", model_size="large-v3", lang="fr")