# Subtitle Generation Notebook
This notebook provides a step-by-step guide to generating subtitles for videos using various tools and libraries. Follow the instructions below to create subtitles for your video content.

In [None]:
import json
import math
import os
import subprocess
import uuid
import wave
import contextlib
from pathlib import Path
from typing import Dict, List, Any

try:
    import torch
except ImportError:
    torch = None

try:
    import whisper
except ImportError:
    whisper = None

try:
    from transformers import pipeline as hf_asr_pipeline
except ImportError:
    hf_asr_pipeline = None

try:
    from transformers import MarianMTModel, MarianTokenizer
except ImportError:
    MarianMTModel = None
    MarianTokenizer = None

try:
    import nemo.collections.asr as nemo_asr
except ImportError:
    nemo_asr = None

LANG_CODE_MAP = {
    'hindi': 'hi', 'marathi': 'mr', 'spanish': 'es', 'french': 'fr', 'german': 'de',
    'japanese': 'ja', 'chinese': 'zh', 'arabic': 'ar', 'english': 'en', 'tamil': 'ta',
    'telugu': 'te', 'bengali': 'bn', 'kannada': 'kn', 'gujarati': 'gu', 'punjabi': 'pa'
}


In [2]:
def ensure_dir(path: Path) -> Path:
    path.mkdir(parents=True, exist_ok=True)
    return path


def run_ffmpeg(command: List[str]) -> None:
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)
    if completed.returncode != 0:
        raise RuntimeError(completed.stderr.decode('utf-8', errors='ignore'))


def extract_audio_ffmpeg(video_path: Path, output_dir: Path, sample_rate: int = 16000) -> Path:
    ensure_dir(output_dir)
    audio_path = output_dir / f'{video_path.stem}_{uuid.uuid4().hex[:8]}.wav'
    command = [
        'ffmpeg', '-y', '-i', str(video_path),
        '-ac', '1', '-ar', str(sample_rate), str(audio_path)
    ]
    run_ffmpeg(command)
    return audio_path


def get_audio_duration(audio_path: Path) -> float:
    with contextlib.closing(wave.open(str(audio_path), 'rb')) as wf:
        frames = wf.getnframes()
        rate = wf.getframerate()
    return frames / float(rate)


def format_timestamp(seconds: float) -> str:
    seconds = max(seconds, 0.0)
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    ms = int(round((seconds - int(seconds)) * 1000))
    return f'{h:02}:{m:02}:{s:02},{ms:03}'


def segments_to_srt(segments: List[Dict[str, Any]], output_path: Path) -> None:
    ensure_dir(output_path.parent)
    with open(output_path, 'w', encoding='utf-8') as handle:
        for idx, segment in enumerate(segments, start=1):
            start_ts = format_timestamp(segment.get('start', 0.0))
            end_ts = format_timestamp(segment.get('end', segment.get('start', 0.0)))
            text = segment.get('text', '').strip()
            handle.write(f'{idx}\n{start_ts} --> {end_ts}\n{text}\n\n')


def aggregate_words(words: List[Dict[str, Any]], max_words: int = 10) -> List[Dict[str, Any]]:
    segments, buffer, start_time = [], [], None
    for word in words:
        token = word.get('word', '').strip()
        if not token:
            continue
        if start_time is None:
            start_time = word.get('start', 0.0)
        buffer.append(token)
        end_time = word.get('end', start_time)
        if len(buffer) >= max_words:
            segments.append({'start': start_time, 'end': end_time, 'text': ' '.join(buffer)})
            buffer, start_time = [], None
    if buffer:
        end_time = words[-1].get('end', start_time or 0.0)
        segments.append({'start': start_time or 0.0, 'end': end_time, 'text': ' '.join(buffer)})
    return segments


def approximate_segments_from_text(text: str, audio_duration: float, words_per_segment: int = 16) -> List[Dict[str, Any]]:
    tokens = text.strip().split()
    if not tokens or audio_duration <= 0:
        return []
    avg_time = audio_duration / max(len(tokens), 1)
    segments = []
    start = 0.0
    idx = 0
    while idx < len(tokens):
        chunk = tokens[idx: idx + words_per_segment]
        chunk_duration = avg_time * len(chunk)
        end = min(start + chunk_duration, audio_duration)
        segments.append({'start': start, 'end': end, 'text': ' '.join(chunk)})
        start = end
        idx += words_per_segment
    if segments:
        segments[-1]['end'] = audio_duration
    return segments


def preview_segments(name: str, segments: List[Dict[str, Any]], limit: int = 3) -> None:
    print('\n{} -> {} segments'.format(name, len(segments)))
    for segment in segments[:limit]:
        print('  [{} - {}] {}'.format(format_timestamp(segment['start']), format_timestamp(segment['end']), segment['text']))


## Configure Input Paths
Update `VIDEO_PATH` to point to the video you want to process. The cell extracts mono 16 kHz audio so every ASR model can share the same waveform.

In [3]:
VIDEO_PATH = Path('media/your_video.mp4')
assert VIDEO_PATH.exists(), f'Video not found: {VIDEO_PATH}'

BASE_NAME = VIDEO_PATH.stem
NOTEBOOK_OUTPUT = ensure_dir(Path('output') / BASE_NAME)
AUDIO_DIR = ensure_dir(NOTEBOOK_OUTPUT / 'audio')
SRT_DIR = ensure_dir(NOTEBOOK_OUTPUT / 'srt')
TRANSLATION_DIR = ensure_dir(NOTEBOOK_OUTPUT / 'translations')

AUDIO_PATH = extract_audio_ffmpeg(VIDEO_PATH, AUDIO_DIR)
AUDIO_DURATION = get_audio_duration(AUDIO_PATH)
print(f'Audio extracted to {AUDIO_PATH}')
print(f'Duration: {AUDIO_DURATION:.2f} seconds')

Audio extracted to output/your_video/audio/your_video_c11c97cc.wav
Duration: 80.13 seconds


# Segment 1 · Transcription
Each subsection runs a different speech recogniser. Results are cached inside `transcripts_by_model` and written to disk as English `.srt` files.

In [4]:
transcripts_by_model: Dict[str, List[Dict[str, Any]]] = {}

### Whisper

In [5]:
model_size = 'small'  # change to tiny/base/medium/large as needed
try:
    if whisper is None:
        raise ImportError('whisper package not installed')
    whisper_model = whisper.load_model(model_size)
    result = whisper_model.transcribe(str(AUDIO_PATH))
    whisper_segments = [
        {'start': seg['start'], 'end': seg['end'], 'text': seg['text'].strip()}
        for seg in result.get('segments', [])
    ]
    transcripts_by_model['whisper'] = whisper_segments
    whisper_srt = SRT_DIR / f'{BASE_NAME}_whisper.srt'
    segments_to_srt(whisper_segments, whisper_srt)
    preview_segments('Whisper', whisper_segments)
    print(f'Saved -> {whisper_srt}')
except Exception as exc:
    print(f'Whisper transcription skipped: {exc}')


Whisper -> 20 segments
  [00:00:00,000 - 00:00:05,320] Hello and welcome to One Minute Wednesdays where I teach you English in just one minute.
  [00:00:05,320 - 00:00:11,280] You solidify your knowledge by writing a practice sentence down below and I will give it a thumb up
  [00:00:11,280 - 00:00:14,600] if it's correct and I will try to correct you if it's not.
Saved -> output/your_video/srt/your_video_whisper.srt


### Wav2Vec2

In [6]:
wav2vec_model_id = 'facebook/wav2vec2-large-960h-lv60-self'
try:
    if hf_asr_pipeline is None:
        raise ImportError('transformers pipeline not available')
    wav2vec_pipeline = hf_asr_pipeline(
        task='automatic-speech-recognition',
        model=wav2vec_model_id,
        chunk_length_s=30,
        stride_length_s=5,
        return_timestamps='word'
    )
    wav2vec_result = wav2vec_pipeline(str(AUDIO_PATH))
    if 'chunks' in wav2vec_result:
        wav2vec_segments = [
            {'start': float(chunk['timestamp'][0]), 'end': float(chunk['timestamp'][1]), 'text': chunk['text'].strip()}
            for chunk in wav2vec_result['chunks']
            if chunk.get('timestamp') and chunk.get('text', '').strip()
        ]
    else:
        wav2vec_segments = approximate_segments_from_text(wav2vec_result.get('text', ''), AUDIO_DURATION)
    transcripts_by_model['wav2vec2'] = wav2vec_segments
    wav2vec_srt = SRT_DIR / f'{BASE_NAME}_wav2vec2.srt'
    segments_to_srt(wav2vec_segments, wav2vec_srt)
    preview_segments('Wav2Vec2', wav2vec_segments)
    print(f'Saved -> {wav2vec_srt}')
except Exception as exc:
    print(f'Wav2Vec2 transcription skipped: {exc}')

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0
Device set to use mps:0



Wav2Vec2 -> 175 segments
  [00:00:00,320 - 00:00:00,620] HELLO
  [00:00:00,780 - 00:00:00,880] AND
  [00:00:00,920 - 00:00:01,220] WELCOME
Saved -> output/your_video/srt/your_video_wav2vec2.srt


### Silero

In [7]:
silero_device = 'cuda' if torch and torch.cuda.is_available() else 'cpu'
try:
    if torch is None:
        raise ImportError('torch not installed')

    silero_model, silero_decoder, silero_utils = torch.hub.load(
        repo_or_dir='snakers4/silero-models',
        model='silero_stt',
        language='en',
        device=silero_device
    )

    read_batch, split_into_batches, read_audio, prepare_model_input = silero_utils

    silero_batches = split_into_batches([str(AUDIO_PATH)], batch_size=1)
    silero_text = []

    for batch in silero_batches:
        audio = read_batch(batch)
        input_tensor = prepare_model_input(audio).to(silero_device)
        output = silero_model(input_tensor)
        silero_text.append(silero_decoder(output[0].cpu()))

    combined_text = ' '.join(silero_text)
    silero_segments = approximate_segments_from_text(combined_text, AUDIO_DURATION)
    transcripts_by_model['silero'] = silero_segments

    silero_srt = SRT_DIR / f'{BASE_NAME}_silero.srt'
    segments_to_srt(silero_segments, silero_srt)
    preview_segments('Silero', silero_segments)

    print(f'Saved -> {silero_srt}')

except Exception as exc:
    print(f'Silero transcription skipped: {exc}')

Using cache found in /Users/aryansharma/.cache/torch/hub/snakers4_silero-models_master



Silero -> 9 segments
  [00:00:00,000 - 00:00:09,787] to edesday where i to english just one minute you little y by writing a act
  [00:00:09,787 - 00:00:19,574] sentence down low and i will if it's correct i wouldtry to rectyou if it's not
  [00:00:19,574 - 00:00:29,361] so let's start that clock i us the phoneones they are se differently but sound the
Saved -> output/your_video/srt/your_video_silero.srt


### NeMo (Kaldi-inspired)

In [None]:
nemo_model_name = 'stt_en_conformer_ctc_small'
try:
    if nemo_asr is None:
        raise ImportError('nemo-toolkit not installed')
    nemo_model = nemo_asr.models.ASRModel.from_pretrained(model_name=nemo_model_name)
    try:
        transcribe_result = nemo_model.transcribe([str(AUDIO_PATH)], return_timestamps='word')
    except TypeError:
        transcribe_result = nemo_model.transcribe([str(AUDIO_PATH)])
    nemo_transcripts = []
    nemo_word_ts = None

    if isinstance(transcribe_result, (list, tuple)) and len(transcribe_result) == 2:
        possible_transcripts, possible_word_ts = transcribe_result[0], transcribe_result[1]
        nemo_transcripts = possible_transcripts
        nemo_word_ts = possible_word_ts
    else:
        nemo_transcripts = transcribe_result
    def _coerce_text(obj):
        if obj is None:
            return ''
        if isinstance(obj, str):
            return obj
        if hasattr(obj, 'text'):
            try:
                return obj.text
            except Exception:
                pass
        try:
            return str(obj)
        except Exception:
            return ''

    if isinstance(nemo_transcripts, (list, tuple)):
        nemo_transcripts = [_coerce_text(t) for t in nemo_transcripts]
    elif isinstance(nemo_transcripts, str):
        nemo_transcripts = [nemo_transcripts]
    else:
        try:
            nemo_transcripts = [_coerce_text(t) for t in list(nemo_transcripts)]
        except Exception:
            nemo_transcripts = []
    if nemo_word_ts:
        normalized_word_ts = []
        for utt in nemo_word_ts:
            utt_tokens = []
            for tok in utt:
                if isinstance(tok, dict):
                    word = tok.get('word') or tok.get('text') or ''
                    start = tok.get('start_time', tok.get('start', 0.0))
                    end = tok.get('end_time', tok.get('end', start))
                else:
                    # object with attributes
                    word = getattr(tok, 'word', None) or getattr(tok, 'text', None) or str(tok)
                    start = getattr(tok, 'start_time', None)
                    if start is None:
                        start = getattr(tok, 'start', 0.0)
                    end = getattr(tok, 'end_time', None)
                    if end is None:
                        end = getattr(tok, 'end', start)
                try:
                    start = float(start) if start is not None else 0.0
                except Exception:
                    start = 0.0
                try:
                    end = float(end) if end is not None else start
                except Exception:
                    end = start
                utt_tokens.append({'word': str(word).strip(), 'start': start, 'end': end})
            normalized_word_ts.append(utt_tokens)
        nemo_word_ts = normalized_word_ts

    words = []
    if nemo_word_ts and len(nemo_word_ts) > 0:
        # use the first utterance's tokens (we only provided one file)
        for token in nemo_word_ts[0]:
            words.append({'word': token.get('word', ''), 'start': token.get('start', 0.0), 'end': token.get('end', 0.0)})
        nemo_segments = aggregate_words(words)
    elif nemo_transcripts:
        # no timestamps — create approximate segments from transcript text
        nemo_segments = approximate_segments_from_text(nemo_transcripts[0], AUDIO_DURATION)
    else:
        nemo_segments = []

    transcripts_by_model['nemo'] = nemo_segments
    nemo_srt = SRT_DIR / f'{BASE_NAME}_nemo.srt'
    segments_to_srt(nemo_segments, nemo_srt)
    preview_segments('NeMo', nemo_segments)
    print(f'Saved -> {nemo_srt}')
except Exception as exc:
    print(f'NeMo transcription skipped: {type(exc).__name__}: {exc}')


[NeMo I 2025-09-26 15:17:55 nemo_logging:393] Found existing object /Users/aryansharma/.cache/torch/NeMo/NeMo_2.4.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.
[NeMo I 2025-09-26 15:17:55 nemo_logging:393] Re-using file from: /Users/aryansharma/.cache/torch/NeMo/NeMo_2.4.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo
[NeMo I 2025-09-26 15:17:55 nemo_logging:393] Re-using file from: /Users/aryansharma/.cache/torch/NeMo/NeMo_2.4.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo
[NeMo I 2025-09-26 15:17:55 nemo_logging:393] Instantiating model from pre-trained checkpoint
[NeMo I 2025-09-26 15:17:55 nemo_logging:393] Instantiating model from pre-trained checkpoint
[NeMo I 2025-09-26 15:17:55 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2025-09-26 15:17:55 nemo_logging:393] Tokenizer SentencePieceTokenizer init

[NeMo W 2025-09-26 15:17:55 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data/NeMo_ASR_SET/English/v2.0/train/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 64
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20.0
    min_duration: 0.1
    shuffle_n: 2048
    is_tarred: true
    tarred_audio_filepaths: /data/NeMo_ASR_SET/English/v2.0/train/audio__OP_0..4095_CL_.tar
    
[NeMo W 2025-09-26 15:17:55 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath:
    - /data/ASR/LibriSpeech/librisp

[NeMo I 2025-09-26 15:17:55 nemo_logging:393] PADDING: 0
[NeMo I 2025-09-26 15:17:55 nemo_logging:393] Model EncDecCTCModelBPE was successfully restored from /Users/aryansharma/.cache/torch/NeMo/NeMo_2.4.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.
[NeMo I 2025-09-26 15:17:55 nemo_logging:393] Model EncDecCTCModelBPE was successfully restored from /Users/aryansharma/.cache/torch/NeMo/NeMo_2.4.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.


Transcribing: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]


NeMo -> 11 segments
  [00:00:00,000 - 00:00:07,454] hello i welcome two one minute wednesdays where i teach you english in just one minute
  [00:00:07,454 - 00:00:14,908] you solidify your knowledge by writing a practice sentence down below and i will give it
  [00:00:14,908 - 00:00:22,362] a thumb up if it's correct and i will try to correct you if it's not
Saved -> output/your_video/srt/your_video_nemo.srt





### Vosk

In [17]:
vosk_model_dir = Path('models/vosk-model-small-en-us-0.15')
try:
    from vosk import Model, KaldiRecognizer
    if not vosk_model_dir.exists():
        raise FileNotFoundError(f'Download a Vosk model to {vosk_model_dir}')
    vosk_model = Model(str(vosk_model_dir))
    wf = wave.open(str(AUDIO_PATH), 'rb')
    recognizer = KaldiRecognizer(vosk_model, wf.getframerate())
    recognizer.SetWords(True)
    words = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if recognizer.AcceptWaveform(data):
            partial = json.loads(recognizer.Result())
            words.extend(partial.get('result', []))
    final = json.loads(recognizer.FinalResult())
    words.extend(final.get('result', []))
    wf.close()
    vosk_segments = aggregate_words(words)
    transcripts_by_model['vosk'] = vosk_segments
    vosk_srt = SRT_DIR / f'{BASE_NAME}_vosk.srt'
    segments_to_srt(vosk_segments, vosk_srt)
    preview_segments('Vosk', vosk_segments)
    print(f'Saved -> {vosk_srt}')
except Exception as exc:
    print(f'Vosk transcription skipped: {exc}')

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from models/vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from models/vosk-model-small-en-us-0.15/graph/HCLr.fst models/vosk-model-small-en-us-0.15/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int



Vosk -> 18 segments
  [00:00:00,270 - 00:00:03,180] hello and welcome to woman it wednesday's where i teach
  [00:00:03,270 - 00:00:07,290] english in just one minute you solidify our knowledge by
  [00:00:07,290 - 00:00:10,350] writing a practice sentence down below and i will give
Saved -> output/your_video/srt/your_video_vosk.srt


# Segment 2 · Translation
Provide the language names (or ISO codes) you want to generate. Translations reuse the English segments created above.

In [18]:
target_languages = ['hindi', 'marathi']  # supports names or ISO codes from LANG_CODE_MAP

def resolve_language(lang: str) -> str:
    lang = lang.lower().strip()
    if lang in LANG_CODE_MAP.values():
        return lang
    return LANG_CODE_MAP.get(lang, 'en')

resolved_targets = [resolve_language(lang) for lang in target_languages]
print(f'Translating to -> {resolved_targets}')

Translating to -> ['hi', 'mr']


In [19]:
if MarianMTModel is None or MarianTokenizer is None:
    raise ImportError('transformers MarianMT not available; install transformers for translation')

_translation_cache: Dict[str, Any] = {}

def get_translation_model(src_lang: str, tgt_lang: str):
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    if model_name not in _translation_cache:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        if torch:
            model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
        _translation_cache[model_name] = (tokenizer, model)
    return _translation_cache[model_name]

def chunk_list(items: List[Any], chunk_size: int = 8):
    for idx in range(0, len(items), chunk_size):
        yield items[idx: idx + chunk_size]

def translate_segments(segments: List[Dict[str, Any]], src_lang: str, tgt_lang: str) -> List[Dict[str, Any]]:
    tokenizer, model = get_translation_model(src_lang, tgt_lang)
    device = next(model.parameters()).device if torch else 'cpu'
    translated_segments = []
    for batch in chunk_list(segments, chunk_size=8):
        texts = [seg['text'] for seg in batch]
        inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        if torch:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model.generate(**inputs)
        else:
            outputs = model.generate(**inputs)
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        for seg, text in zip(batch, decoded):
            seg_copy = dict(seg)
            seg_copy['text'] = text
            translated_segments.append(seg_copy)
    return translated_segments

translated_paths = []
for model_name, segments in transcripts_by_model.items():
    for tgt in resolved_targets:
        try:
            translated = translate_segments(segments, src_lang='en', tgt_lang=tgt)
            out_dir = ensure_dir(TRANSLATION_DIR / tgt)
            out_path = out_dir / f'{BASE_NAME}_{model_name}_{tgt}.srt'
            segments_to_srt(translated, out_path)
            preview_segments(f'{model_name.upper()} -> {tgt.upper()}', translated)
            translated_paths.append(out_path)
            print(f'Saved -> {out_path}')
        except Exception as exc:
            print(f'Translation skipped for {model_name} -> {tgt}: {exc}')

translated_paths


WHISPER -> HI -> 20 segments
  [00:00:00,000 - 00:00:05,320] नमस्कार और एक मिनट के लिए स्वागत है जहां मैं सिर्फ एक मिनट में तुम्हें अंग्रेजी सिखाता हूं.
  [00:00:05,320 - 00:00:11,280] आप नीचे दिये गये वाक्य को लिख कर अपने ज्ञान को मज़बूत करते हैं और मैं उसे इंच दूँगा
  [00:00:11,280 - 00:00:14,600] अगर यह सही है और मैं आपको सही करने की कोशिश करेंगे अगर यह नहीं है.
Saved -> output/your_video/translations/hi/your_video_whisper_hi.srt

WHISPER -> MR -> 20 segments
  [00:00:00,000 - 00:00:05,320] मी तुला इंग्रजीचा अभ्यास करायला शिकवू शकते.
  [00:00:05,320 - 00:00:11,280] मी तुला ज्ञान व समज देईन. मी तुला ज्ञान व समज देईन.
  [00:00:11,280 - 00:00:14,600] मी तुला तसं करायला देऊ शकत नाही.
Saved -> output/your_video/translations/mr/your_video_whisper_mr.srt

WHISPER -> MR -> 20 segments
  [00:00:00,000 - 00:00:05,320] मी तुला इंग्रजीचा अभ्यास करायला शिकवू शकते.
  [00:00:05,320 - 00:00:11,280] मी तुला ज्ञान व समज देईन. मी तुला ज्ञान व समज देईन.
  [00:00:11,280 - 00:00:14,600] मी तुला तसं कराय

[PosixPath('output/your_video/translations/hi/your_video_whisper_hi.srt'),
 PosixPath('output/your_video/translations/mr/your_video_whisper_mr.srt'),
 PosixPath('output/your_video/translations/hi/your_video_wav2vec2_hi.srt'),
 PosixPath('output/your_video/translations/mr/your_video_wav2vec2_mr.srt'),
 PosixPath('output/your_video/translations/hi/your_video_silero_hi.srt'),
 PosixPath('output/your_video/translations/mr/your_video_silero_mr.srt'),
 PosixPath('output/your_video/translations/hi/your_video_nemo_hi.srt'),
 PosixPath('output/your_video/translations/mr/your_video_nemo_mr.srt'),
 PosixPath('output/your_video/translations/hi/your_video_vosk_hi.srt'),
 PosixPath('output/your_video/translations/mr/your_video_vosk_mr.srt')]