In [6]:
pip install stable-ts

Collecting stable-ts
  Using cached stable_ts-2.13.7-py3-none-any.whl
Collecting torchaudio
  Using cached torchaudio-2.1.2-cp310-cp310-win_amd64.whl (2.3 MB)
Installing collected packages: torchaudio, stable-ts
Successfully installed stable-ts-2.13.7 torchaudio-2.1.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datetime import datetime, timedelta

import pandas as pd
import srt
import whisper
import logging
import os
from stable_whisper import modify_model
logger = logging.getLogger("transcribe")

In [1]:
def get_whisper_model(model_name="base"):
    # initialize model
    logging.info(f"Initializing openai's '{model_name} 'model")
    if model_name in [
        "tiny.en",
        "tiny",
        "base.en",
        "base",
        "small.en",
        "small",
        "medium.en",
        "medium",
        "large",
    ]:
        try:
            model = whisper.load_model(model_name)
            # Using the stable whisper to modifiy the model for better timestamps accuracy
            modify_model(model)
            logging.info("Model was successfully initialized")
        except:
            logging.error("Unable to initialize openai model")
            return None
    else:
        logging.error(
            "Model  not found; available models = ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large']"
        )
        return None

    return model

def get_whisper_result(file_path, model):
    
    logging.info(f"Generating transcription for file - {file_path}")

    decode_options = dict(language="en")
    transcribe_options = dict(task="transcribe", **decode_options)
    output = model.transcribe(file_path, **transcribe_options)
    output = model.align(file_path, output, language='en')
    return output


def generate_whisper_transcription(file_name, file_path, output):
    
    logging.info(f"Organizing transcription for file - {file_path}")
    
    transcriptions = {}

    for num, s in enumerate(output.segments):
        transcriptions[num] = []
        for word in s.words:
            transcriptions[num].append(
                {
                    "text": s.text.strip(),
                    "segment_start": s.start,
                    "segment_end": s.end,
                    "word": word.word.strip(),
                    "word_start": word.start,
                    "word_end": word.end,
                }
            )

    rows = []

    for key, words in transcriptions.items():
        for word in words:
            row = {
                "file_name": file_name,
                "segment_id": key,
                "segment_text": word["text"],
                "segment_start": word["segment_start"],
                "segment_end": word["segment_end"],
                "word": word["word"],
                "word_start": word["word_start"],
                "word_end": word["word_end"],
            }
            rows.append(row)

    df = pd.DataFrame(rows)

    return df

In [17]:
file_name = 'MagnusCarlson_542_599.mp4'
model_name_openai = "medium.en"
video_dir = "../../Dataset/Videos/"

model = get_whisper_model(model_name_openai)
output = get_whisper_result(os.path.join(video_dir, file_name), model)
df = generate_whisper_transcription(file_name, os.path.join(video_dir, file_name), output)

100%|█████████████████████████████████████| 1.42G/1.42G [00:46<00:00, 32.7MiB/s]
Transcribe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 57.03/57.03 [00:09<00:00,  6.08sec/s]
Align: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 57.03/57.03 [00:01<00:00, 53.93sec/s]


In [19]:
df.head(30)

Unnamed: 0,file_name,segment_id,segment_text,segment_start,segment_end,word,word_start,word_end
0,MagnusCarlson_542_599.mp4,0,I'm going to name a sport.,0.0,0.94,I'm,0.0,0.18
1,MagnusCarlson_542_599.mp4,0,I'm going to name a sport.,0.0,0.94,going,0.18,0.28
2,MagnusCarlson_542_599.mp4,0,I'm going to name a sport.,0.0,0.94,to,0.28,0.36
3,MagnusCarlson_542_599.mp4,0,I'm going to name a sport.,0.0,0.94,name,0.36,0.46
4,MagnusCarlson_542_599.mp4,0,I'm going to name a sport.,0.0,0.94,a,0.46,0.66
5,MagnusCarlson_542_599.mp4,0,I'm going to name a sport.,0.0,0.94,sport.,0.66,0.94
6,MagnusCarlson_542_599.mp4,1,You have to tell me the greatest of all time.,1.56,2.78,You,1.56,1.62
7,MagnusCarlson_542_599.mp4,1,You have to tell me the greatest of all time.,1.56,2.78,have,1.62,1.74
8,MagnusCarlson_542_599.mp4,1,You have to tell me the greatest of all time.,1.56,2.78,to,1.74,1.84
9,MagnusCarlson_542_599.mp4,1,You have to tell me the greatest of all time.,1.56,2.78,tell,1.84,1.94


In [3]:
video_files = [
    f
    for f in os.listdir(video_dir)
    if os.path.isfile(os.path.join(video_dir, f)) and os.path.splitext(os.path.join(video_dir, f))[1] in [".mp4"]
]
video_files

NameError: name 'video_dir' is not defined

In [4]:
model_name_openai = "medium.en"
video_dir = "../../Dataset/Videos/"
video_files = [
    f
    for f in os.listdir(video_dir)
    if os.path.isfile(os.path.join(video_dir, f)) and os.path.splitext(os.path.join(video_dir, f))[1] in [".mp4"]
]
df_list = []
for f in video_files:
    model = get_whisper_model(model_name_openai)
    output = get_whisper_result(os.path.join(video_dir, f), model)
    df_list.append(generate_whisper_transcription(f, os.path.join(video_dir, f), output))

final_df = pd.concat(df_list)
final_df.to_excel(f"Transcriptions_{datetime.now().strftime('_%Y%m%d_%H%M%S')}.xlsx", index=False)

Transcribe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 60.02/60.02 [00:10<00:00,  5.73sec/s]
Align: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 60.02/60.02 [00:00<00:00, 117.85sec/s]
  output = model.align(file_path, output, language='en')
Transcribe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 61.02/61.02 [00:07<00:00,  7.71sec/s]
Align: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 61.02/61.02 [00:00<00:00, 127.17sec/s]
Transcribe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 37.01/37.01 [00:04<00:00,  7.70sec/s]
Align: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 37.01/37.01 [00:00<00:00, 113.77se

In [5]:
final_df

Unnamed: 0,file_name,segment_id,segment_text,segment_start,segment_end,word,word_start,word_end
0,StarTalk_CMBR_92_152.mp4,0,So.,0.00,0.32,So.,0.00,0.32
1,StarTalk_CMBR_92_152.mp4,1,"Well, so here you go.",0.50,1.74,"Well,",0.50,0.74
2,StarTalk_CMBR_92_152.mp4,1,"Well, so here you go.",0.50,1.74,so,0.88,1.00
3,StarTalk_CMBR_92_152.mp4,1,"Well, so here you go.",0.50,1.74,here,1.00,1.36
4,StarTalk_CMBR_92_152.mp4,1,"Well, so here you go.",0.50,1.74,you,1.36,1.54
...,...,...,...,...,...,...,...,...
117,StarTalk_Sleep_748_796.mp4,19,So isn't it a strange thing?,45.46,47.54,isn't,45.60,46.22
118,StarTalk_Sleep_748_796.mp4,19,So isn't it a strange thing?,45.46,47.54,it,46.26,46.38
119,StarTalk_Sleep_748_796.mp4,19,So isn't it a strange thing?,45.46,47.54,a,46.38,46.52
120,StarTalk_Sleep_748_796.mp4,19,So isn't it a strange thing?,45.46,47.54,strange,46.52,47.12
