In [6]:
# Install into the running kernel's environment via the %pip magic, pinned to the
# version this notebook was last executed with so re-runs stay reproducible.
%pip install stable-ts==2.13.7

Collecting stable-ts
  Using cached stable_ts-2.13.7-py3-none-any.whl
Collecting torchaudio
  Using cached torchaudio-2.1.2-cp310-cp310-win_amd64.whl (2.3 MB)
Installing collected packages: torchaudio, stable-ts
Successfully installed stable-ts-2.13.7 torchaudio-2.1.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
from datetime import datetime, timedelta

import pandas as pd
import srt
import whisper
import logging
import os
from stable_whisper import modify_model
# Named logger for this notebook.
# NOTE(review): the functions visible below call the module-level `logging.info` /
# `logging.error` helpers (root logger) rather than this object, so it appears unused — confirm.
logger = logging.getLogger("transcribe")

In [9]:
# Whisper checkpoint name passed to get_openai_model() when the model is loaded.
model_name_openai = "base.en"
# Path to a single clip (Video4.mp4).
# NOTE(review): absolute and machine-specific, so it will not resolve on other machines —
# consider making it relative or configurable.
file_path = "/home/sunil/projects/Stuff/Combined/TalkNet-ASD/Videos/Video4.mp4"
# Directory that is scanned for .mp4 clips and joined with each file name for batch transcription.
video_dir = "../Dataset/Videos/"

def get_openai_model(model_name="base"):
    """Load an OpenAI Whisper checkpoint and patch it with stable-ts.

    Args:
        model_name: Name of the Whisper checkpoint to load. Must be one of the
            names in ``supported_models`` below.

    Returns:
        The loaded model after ``modify_model`` has been applied to it, or
        ``None`` if the name is not supported or loading/patching raised an
        exception (the failure is logged with its traceback).
    """
    supported_models = [
        "tiny.en",
        "tiny",
        "base.en",
        "base",
        "small.en",
        "small",
        "medium.en",
        "medium",
        "large",
    ]

    logging.info(f"Initializing openai's '{model_name}' model")

    if model_name not in supported_models:
        logging.error(
            f"Model '{model_name}' not found; available models = {supported_models}"
        )
        return None

    try:
        model = whisper.load_model(model_name)
        # Using the stable whisper to modify the model for better timestamps accuracy
        modify_model(model)
    except Exception:
        # Keep the best-effort "return None" contract, but record the traceback so the
        # cause is visible. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate, e.g. when a long download is cancelled.
        logging.exception("Unable to initialize openai model")
        return None

    logging.info("Model was successfully initialized")
    return model

model = get_openai_model(model_name_openai)
# get_openai_model() returns None on failure; stop here with a clear message instead of
# failing later with an AttributeError when the transcription code calls model.transcribe.
if model is None:
    raise RuntimeError(
        f"Could not load Whisper model '{model_name_openai}'; see the logged error for details"
    )

def generate_openai_transcription(file_name, file_path, model):
    """Transcribe one media file and return its word-level timestamps as a DataFrame.

    Args:
        file_name: Label written to the ``file_name`` column of every row.
        file_path: Path handed to ``model.transcribe``.
        model: Object exposing ``transcribe(path, **options)`` whose result has a
            ``segments`` sequence. Each segment must provide ``text``, ``start``,
            ``end`` and ``words``; each word must provide ``word``, ``start`` and
            ``end``.

    Returns:
        pandas.DataFrame with one row per word and the columns ``file_name``,
        ``segment_id`` (0-based position of the segment in the result),
        ``segment_text``, ``segment_start``, ``segment_end``, ``word``,
        ``word_start`` and ``word_end``. Segments without words contribute no
        rows. If nothing was transcribed the frame is empty but still carries
        these columns, so it can be concatenated with other results safely.
    """
    logging.info(f"Generating transcription for file - {file_path}")

    # Plain transcription with the language fixed to English.
    output = model.transcribe(file_path, task="transcribe", language="en")

    columns = [
        "file_name",
        "segment_id",
        "segment_text",
        "segment_start",
        "segment_end",
        "word",
        "word_start",
        "word_end",
    ]

    # Flatten segments -> words in a single pass; segment fields are repeated on each word row.
    rows = [
        {
            "file_name": file_name,
            "segment_id": segment_id,
            "segment_text": segment.text,
            "segment_start": segment.start,
            "segment_end": segment.end,
            "word": word.word,
            "word_start": word.start,
            "word_end": word.end,
        }
        for segment_id, segment in enumerate(output.segments)
        for word in segment.words
    ]

    # Passing the columns explicitly keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [12]:
# Collect the .mp4 files directly inside video_dir. os.listdir() returns entries in
# arbitrary order, so sort them to keep the processing (and exported row) order
# reproducible across platforms and runs.
video_files = sorted(
    f
    for f in os.listdir(video_dir)
    if os.path.isfile(os.path.join(video_dir, f)) and os.path.splitext(f)[1] == ".mp4"
)
video_files

['MagnusCarlson_542_599.mp4',
 'NDT_India_19_88.mp4',
 'StarTalk_CMBR_190_225.mp4',
 'StarTalk_CMBR_270_308.mp4',
 'StarTalk_CMBR_319_356.mp4',
 'StarTalk_CMBR_92_152.mp4',
 'StarTalk_FlyingVehicles_1001_1043.mp4',
 'StarTalk_FlyingVehicles_1980_2040.mp4',
 'StarTalk_FlyingVehicles_2446_2508.mp4',
 'StarTalk_FlyingVehicles_2670_2710.mp4',
 'StarTalk_FlyingVehicles_300_340.mp4',
 'StarTalk_FlyingVehicles_674_719.mp4',
 'StarTalk_FlyingVehicles_780_811.mp4',
 'StarTalk_FlyingVehicles_949_1000.mp4',
 'StarTalk_Sleep_1152_1211.mp4',
 'StarTalk_Sleep_1602_1639.mp4',
 'StarTalk_Sleep_1980_2041.mp4',
 'StarTalk_Sleep_2099_2160.mp4',
 'StarTalk_Sleep_2379_2443.mp4',
 'StarTalk_Sleep_2470_2551.mp4',
 'StarTalk_Sleep_382_450.mp4',
 'StarTalk_Sleep_748_796.mp4']

In [13]:
# Transcribe every discovered clip, collecting one word-level frame per file.
df_list = [
    generate_openai_transcription(f, os.path.join(video_dir, f), model)
    for f in video_files
]

# ignore_index gives the combined frame a clean 0..n-1 index instead of repeating each
# per-file index; the exported sheet is unaffected because it is written with index=False.
final_df = pd.concat(df_list, ignore_index=True)
final_df.to_excel("Transcriptions.xlsx", index=False)

Transcribe: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 57.01/57.01 [00:09<00:00,  5.94sec/s]
Transcribe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 69.0/69.0 [00:11<00:00,  6.08sec/s]
Transcribe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35.0/35.0 [00:05<00:00,  6.16sec/s]
Transcribe: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38.01/38.01 [00:07<00:00,  5.13sec/s]
Transcribe: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37.01/37.01 [00:05<00:00,  6.61sec/s]
Transcribe: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60.02/

Transcribe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48.0/48.0 [00:07<00:00,  6.26sec/s]


In [11]:
# Single-file export for the clip at `file_path`.
# This cell used a `df` that no visible top-level cell defines (it only exists as a local
# inside generate_openai_transcription), so it raised NameError on Restart & Run All.
# NOTE(review): recreating it from `file_path` is inferred from the matching names
# (Video4.mp4 -> Video4.xlsx) — confirm, or delete this cell if it is obsolete.
df = generate_openai_transcription(os.path.basename(file_path), file_path, model)
df.to_excel("Video4.xlsx", index=False)