In [6]:
pip install stable-ts

Collecting stable-ts
  Using cached stable_ts-2.13.7-py3-none-any.whl
Collecting torchaudio
  Using cached torchaudio-2.1.2-cp310-cp310-win_amd64.whl (2.3 MB)
Installing collected packages: torchaudio, stable-ts
Successfully installed stable-ts-2.13.7 torchaudio-2.1.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
from datetime import datetime, timedelta

import pandas as pd
import srt
import whisper
import logging
import os
from stable_whisper import modify_model
logger = logging.getLogger("transcribe")

In [3]:
def get_whisper_model(model_name="base"):
    # initialize model
    logging.info(f"Initializing openai's '{model_name} 'model")
    if model_name in [
        "tiny.en",
        "tiny",
        "base.en",
        "base",
        "small.en",
        "small",
        "medium.en",
        "medium",
        "large",
    ]:
        try:
            model = whisper.load_model(model_name)
            # Using the stable whisper to modifiy the model for better timestamps accuracy
            modify_model(model)
            logging.info("Model was successfully initialized")
        except:
            logging.error("Unable to initialize openai model")
            return None
    else:
        logging.error(
            "Model  not found; available models = ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large']"
        )
        return None

    return model

def get_whisper_result(file_path, model):
    
    logging.info(f"Generating transcription for file - {file_path}")

    decode_options = dict(language="en")
    transcribe_options = dict(task="transcribe", **decode_options)
    output = model.transcribe(file_path, **transcribe_options)
    output = model.align(file_path, output, language='en')
    return output


def generate_whisper_transcription(file_name, file_path, output):
    
    logging.info(f"Organizing transcription for file - {file_path}")
    
    transcriptions = {}

    for num, s in enumerate(output.segments):
        transcriptions[num] = []
        for word in s.words:
            transcriptions[num].append(
                {
                    "text": s.text.strip(),
                    "segment_start": s.start,
                    "segment_end": s.end,
                    "word": word.word.strip(),
                    "word_start": word.start,
                    "word_end": word.end,
                }
            )

    rows = []

    for key, words in transcriptions.items():
        for word in words:
            row = {
                "file_name": file_name,
                "segment_id": key,
                "segment_text": word["text"],
                "segment_start": word["segment_start"],
                "segment_end": word["segment_end"],
                "word": word["word"],
                "word_start": word["word_start"],
                "word_end": word["word_end"],
            }
            rows.append(row)

    df = pd.DataFrame(rows)

    return df

In [23]:
model_name_openai = "large"
model = get_whisper_model(model_name_openai)

In [41]:
file_name = 'StarTalk_Sleep_382_450.mp4'
video_dir = "../../Dataset/Videos/"

output = get_whisper_result(os.path.join(video_dir, file_name), model)
df = generate_whisper_transcription(file_name, os.path.join(video_dir, file_name), output)

Transcribe: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 68.01/68.01 [00:11<00:00,  6.01sec/s]
Align: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 68.01/68.01 [00:00<00:00, 82.31sec/s]


In [48]:
print(len(df))
df[70:100]

166


Unnamed: 0,file_name,segment_id,segment_text,segment_start,segment_end,word,word_start,word_end
70,StarTalk_Sleep_382_450.mp4,10,"You know, that little brown bat can sleep up t...",27.6,32.58,brown,28.86,29.38
71,StarTalk_Sleep_382_450.mp4,10,"You know, that little brown bat can sleep up t...",27.6,32.58,bat,29.38,29.7
72,StarTalk_Sleep_382_450.mp4,10,"You know, that little brown bat can sleep up t...",27.6,32.58,can,29.7,29.82
73,StarTalk_Sleep_382_450.mp4,10,"You know, that little brown bat can sleep up t...",27.6,32.58,sleep,30.0,30.14
74,StarTalk_Sleep_382_450.mp4,10,"You know, that little brown bat can sleep up t...",27.6,32.58,up,30.14,30.48
75,StarTalk_Sleep_382_450.mp4,10,"You know, that little brown bat can sleep up t...",27.6,32.58,to,30.48,30.66
76,StarTalk_Sleep_382_450.mp4,10,"You know, that little brown bat can sleep up t...",27.6,32.58,19,30.66,31.24
77,StarTalk_Sleep_382_450.mp4,10,"You know, that little brown bat can sleep up t...",27.6,32.58,hours,31.54,32.06
78,StarTalk_Sleep_382_450.mp4,10,"You know, that little brown bat can sleep up t...",27.6,32.58,per,32.06,32.22
79,StarTalk_Sleep_382_450.mp4,10,"You know, that little brown bat can sleep up t...",27.6,32.58,day.,32.48,32.58


In [49]:
df.to_excel("StarTalk_Sleep_382_450.xlsx", index=False)

In [13]:
video_files = [
    f
    for f in os.listdir(video_dir)
    if os.path.isfile(os.path.join(video_dir, f)) and os.path.splitext(os.path.join(video_dir, f))[1] in [".mp4"]
]
video_files.sort()
video_files

['MagnusCarlson_542_599.mp4',
 'NDT_India_19_88.mp4',
 'StarTalk_CMBR_190_225.mp4',
 'StarTalk_CMBR_270_308.mp4',
 'StarTalk_CMBR_319_356.mp4',
 'StarTalk_CMBR_92_152.mp4',
 'StarTalk_FlyingVehicles_1001_1043.mp4',
 'StarTalk_FlyingVehicles_1980_2040.mp4',
 'StarTalk_FlyingVehicles_2446_2508.mp4',
 'StarTalk_FlyingVehicles_2670_2710.mp4',
 'StarTalk_FlyingVehicles_300_340.mp4',
 'StarTalk_FlyingVehicles_674_719.mp4',
 'StarTalk_FlyingVehicles_780_811.mp4',
 'StarTalk_FlyingVehicles_949_1000.mp4',
 'StarTalk_Sleep_1152_1211.mp4',
 'StarTalk_Sleep_1602_1639.mp4',
 'StarTalk_Sleep_1980_2041.mp4',
 'StarTalk_Sleep_2099_2160.mp4',
 'StarTalk_Sleep_2379_2443.mp4',
 'StarTalk_Sleep_2470_2551.mp4',
 'StarTalk_Sleep_382_450.mp4',
 'StarTalk_Sleep_748_796.mp4']

In [16]:
model_name_openai = "large"
video_dir = "../../Dataset/Videos/"
video_files = [
    f
    for f in os.listdir(video_dir)
    if os.path.isfile(os.path.join(video_dir, f)) and os.path.splitext(os.path.join(video_dir, f))[1] in [".mp4"]
]
video_files.sort()
df_list = []
model = get_whisper_model(model_name_openai)
for f in video_files:
    output = get_whisper_result(os.path.join(video_dir, f), model)
    df_list.append(generate_whisper_transcription(f, os.path.join(video_dir, f), output))

final_df = pd.concat(df_list)
final_df.to_excel(f"Transcriptions{datetime.now().strftime('_%Y%m%d_%H%M%S')}.xlsx", index=False)

Transcribe: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60.02/60.02 [00:14<00:00,  4.16sec/s]
Align: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60.02/60.02 [00:01<00:00, 55.86sec/s]
  output = model.align(file_path, output, language='en')
Transcribe: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 61.02/61.02 [00:07<00:00,  8.46sec/s]
Align: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 61.02/61.02 [00:01<00:00, 56.90sec/s]
Transcribe: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37.01/37.01 [00:05<00:00,  6.68sec/s]
Align: 100%|██████████████████████████████████████████████████████████████████

In [17]:
final_df

Unnamed: 0,file_name,segment_id,segment_text,segment_start,segment_end,word,word_start,word_end
0,StarTalk_CMBR_92_152.mp4,0,So here you go.,0.00,1.64,So,0.00,0.40
1,StarTalk_CMBR_92_152.mp4,0,So here you go.,0.00,1.64,here,0.40,1.00
2,StarTalk_CMBR_92_152.mp4,0,So here you go.,0.00,1.64,you,1.00,1.40
3,StarTalk_CMBR_92_152.mp4,0,So here you go.,0.00,1.64,go.,1.40,1.64
4,StarTalk_CMBR_92_152.mp4,1,Here we go.,2.02,2.46,Here,2.02,2.12
...,...,...,...,...,...,...,...,...
116,StarTalk_Sleep_748_796.mp4,16,So isn't it a strange thing?,44.64,47.38,isn't,45.48,45.84
117,StarTalk_Sleep_748_796.mp4,16,So isn't it a strange thing?,44.64,47.38,it,45.84,45.92
118,StarTalk_Sleep_748_796.mp4,16,So isn't it a strange thing?,44.64,47.38,a,45.92,46.34
119,StarTalk_Sleep_748_796.mp4,16,So isn't it a strange thing?,44.64,47.38,strange,46.50,46.86
