In [6]:
pip install stable-ts

Collecting stable-ts
  Using cached stable_ts-2.13.7-py3-none-any.whl
Collecting torchaudio
  Using cached torchaudio-2.1.2-cp310-cp310-win_amd64.whl (2.3 MB)
Installing collected packages: torchaudio, stable-ts
Successfully installed stable-ts-2.13.7 torchaudio-2.1.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
from datetime import datetime, timedelta

import pandas as pd
import srt
import whisper
import logging
import os
from stable_whisper import modify_model
from datetime import timedelta
# Named logger for this notebook.
# NOTE(review): the helper functions defined below call the root `logging.info` / `logging.error`
# functions directly rather than this object — confirm which one is intended.
logger = logging.getLogger("transcribe")

In [2]:
def get_whisper_model(model_name="base"):
    """Load an OpenAI Whisper checkpoint and patch it with stable-ts.

    Args:
        model_name: Name of the checkpoint to load. Must be one of the names in
            ``supported_models`` below.

    Returns:
        The object returned by ``whisper.load_model`` after ``modify_model`` has been
        applied to it, or ``None`` when the name is not supported or loading raised
        an exception (the failure is logged together with its traceback).
    """
    supported_models = (
        "tiny.en",
        "tiny",
        "base.en",
        "base",
        "small.en",
        "small",
        "medium.en",
        "medium",
        "large",
    )

    logging.info(f"Initializing openai's '{model_name}' model")

    # Reject unknown names up front so the loader is never called with them.
    if model_name not in supported_models:
        logging.error(
            f"Model '{model_name}' not found; available models = {list(supported_models)}"
        )
        return None

    try:
        model = whisper.load_model(model_name)
        # Using the stable whisper to modify the model for better timestamps accuracy
        modify_model(model)
    except Exception:
        # Catch Exception rather than using a bare except so that KeyboardInterrupt and
        # SystemExit still propagate; logging.exception keeps the traceback in the log.
        logging.exception("Unable to initialize openai model")
        return None

    logging.info("Model was successfully initialized")
    return model


def get_whisper_result(file_path, model, language="en"):
    """Transcribe a media file and then align the transcription against the audio.

    Args:
        file_path: Path of the media file; passed unchanged to ``model.transcribe``
            and ``model.align``.
        model: Object exposing ``transcribe`` and ``align`` methods (in this notebook,
            the value returned by ``get_whisper_model``).
        language: Language code forwarded to both calls. Defaults to ``"en"``, the
            value that was previously hard-coded.

    Returns:
        Whatever ``model.align`` returns for the transcription.
    """
    logging.info(f"Generating transcription for file - {file_path}")

    transcription = model.transcribe(file_path, task="transcribe", language=language)
    # Second pass: align the transcription with the same file, using the same language.
    return model.align(file_path, transcription, language=language)


def generate_whisper_transcription(file_name, file_path, output):
    """Flatten a transcription result into a DataFrame with one row per word.

    Args:
        file_name: Value stored in the ``file_name`` column of every row.
        file_path: Path of the transcribed file; only used in the log message.
        output: Object with a ``segments`` iterable. Each segment must expose ``text``,
            ``start``, ``end`` and an iterable ``words``; each word must expose
            ``word``, ``start`` and ``end``.

    Returns:
        pandas.DataFrame with the columns ``file_name``, ``segment_id``,
        ``segment_text``, ``segment_start``, ``segment_end``, ``word``, ``word_start``
        and ``word_end``. ``segment_id`` is the zero-based position of the segment in
        ``output.segments``. The columns are present even when there are no words.
    """
    logging.info(f"Organizing transcription for file - {file_path}")

    columns = [
        "file_name",
        "segment_id",
        "segment_text",
        "segment_start",
        "segment_end",
        "word",
        "word_start",
        "word_end",
    ]

    rows = [
        {
            "file_name": file_name,
            "segment_id": segment_id,
            "segment_text": segment.text.strip(),
            "segment_start": segment.start,
            "segment_end": segment.end,
            "word": word.word.strip(),
            "word_start": word.start,
            "word_end": word.end,
        }
        for segment_id, segment in enumerate(output.segments)
        for word in segment.words
    ]

    # Passing the column list explicitly keeps the schema stable for an empty result,
    # so downstream column lookups do not raise KeyError.
    return pd.DataFrame(rows, columns=columns)


def create_srt(df, video_name, video_dir, padding=0.1):
    """Write an .srt subtitle file for one video from the word-level table.

    Args:
        df: DataFrame containing at least the columns ``file_name``, ``segment_id``,
            ``segment_text``, ``segment_start`` and ``segment_end``.
        video_name: Value of ``file_name`` whose rows are exported. The output file
            name is this name with its extension replaced by ``.srt``.
        video_dir: Directory the .srt file is written into.
        padding: Seconds added after each segment's end and subtracted from its start.
            A start smaller than ``padding`` is left unchanged so it cannot go
            negative. Defaults to 0.1, the value that was previously hard-coded.

    Returns:
        None. The subtitles are written to disk as UTF-8 text.
    """
    # One row per segment: the table repeats the segment columns for every word.
    segments = (
        df.loc[
            df["file_name"] == video_name,
            ["segment_id", "segment_text", "segment_start", "segment_end"],
        ]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    subtitles = []
    # Subtitles are renumbered from 1 in row order; the stored segment_id is not used.
    for position, seg in enumerate(segments.itertuples(index=False), start=1):
        start = seg.segment_start
        padded_start = start if start < padding else start - padding
        subtitles.append(
            srt.Subtitle(
                index=position,
                start=timedelta(seconds=padded_start),
                end=timedelta(seconds=seg.segment_end + padding),
                content=f"{seg.segment_text}",
            )
        )

    srt_path = os.path.join(video_dir, os.path.splitext(video_name)[0] + ".srt")
    # Explicit UTF-8: the platform default encoding can fail on non-ASCII subtitle text.
    with open(srt_path, "w", encoding="utf-8") as f:
        f.write(srt.compose(subtitles))

In [3]:
model_name_openai = "medium.en"
model = get_whisper_model(model_name_openai)
# get_whisper_model returns None when loading fails; stop here with a clear message
# instead of failing later with an AttributeError on None.
if model is None:
    raise RuntimeError(f"Could not load Whisper model '{model_name_openai}'")

In [4]:
# Transcribe a single clip and flatten the result into a word-level table.
file_name = 'video.avi'
video_dir = "../../output/video_temp/StarTalk_FlyingVehicles_780_811/pyavi/"
video_path = os.path.join(video_dir, file_name)

output = get_whisper_result(video_path, model)
df = generate_whisper_transcription(file_name, video_path, output)

Transcribe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 31.06/31.06 [00:08<00:00,  3.56sec/s]
Align: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 31.06/31.06 [00:00<00:00, 88.23sec/s]


In [5]:
# Display the word-level table assigned to `df` by generate_whisper_transcription.
df

Unnamed: 0,file_name,segment_id,segment_text,segment_start,segment_end,word,word_start,word_end
0,video.avi,0,And what's the one that's a mile high?,0.46,2.88,And,0.46,0.86
1,video.avi,0,And what's the one that's a mile high?,0.46,2.88,what's,0.86,1.94
2,video.avi,0,And what's the one that's a mile high?,0.46,2.88,the,1.94,2.04
3,video.avi,0,And what's the one that's a mile high?,0.46,2.88,one,2.04,2.20
4,video.avi,0,And what's the one that's a mile high?,0.46,2.88,that's,2.20,2.48
...,...,...,...,...,...,...,...,...
105,video.avi,18,So let's get into some Q&A.,29.60,30.96,get,29.96,30.08
106,video.avi,18,So let's get into some Q&A.,29.60,30.96,into,30.08,30.30
107,video.avi,18,So let's get into some Q&A.,29.60,30.96,some,30.30,30.48
108,video.avi,18,So let's get into some Q&A.,29.60,30.96,Q,30.48,30.66


In [13]:
# Print the row count, then display rows 70-119 selected by position.
print(len(df))
df.iloc[70:120]

220


Unnamed: 0,file_name,segment_id,segment_text,segment_start,segment_end,word,word_start,word_end
70,StarTalk_Consciousness_1075_1175.mp4,13,Some of this is a matter of intelligence as op...,26.02,34.7,matter,28.36,28.58
71,StarTalk_Consciousness_1075_1175.mp4,13,Some of this is a matter of intelligence as op...,26.02,34.7,of,28.58,28.92
72,StarTalk_Consciousness_1075_1175.mp4,13,Some of this is a matter of intelligence as op...,26.02,34.7,intelligence,29.06,29.94
73,StarTalk_Consciousness_1075_1175.mp4,13,Some of this is a matter of intelligence as op...,26.02,34.7,as,29.94,29.98
74,StarTalk_Consciousness_1075_1175.mp4,13,Some of this is a matter of intelligence as op...,26.02,34.7,opposed,29.98,30.92
75,StarTalk_Consciousness_1075_1175.mp4,13,Some of this is a matter of intelligence as op...,26.02,34.7,to,31.0,31.82
76,StarTalk_Consciousness_1075_1175.mp4,13,Some of this is a matter of intelligence as op...,26.02,34.7,consciousness.,31.82,34.7
77,StarTalk_Consciousness_1075_1175.mp4,14,"Yes, it could be.",34.7,35.4,"Yes,",34.7,34.9
78,StarTalk_Consciousness_1075_1175.mp4,14,"Yes, it could be.",34.7,35.4,it,34.9,34.96
79,StarTalk_Consciousness_1075_1175.mp4,14,"Yes, it could be.",34.7,35.4,could,34.96,35.12


In [14]:
# Export the current contents of `df` to an Excel workbook without the index column.
# NOTE(review): the workbook name is hard-coded; confirm it matches the clip(s) held in `df`.
df.to_excel("StarTalk_Sleep_382_450.xlsx", index=False)

In [15]:
# Names of the regular files directly inside video_dir whose extension is exactly ".mp4",
# in sorted order; the last line displays the list.
video_files = sorted(
    name
    for name in os.listdir(video_dir)
    if os.path.splitext(name)[1] == ".mp4"
    and os.path.isfile(os.path.join(video_dir, name))
)
video_files

['StarTalk_Consciousness_1075_1175.mp4',
 'StarTalk_Consciousness_1799_1887.mp4',
 'StarTalk_Consciousness_2190_2254.mp4',
 'StarTalk_Consciousness_2254_2314.mp4',
 'StarTalk_Consciousness_2314_2387.mp4',
 'StarTalk_Consciousness_56_180.mp4',
 'StarTalk_Consciousness_683_784.mp4',
 'StarTalk_Cosmic_1050_1130.mp4',
 'StarTalk_Cosmic_1135_1200.mp4',
 'StarTalk_Cosmic_1350_1442.mp4',
 'StarTalk_Cosmic_1550_1620.mp4',
 'StarTalk_Cosmic_1820_1900.mp4',
 'StarTalk_Cosmic_200_290.mp4',
 'StarTalk_Cosmic_2225_2300.mp4',
 'StarTalk_Cosmic_2600_2683.mp4',
 'StarTalk_Cosmic_440_532.mp4',
 'StarTalk_Cosmic_600_700.mp4',
 'StarTalk_Cosmic_780_850.mp4',
 'StarTalk_Farming_0_98.mp4',
 'StarTalk_Farming_1059_1180.mp4',
 'StarTalk_Farming_1700_1800.mp4',
 'StarTalk_Farming_2405_2500.mp4',
 'StarTalk_Farming_2550_2645.mp4',
 'StarTalk_Farming_307_387.mp4',
 'StarTalk_Mars_1026_1086.mp4',
 'StarTalk_Mars_1109_1175.mp4',
 'StarTalk_Mars_1345_1426.mp4',
 'StarTalk_Mars_1430_1500.mp4',
 'StarTalk_Mars_1680_

In [16]:
model_name_openai = "medium.en"
video_dir = "../../Dataset/Set_2/"

# Regular files directly inside video_dir whose extension is exactly ".mp4", sorted by name.
video_files = sorted(
    f
    for f in os.listdir(video_dir)
    if os.path.isfile(os.path.join(video_dir, f)) and os.path.splitext(f)[1] == ".mp4"
)
# pd.concat raises an opaque error on an empty list, so report the real cause up front.
if not video_files:
    raise FileNotFoundError(f"No .mp4 files found in '{video_dir}'")

model = get_whisper_model(model_name_openai)
# get_whisper_model returns None on failure; stop before the per-file loop.
if model is None:
    raise RuntimeError(f"Could not load Whisper model '{model_name_openai}'")

df_list = []
for f in video_files:
    video_path = os.path.join(video_dir, f)
    output = get_whisper_result(video_path, model)
    df_list.append(generate_whisper_transcription(f, video_path, output))

# ignore_index gives the combined table one continuous index instead of repeating
# each per-file index.
final_df = pd.concat(df_list, ignore_index=True)
final_df.to_excel(f"Transcriptions{datetime.now().strftime('_%Y%m%d_%H%M%S')}.xlsx", index=False)

Transcribe: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 100.03/100.03 [00:08<00:00, 11.87sec/s]
Align: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 100.03/100.03 [00:00<00:00, 154.70sec/s]
  output = model.align(file_path, output, language="en")
Transcribe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 88.03/88.03 [00:08<00:00, 10.21sec/s]
Align: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 88.03/88.03 [00:00<00:00, 177.54sec/s]
  output = model.align(file_path, output, language="en")
Transcribe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 64.02/64.02 [00:05<00:00, 10.83sec/s]
Align: 100%|████████████████████████████████████████████████████████████████████████████████████

In [None]:
# Display the combined word-level table held in `final_df`.
final_df

In [5]:
# Load word-level transcriptions from an Excel workbook into `final_df` (replacing any value
# already bound to that name) and display them.
# NOTE(review): the timestamped file name presumably comes from an earlier run of the export
# cell — confirm the workbook exists before running.
final_df = pd.read_excel("Transcriptions_20240204_122057.xlsx")
final_df

Unnamed: 0,file_name,segment_id,segment_text,segment_start,segment_end,word,word_start,word_end
0,StarTalk_Consciousness_1075_1175.mp4,0,did consciousness emerge?,0.00,1.32,did,0.00,0.14
1,StarTalk_Consciousness_1075_1175.mp4,0,did consciousness emerge?,0.00,1.32,consciousness,0.44,0.80
2,StarTalk_Consciousness_1075_1175.mp4,0,did consciousness emerge?,0.00,1.32,emerge?,0.80,1.32
3,StarTalk_Consciousness_1075_1175.mp4,1,"And the other question is,",1.74,3.38,And,1.74,2.32
4,StarTalk_Consciousness_1075_1175.mp4,1,"And the other question is,",1.74,3.38,the,2.34,2.50
...,...,...,...,...,...,...,...,...
9052,StarTalk_Questions_831_5_924.mp4,37,"Yeah, there's something called subspace where ...",88.60,91.62,they,90.04,90.22
9053,StarTalk_Questions_831_5_924.mp4,37,"Yeah, there's something called subspace where ...",88.60,91.62,can,90.22,90.36
9054,StarTalk_Questions_831_5_924.mp4,37,"Yeah, there's something called subspace where ...",88.60,91.62,communicate,90.48,90.70
9055,StarTalk_Questions_831_5_924.mp4,37,"Yeah, there's something called subspace where ...",88.60,91.62,basically,90.70,91.14


In [8]:
# Call create_srt once for every distinct file_name in final_df, writing each .srt file
# into the directory given as the third argument.
# NOTE(review): the output directory is a literal here; confirm it is the directory the
# videos were read from.
file_list = final_df["file_name"].unique()
for item in file_list:
    create_srt(final_df, item, "../../Dataset/Set_2/")