### Imports

In [None]:
import whisperx
import pandas as pd
import torch
import json
import os
import time
import datetime
import gc
import py3nvml.py3nvml as nvml
import logging
from threading import Thread

### Variables (adjustable)

In [None]:
# --- Hardware / inference settings ---
device = "cuda"  # where inference runs: "cuda" for the GPU, "cpu" for the CPU
device_index = 0  # which GPU to use on a machine with several GPUs
batch_size = 16  # bigger batches are faster but need more GPU memory
compute_type = "float32"  # numeric precision handed to the model loader (e.g. "float16", "float32", "int8")
model_version = "large-v3"  # options: https://github.com/beeldengeluid/dane-whisper-asr-worker?tab=readme-ov-file#model-options

# --- Input / output locations (absolute paths) ---
audios_path = "/opt/app-root/src/nbest/bn_nl_segments/"  # folder with the audio files to transcribe
audio_file = "/opt/app-root/src/nbest/bn-nl/nbest-eval-2008-bn-nl-002.wav"  # single file for experimenting/testing
# Folder for the transcriptions + log, named after the precision and the model
out_path = f"/opt/app-root/src/results/whisperx/{compute_type}/{model_version}/"

# --- GPU measurement ---
interval = 0.5  # time between two GPU usage samples (in s)

# --- SECRET (for running diarization) ---
HF_TOKEN = "YOUR_HF_TOKEN"

### Threading function (to measure GPU usage)

In [None]:
class MyThread(Thread):
    """Thread that runs a callable and keeps hold of its return value.

    ``func`` is called with the positional arguments unpacked from
    ``params`` when the thread runs. Whatever it returns is stored in
    ``result`` (``None`` until the call has finished) and can be read
    through ``get_result()``.
    """

    def __init__(self, func, params):
        super().__init__()
        self.func, self.params = func, params
        self.result = None

    def run(self):
        # Executed in the new thread once start() has been called
        outcome = self.func(*self.params)
        self.result = outcome

    def get_result(self):
        """Return what ``func`` returned, or ``None`` if it has not finished."""
        return self.result

### Other setup (ffmpeg, changing alignment model, function for output formatting)

In [None]:
# NOTE: this cell uses IPython/Jupyter-only syntax: `!` runs a shell command
# and `%env` reads or sets an environment variable of the kernel process.

# # Uncomment the 2 lines below if you want to download a static FFmpeg build
# !curl https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz -o ffmpeg.tar.xz \
#  && tar -xf ffmpeg.tar.xz && rm ffmpeg.tar.xz

# Add the build to PATH
# Look below the current working directory for the extracted build folder
# (name matched case-insensitively against ffmpeg-*-static)
ffmdir = !find . -iname ffmpeg-*-static
# Current value of PATH
path = %env PATH
# Append the first folder that was found.
# NOTE(review): ffmdir[0] presumably raises IndexError when nothing matched,
# i.e. when the static build has not been downloaded yet; verify before running
path = path + ':' + ffmdir[0]
# Store the extended PATH back into the environment
%env PATH $path
print('')
# Show which ffmpeg executable is found on the updated PATH
!which ffmpeg
print('Done!')

In [None]:
# The alignment model originally registered for Dutch lacks certain characters
# that occur in Dutch text (accented vowels, for example), so another
# checkpoint is registered for the "nl" language code instead
whisperx.alignment.DEFAULT_ALIGN_MODELS_HF["nl"] = "jonatasgrosman/wav2vec2-xls-r-1b-dutch"

In [None]:
def get_speaker(word, seg_spk):
    """Return the speaker stored on ``word``, or ``seg_spk`` if it has none.

    ``word`` is checked for a ``"speaker"`` key; when the key is present its
    value is returned as-is, otherwise the fallback label ``seg_spk`` is used.
    """
    return word["speaker"] if "speaker" in word else seg_spk

### Dataset benchmarking

In [None]:
### SETTING UP THE LOGGER
# Local import: an Event lets the GPU sampler be stopped immediately instead
# of sleeping out its full interval (which used to inflate the stage timings)
from threading import Event

# The log file is created inside out_path, so the folder has to exist first
os.makedirs(out_path, exist_ok=True)
logging.basicConfig(
    filename=out_path + "log.txt",
    format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
    level=logging.INFO,
    force=True,
)
logger = logging.getLogger(__name__)
# Remove handlers left over from an earlier run of this cell; otherwise every
# re-run adds one more console handler and each message is printed repeatedly
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
consoleHandler = logging.StreamHandler()
logger.addHandler(consoleHandler)

# Per-file statistics, collected so they can be saved as a table afterwards
file_names = []
# info about transcription
t_gpu_mem = []
t_gpu_power = []
t_times = []
# info about alignment
a_gpu_mem = []
a_gpu_power = []
a_times = []
# info about diarization
d_gpu_mem = []
d_gpu_power = []
d_times = []


class _GpuStageMonitor:
    """Time a ``with`` body while sampling GPU memory and power usage.

    On entry a background thread starts sampling the GPU behind ``handle``
    every ``interval`` seconds. On exit (also when the body raised) the
    elapsed time is taken first, then the sampler is stopped and joined, so
    waiting for the sampler is not counted as processing time.
    """

    def __init__(self):
        self.elapsed = None  # duration of the with-body (s)
        self.max_memory_usage = None  # highest sampled memory usage (MiB)
        self.max_power_usage = None  # highest sampled power usage (W)
        self._stop_event = Event()
        self._samples = {"gpu_memory_usage": [], "gpu_power_usage": []}
        self._sampler = MyThread(self._sample, params=())
        self._started_at = None

    def _sample(self):
        # Always records at least one sample before looking at the stop signal
        while True:
            self._samples["gpu_memory_usage"].append(
                nvml.nvmlDeviceGetMemoryInfo(handle).used >> 20
            )
            self._samples["gpu_power_usage"].append(
                nvml.nvmlDeviceGetPowerUsage(handle) / 1000
            )
            # wait() returns True as soon as the event is set, False on timeout
            if self._stop_event.wait(interval):
                break
        return self._samples

    def __enter__(self):
        self._sampler.start()
        self._started_at = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.elapsed = time.time() - self._started_at
        self._stop_event.set()
        self._sampler.join()
        self.max_memory_usage = max(self._samples["gpu_memory_usage"])
        self.max_power_usage = max(self._samples["gpu_power_usage"])
        return False  # never swallow an exception raised by the stage


def _log_stage(label, monitor):
    """Log the duration and the peak GPU usage that ``monitor`` recorded."""
    logger.info("%s: %s s", label, monitor.elapsed)
    logger.info(
        "Maximum GPU memory usage: %dMiB / %dMiB (%.2f%%)",
        monitor.max_memory_usage,
        gpu_memory_limit,
        (monitor.max_memory_usage / gpu_memory_limit) * 100,
    )
    logger.info(
        "Maximum GPU power usage: %dW / %dW (%.2f%%)",
        monitor.max_power_usage,
        gpu_power_limit,
        (monitor.max_power_usage / gpu_power_limit) * 100,
    )
    logger.info("--------------------------------")


def _format_result(result):
    """Convert the diarized output into the structure written to JSON."""
    segments_to_add = []
    for segment in result["segments"]:
        segment_speaker = segment.get("speaker", "N/A")
        words_to_add = []
        for word in segment["words"]:
            # There can be issues with characters not present in the vocab of
            # the alignment model: such words lack some keys and are skipped
            try:
                words_to_add.append(
                    {
                        # The word text can start with a whitespace
                        "text": word["word"].strip(),
                        "start": word["start"],
                        "end": word["end"],
                        "confidence": word.get("score", 0),
                        "speaker": get_speaker(word, segment_speaker),
                    }
                )
            except KeyError:
                continue
        segments_to_add.append(
            {
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"].strip(),
                "speaker": segment_speaker,
                "words": words_to_add,
            }
        )
    return {"segments": segments_to_add}


### LOADING THE MODEL
logger.info(
    "================================START OF EVALUATION================================"
)
logger.info("batch_size: " + str(batch_size))
logger.info("=======================")
start = time.time()
model = whisperx.load_model(
    model_version,
    device=device,
    device_index=device_index,
    compute_type=compute_type,
    asr_options={"suppress_numerals": True},
)
load_time = time.time() - start
logger.info(f"Time to load the model: {load_time} s")
logger.info("================================")
logger.info(
    "Measuring maximum GPU memory usage on GPU device."
    " Make sure to not have additional processes running on the same GPU."
)

# Initialization for measuring GPU usage
nvml.nvmlInit()
handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
gpu_name = nvml.nvmlDeviceGetName(handle)
gpu_memory_limit = nvml.nvmlDeviceGetMemoryInfo(handle).total >> 20  # bytes -> MiB
gpu_power_limit = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0

# Sorted so that the processing order (and the row order of the statistics)
# is the same on every run
for file in sorted(os.listdir(audios_path)):
    audio_path = os.path.join(audios_path, file)
    # Sub-folders cannot be transcribed; skip them
    if not os.path.isfile(audio_path):
        continue
    # Name without extension, whatever the length of the extension
    file_stem = os.path.splitext(file)[0]

    # Measuring time spent processing file (transcription-diarization)
    st_file_time = time.time()

    # 1. Transcribe with original Whisper (batched)
    with _GpuStageMonitor() as t_monitor:
        audio = whisperx.load_audio(audio_path)
        result = model.transcribe(audio, batch_size=batch_size)
    _log_stage("Time to transcribe", t_monitor)

    # 2. Align whisper output (word-level timestamps)
    with _GpuStageMonitor() as a_monitor:
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"], device=device
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            device,
            return_char_alignments=False,
        )
    _log_stage(
        "Time to align (generate word-level timestamps using wav2vec2)", a_monitor
    )

    # 3. Assign speaker labels
    with _GpuStageMonitor() as d_monitor:
        diarize_model = whisperx.DiarizationPipeline(
            use_auth_token=HF_TOKEN, device=device
        )
        diarize_segments = diarize_model(audio)
        result = whisperx.assign_word_speakers(diarize_segments, result)
    _log_stage("Time to diarize", d_monitor)

    logger.info(
        f"Total time spent transcribing->diarization for {file_stem}: {time.time() - st_file_time} s"
    )
    logger.info("================================")

    # Record the statistics only once all three stages succeeded, so the
    # lists always have the same length
    file_names.append(file_stem)
    t_times.append(t_monitor.elapsed)
    t_gpu_mem.append(t_monitor.max_memory_usage)
    t_gpu_power.append(t_monitor.max_power_usage)
    a_times.append(a_monitor.elapsed)
    a_gpu_mem.append(a_monitor.max_memory_usage)
    a_gpu_power.append(a_monitor.max_power_usage)
    d_times.append(d_monitor.elapsed)
    d_gpu_mem.append(d_monitor.max_memory_usage)
    d_gpu_power.append(d_monitor.max_power_usage)

    # Formatting the transcription and saving it to a JSON file
    result = _format_result(result)
    with open(
        os.path.join(out_path, file_stem + ".json"), "w", encoding="utf-8"
    ) as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    # Release the per-file alignment and diarization models before the next
    # file, so they do not count towards its GPU memory measurements. The
    # references are dropped and collected first; only then can the cached
    # GPU memory actually be handed back.
    del model_a, diarize_model
    gc.collect()
    torch.cuda.empty_cache()

logger.info(
    "Total time spent evaluating: "
    + str(datetime.timedelta(seconds=time.time() - start))
)
logger.info(
    "================================END OF EVALUATION================================"
)

logging.shutdown()
nvml.nvmlShutdown()

In [None]:
# One column per collected statistic, one row per processed file
stats_columns = {
    "TRANSCRIPTION Time (s)": t_times,
    "T Max GPU mem usage (MiB)": t_gpu_mem,
    "T Max GPU power usage (W)": t_gpu_power,
    "ALIGNMENT Time (s)": a_times,
    "A Max GPU mem usage (MiB)": a_gpu_mem,
    "A Max GPU power usage (W)": a_gpu_power,
    "DIARIZATION Time (s)": d_times,
    "D Max GPU mem usage (MiB)": d_gpu_mem,
    "D Max GPU power usage (W)": d_gpu_power,
}
df = pd.DataFrame(data=stats_columns, index=file_names)
# Saved as a CSV file in the output folder
df.to_csv(out_path + "info.csv")