### Imports

In [None]:
from transformers import pipeline, GenerationConfig
import torch
import json
import os
import time
import datetime
import gc
import py3nvml.py3nvml as nvml
import logging
from threading import Thread

### Variables (adjustable)

In [None]:
# GPU selection. `device_index` is the single source of truth: the torch device
# string is derived from it so the GPU the model runs on and the GPU that NVML
# samples cannot drift apart when only one of the two is edited.
# NOTE(review): assumes torch and NVML enumerate GPUs in the same order
# (e.g. no CUDA_VISIBLE_DEVICES remapping) - confirm for your setup.
device_index = 1  # for multiple GPU setup. Indicates which GPU to use
device = f"cuda:{device_index}"
fp16 = True  # load the model in float16 when True, float32 otherwise
batch_size = 5  # batch size passed to the transcription pipeline
model_version = "large-v2"  # suffix appended to "openai/whisper-"
audios_path = "/opt/app-root/src/nbest/bn_nl_segments/"
audio_file = "/opt/app-root/src/nbest/bn-nl/nbest-eval-2008-bn-nl-002.wav"  # file that gets transcribed
out_path = "/opt/app-root/src/results/jax/large-v2/"
# GPU measurement parameter
interval = 0.5  # how often to measure GPU usage (in s)

### Threading helper class (to measure GPU usage in the background)

In [None]:
class MyThread(Thread):
    """Thread that runs a callable and remembers what it returned.

    Attributes:
        func: The callable executed when the thread runs.
        params: Iterable of positional arguments unpacked into ``func``.
        result: Return value of ``func``; ``None`` until ``run`` has finished.
    """

    def __init__(self, func, params):
        super().__init__()
        self.func, self.params = func, params
        self.result = None

    def run(self):
        """Call ``func`` with ``params`` and store its return value."""
        target, arguments = self.func, self.params
        self.result = target(*arguments)

    def get_result(self):
        """Return the stored result (``None`` if ``run`` has not completed)."""
        return self.result

### ffmpeg setup

In [None]:
# # Uncomment the 2 lines below if you want to download a static FFmpeg build
# !curl https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz -o ffmpeg.tar.xz \
#  && tar -xf ffmpeg.tar.xz && rm ffmpeg.tar.xz

# Add the build to PATH
ffmdir = !find . -iname ffmpeg-*-static
path = %env PATH
path = path + ':' + ffmdir[0]
%env PATH $path
print('')
!which ffmpeg
print('Done!')

### Running it all (modify where needed, mostly when changing implementation)

In [None]:
### SETTING UP THE LOGGER
# The log file name encodes the run configuration. The compute type is taken
# from `fp16` so a float32 run is not written to a file labelled float16.
compute_type = "float16" if fp16 else "float32"
logging.basicConfig(filename=f"hf_{model_version}_batch_{batch_size}_ctype_{compute_type}.txt",
                    format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
                    level=logging.INFO,
                    force=True)
logger = logging.getLogger(__name__)
# Echo log records to the console as well. The logger object survives between
# executions of this cell, so reuse a console handler that is already attached
# instead of adding another one (which would print every message repeatedly).
# `type(...) is` is used because FileHandler subclasses StreamHandler.
consoleHandler = next(
    (h for h in logger.handlers if type(h) is logging.StreamHandler),
    None,
)
if consoleHandler is None:
    consoleHandler = logging.StreamHandler()
    logger.addHandler(consoleHandler)

### LOADING THE MODEL
logger.info("================================START OF EVALUATION================================")
# `start` also serves as the reference point for the total evaluation time
# that is logged at the very end of the run.
start = time.time()
# Half precision when `fp16` is set, full precision otherwise
if fp16:
    model_dtype = torch.float16
else:
    model_dtype = torch.float32
pipe = pipeline(
    "automatic-speech-recognition",
    model=f"openai/whisper-{model_version}",
    chunk_length_s=30,
    device=device,
    torch_dtype=model_dtype,
)
load_seconds = time.time() - start
logger.info(f"Time to load the model: {load_seconds} s")
logger.info("================================")
logger.info(
    "Measuring maximum GPU memory usage on GPU device. "
    "Make sure to not have additional processes running on the same GPU."
)
# Initialization for measuring GPU usage
nvml.nvmlInit()
handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
gpu_name = nvml.nvmlDeviceGetName(handle)
# Total memory scaled down by 2**20; it is reported as MiB in the summary log
gpu_memory_limit = nvml.nvmlDeviceGetMemoryInfo(handle).total // (1 << 20)
# Power limit scaled down by 1000; it is reported as W in the summary log
gpu_power_limit = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0

# Samples collected by the background measurement thread
gpu_usage = dict(gpu_memory_usage=[], gpu_power_usage=[])

def _get_gpu_info():
    """Sample GPU memory and power usage until the global ``stop`` flag is set.

    Each round appends one memory reading and one power reading to the
    module-level ``gpu_usage`` dict, then sleeps for ``interval`` seconds.
    The flag is checked only after the sleep, so at least one sample is
    always recorded.

    Returns:
        The module-level ``gpu_usage`` dict holding all collected samples.
    """
    keep_sampling = True
    while keep_sampling:
        used_memory = nvml.nvmlDeviceGetMemoryInfo(handle).used >> 20
        gpu_usage["gpu_memory_usage"].append(used_memory)

        power_draw = nvml.nvmlDeviceGetPowerUsage(handle) / 1000
        gpu_usage["gpu_power_usage"].append(power_draw)

        time.sleep(interval)
        # `stop` is flipped by the main thread once transcription has finished
        keep_sampling = not stop

    return gpu_usage

# Start sampling GPU usage in the background; `_get_gpu_info` keeps running
# until the module-level `stop` flag becomes True.
stop = False
thread = MyThread(_get_gpu_info, params=())
thread.start()

# Measuring time spent transcribing this file
file_start = time.time()
try:
    # we can also return timestamps for the predictions
    prediction = pipe(audio_file, batch_size=batch_size, return_timestamps="word")["chunks"]
    # Take the elapsed time right away: joining the sampler thread below can
    # take up to `interval` seconds and must not count as transcription time.
    transcription_seconds = time.time() - file_start
finally:
    # Stop measuring GPU usage for this file. Done in `finally` so a failing
    # transcription does not leave the sampler thread polling forever.
    stop = True
    thread.join()

# Reported once, using the value captured directly after the pipeline returned
logger.info("Time to transcribe: %s s", transcription_seconds)
max_memory_usage = max(gpu_usage["gpu_memory_usage"])
max_power_usage = max(gpu_usage["gpu_power_usage"])
logger.info(
    "Maximum GPU memory usage: %dMiB / %dMiB (%.2f%%)",
    max_memory_usage,
    gpu_memory_limit,
    (max_memory_usage / gpu_memory_limit) * 100,
)
logger.info(
    "Maximum GPU power usage: %dW / %dW (%.2f%%)",
    max_power_usage,
    gpu_power_limit,
    (max_power_usage / gpu_power_limit) * 100,
)
# print(prediction)
logger.info("===================")
# `start` was taken before the model was loaded, so this covers the whole run
logger.info("Total time spent evaluating: %s", datetime.timedelta(seconds=time.time() - start))
logger.info("================================END OF EVALUATION================================")