In [None]:
from pprint import pprint

import ipywidgets as widgets
import torch
import whisper
from IPython.display import clear_output, display

from unified_desktop import RESOURCES_DIR
from unified_desktop.pipelines import UDSpeechRecognizer

%load_ext autoreload
%autoreload 2

## Automatic Speech Recognition (ASR)

In [None]:
def update_asr_obj(change):
    """Rebuild the global recognizer from the current dropdown selections.

    Registered as the observer callback for the model and device dropdowns.
    On each call it clears the cell output, shows both dropdowns again,
    replaces the global ``asrObj`` with a new ``UDSpeechRecognizer`` built
    from the selected model name and device, and prints a confirmation line.

    Args:
        change: Change notification supplied by the widget observer
            mechanism. It is not inspected; both selections are read
            directly from the dropdown widgets.
    """
    global asrObj

    # Clearing the output also removes the widgets, so re-display them
    # right away before the (potentially slow) model load.
    clear_output()
    display(model_dropdown, device_dropdown)

    recognizer_kwargs = {
        "name": model_dropdown.value,
        "device": device_dropdown.value,
    }
    asrObj = UDSpeechRecognizer(**recognizer_kwargs)

    print(f"Loaded model: {model_dropdown.value} on device: {device_dropdown.value}")


# Model selector: lists every name returned by whisper.available_models(),
# with "tiny.en" preselected.
model_dropdown = widgets.Dropdown(
    description="ModelName:",
    options=whisper.available_models(),
    value="tiny.en",
)

# One torch.device per CUDA device reported by torch (empty list when there
# are none). Kept as a module-level list because the transcription cell
# checks membership in it.
cuda_options = [
    torch.device("cuda", gpu_index)
    for gpu_index in range(torch.cuda.device_count())
]

# Device selector: "cpu" is always offered and preselected, followed by any
# CUDA devices found above.
device_dropdown = widgets.Dropdown(
    description="Device:",
    options=["cpu", *cuda_options],
    value="cpu",
)

# Reload the recognizer whenever either selection changes.
for dropdown in (model_dropdown, device_dropdown):
    dropdown.observe(update_asr_obj, names="value")

# Show the selectors and build the initial recognizer from their defaults.
display(model_dropdown, device_dropdown)
asrObj = UDSpeechRecognizer(name=model_dropdown.value, device=device_dropdown.value)

In [None]:
# Locate the sample recording to transcribe, under RESOURCES_DIR.
sample_dir = RESOURCES_DIR / "call-center-sample-en_US"
audio_file = sample_dir / "en_US_7a4f56d7-9aca-4ed5-96b9-9c9c36b8a3ac.wav"

# Request fp16 only when the selected device is one of the CUDA entries;
# the "cpu" choice is not in cuda_options, so it falls back to fp32.
fp16 = device_dropdown.value in cuda_options

# Run the currently loaded recognizer on the file and pretty-print its result.
transcribed_text = asrObj(audio_file, verbose=True, fp16=fp16)
pprint(transcribed_text)