In [None]:
from pprint import pprint

import ipywidgets as widgets
import torch
import whisper
from IPython.display import Audio, clear_output, display

from unified_desktop import RESOURCES_DIR
from unified_desktop.core.utils.io_utils import get_matching_files_in_dir
from unified_desktop.pipelines import UDSpeechEmotionRecognizer, UDSpeechRecognizer

# One torch.device per CUDA device that PyTorch reports; an empty list when the
# reported device count is zero.
CUDA_OPTIONS = [
    torch.device(f"cuda:{gpu_idx}") for gpu_idx in range(torch.cuda.device_count())
]

# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Clear whatever this cell has written to its output area so far
# (presumably import-time log noise; TODO confirm).
clear_output()

## Automatic Speech Recognition (ASR)

In [None]:
def update_asr_obj(change):
    """Rebuild the global ``asrObj`` from the current dropdown selections.

    Intended as an ipywidgets observer callback for ``model_dropdown`` and
    ``device_asr_dropdown``. The output area is cleared and both dropdowns are
    redrawn before the new recognizer is constructed, and a confirmation line
    is printed once construction has finished.

    Args:
        change: Change notification supplied by the widget. It is not read;
            the current values are taken from the dropdowns directly.
    """
    global asrObj

    # Wipe previous output and put the selectors back before the (re)load starts.
    clear_output()
    display(model_dropdown, device_asr_dropdown)

    model_name = model_dropdown.value
    target_device = device_asr_dropdown.value
    asrObj = UDSpeechRecognizer(name=model_name, device=target_device)

    print(
        f"Loaded model: {model_dropdown.value} on device: {device_asr_dropdown.value}"
    )


# Selector for the Whisper model name, pre-set to "tiny.en"
model_dropdown = widgets.Dropdown(
    options=whisper.available_models(),
    value="tiny.en",
    description="ModelName:",
)
# Selector for the target device: "cpu" first, followed by every CUDA_OPTIONS entry
device_asr_dropdown = widgets.Dropdown(
    options=["cpu", *CUDA_OPTIONS],
    value="cpu",
    description="Device:",
)

# Rebuild the recognizer whenever either selection changes
device_asr_dropdown.observe(update_asr_obj, names="value")
model_dropdown.observe(update_asr_obj, names="value")

# Show the selectors, then build the recognizer for the initial selection
display(model_dropdown, device_asr_dropdown)
asrObj = UDSpeechRecognizer(
    name=model_dropdown.value,
    device=device_asr_dropdown.value,
)

In [None]:
# Use fp16 only when the selected device is one of the CUDA entries, else fp32
fp16 = device_asr_dropdown.value in CUDA_OPTIONS

# Sample recording that will be transcribed
audio_file = (
    RESOURCES_DIR
    / "call-center-sample-en_US"
    / "en_US_7a4f56d7-9aca-4ed5-96b9-9c9c36b8a3ac.wav"
)

# Embed a player for the recording in the cell output
audio = Audio(filename=audio_file)
display(audio)

# Run the recognizer on the recording and pretty-print what it returns
transcribed_text = asrObj(audio_file, verbose=True, fp16=fp16)
pprint(transcribed_text)

## Speech Emotion Recognition (SER)

In [None]:
def update_ser_obj(change):
    """Rebuild the global ``serObj`` on the device currently selected.

    Intended as an ipywidgets observer callback for ``device_ser_dropdown``.
    The output area is cleared and the dropdown is redrawn before the new
    recognizer is constructed, and a confirmation line is printed afterwards.

    Args:
        change: Change notification supplied by the widget. It is not read;
            the current value is taken from the dropdown directly.
    """
    global serObj

    # Wipe previous output and put the selector back before the (re)load starts.
    clear_output()
    display(device_ser_dropdown)

    target_device = device_ser_dropdown.value
    serObj = UDSpeechEmotionRecognizer(device=target_device)

    print(f"Loaded SER model on device: {device_ser_dropdown.value}")


# Selector for the SER device: "cpu" first, followed by every CUDA_OPTIONS entry
device_ser_dropdown = widgets.Dropdown(
    options=["cpu", *CUDA_OPTIONS],
    value="cpu",
    description="Device:",
)

# Rebuild the recognizer whenever a different device is picked
device_ser_dropdown.observe(update_ser_obj, names="value")

# Build the recognizer for the initial selection, clear anything written to the
# output while doing so, and only then show the selector
serObj = UDSpeechEmotionRecognizer(
    device=device_ser_dropdown.value,
)
clear_output()
display(device_ser_dropdown)

In [None]:
# Run the emotion recognizer on every .wav sample, keyed by the file it came from.
# The output area is cleared after each file so only the final report stays visible.
emotion_pred_per_audio = {}
for audio_file in get_matching_files_in_dir(
    RESOURCES_DIR / "emotions-sample-en",
    "*.wav",
):
    emotion_pred = serObj(audio_file)
    emotion_pred_per_audio[audio_file] = emotion_pred
    clear_output()

# Report: an audio player for each sample followed by its predicted label and score
for audio_file in emotion_pred_per_audio:
    emotion_pred = emotion_pred_per_audio[audio_file]
    audio = Audio(filename=audio_file)
    display(audio)
    print(f"Emotion: {emotion_pred.label}, Score: {emotion_pred.score:.3f}")