# AudioDescription example
This notebook demonstrates how the [LTU-AS model](https://github.com/YuanGongND/ltu) generates audio descriptions for three different categories of audio: music, speech and environmental audio.

All the required audio files are provided in `../sample_data/audio`.

In [40]:
import os 
import tempfile

import librosa
import soundfile as sf
from gradio_client import Client

import IPython.display as ipd

In [4]:
# Directory that holds the example recordings loaded by this notebook.
SAMPLE_DATA_PATH = "../sample_data/audio"
# The model requires 16 kHz audio, so every clip is resampled to this rate.
SAMPLE_RATE = 16_000
# Maximum clip duration in seconds; longer audio is cropped for efficiency.
MAX_AUDIO_LENGTH = 15

In [48]:
# Instruction text passed to the model alongside each audio clip.
PROMPT = (
    "Classify the audio between 'music', 'environmental', 'speech' and 'silence'. "
    "Can you describe it in two sentences?"
)

In [3]:
def cut_audio(x, fs=None, max_length=None):
    """Return a copy of ``x`` cropped to at most ``max_length`` seconds.

    The input is never modified; the result is always a new object, even when
    ``x`` is already short enough.

    Parameters
    ----------
    x : object supporting slicing and ``.copy()`` (e.g. a NumPy array)
        Audio samples; cropping is applied along the first axis.
    fs : int or float, optional
        Sampling rate of ``x`` in Hz. When omitted, the notebook-level
        ``SAMPLE_RATE`` is used instead of relying on whatever global ``fs``
        happens to be defined by a previously executed cell.
    max_length : int or float, optional
        Maximum duration to keep, in seconds. When omitted, the
        notebook-level ``MAX_AUDIO_LENGTH`` is used.

    Returns
    -------
    A copy of ``x`` containing at most ``int(max_length * fs)`` leading samples.
    """
    # Resolve defaults at call time so the function can be defined before the
    # configuration cell is executed.
    if fs is None:
        fs = SAMPLE_RATE
    if max_length is None:
        max_length = MAX_AUDIO_LENGTH

    # Slice indices must be integers, even if a float rate/duration is given.
    max_samples = int(max_length * fs)

    # Slicing past the end is a no-op, so no explicit length check is needed.
    return x[:max_samples].copy()

# Music (jazz)

In [43]:
# Load the jazz example, resampled to the rate the model expects.
audio_path = os.path.join(SAMPLE_DATA_PATH, "jazz.wav")
x, fs = librosa.load(audio_path, sr=SAMPLE_RATE)

# Keep only the leading part of the recording.
x = cut_audio(x)

In [44]:
# Embed an audio player so the (cropped) sample can be auditioned inline.
ipd.Audio(data=x, rate=fs)

In [49]:
# Connect to the hosted Space that serves the model.
client = Client("https://yuangongfdu-ltu-2.hf.space/")

# Write the clip into a throwaway directory instead of a NamedTemporaryFile:
# a NamedTemporaryFile that is still open cannot be reopened by name on
# Windows, which is exactly what `sf.write(f.name, ...)` would have to do.
# The directory (and the WAV file in it) is removed when the block exits.
with tempfile.TemporaryDirectory(dir=".") as tmp_dir:
    wav_path = os.path.join(tmp_dir, "clip.wav")
    sf.write(wav_path, x, fs)

    result = client.predict(
        wav_path,        # audio file to describe
        "",              # second input of the endpoint, left empty here
        PROMPT,          # question/instruction for the model
        "7B (Default)",  # presumably the model variant offered by the Space
        api_name="/predict",
    )

print(result)

Loaded as API: https://yuangongfdu-ltu-2.hf.space/ ✔
Labels: Swing music; Jazz; Music; Blues.


# Music (candombe)

In [51]:
# Load the candombe example, resampled to the rate the model expects.
audio_path = os.path.join(SAMPLE_DATA_PATH, "candombe.flac")
x, fs = librosa.load(audio_path, sr=SAMPLE_RATE)

# Keep only the leading part of the recording.
x = cut_audio(x)

In [52]:
# Embed an audio player so the (cropped) sample can be auditioned inline.
ipd.Audio(data=x, rate=fs)

In [53]:
# Connect to the hosted Space that serves the model.
client = Client("https://yuangongfdu-ltu-2.hf.space/")

# Write the clip into a throwaway directory instead of a NamedTemporaryFile:
# a NamedTemporaryFile that is still open cannot be reopened by name on
# Windows, which is exactly what `sf.write(f.name, ...)` would have to do.
# The directory (and the WAV file in it) is removed when the block exits.
with tempfile.TemporaryDirectory(dir=".") as tmp_dir:
    wav_path = os.path.join(tmp_dir, "clip.wav")
    sf.write(wav_path, x, fs)

    result = client.predict(
        wav_path,        # audio file to describe
        "",              # second input of the endpoint, left empty here
        PROMPT,          # question/instruction for the model
        "7B (Default)",  # presumably the model variant offered by the Space
        api_name="/predict",
    )

print(result)

Loaded as API: https://yuangongfdu-ltu-2.hf.space/ ✔
Labels: Drum; Percussion; Musical instrument; Music


# Environmental

In [54]:
# Load the environmental-sound example, resampled to the rate the model expects.
audio_path = os.path.join(SAMPLE_DATA_PATH, "env.wav")
x, fs = librosa.load(audio_path, sr=SAMPLE_RATE)

# Keep only the leading part of the recording.
# (The former `sf.write("tmp.wav", ...)` was dropped: the file was never read
# again and was left behind in the working directory.)
x = cut_audio(x)

In [55]:
# Embed an audio player so the (cropped) sample can be auditioned inline.
ipd.Audio(data=x, rate=fs)

In [56]:
# Connect to the hosted Space that serves the model.
client = Client("https://yuangongfdu-ltu-2.hf.space/")

# Write the clip into a throwaway directory instead of a NamedTemporaryFile:
# a NamedTemporaryFile that is still open cannot be reopened by name on
# Windows, which is exactly what `sf.write(f.name, ...)` would have to do.
# The directory (and the WAV file in it) is removed when the block exits.
with tempfile.TemporaryDirectory(dir=".") as tmp_dir:
    wav_path = os.path.join(tmp_dir, "clip.wav")
    sf.write(wav_path, x, fs)

    result = client.predict(
        wav_path,        # audio file to describe
        "",              # second input of the endpoint, left empty here
        PROMPT,          # question/instruction for the model
        "7B (Default)",  # presumably the model variant offered by the Space
        api_name="/predict",
    )

print(result)

Loaded as API: https://yuangongfdu-ltu-2.hf.space/ ✔
Labels: Bird chirping,  tweeting; Wind; Vehicle horn, car horn, honking, toot; Traffic noise, roadway noise. Audio caption: A bird is chirping while a vehicle honks and traffic noises are heard.


# Speech

In [57]:
# Load the speech example, resampled to the rate the model expects.
audio_path = os.path.join(SAMPLE_DATA_PATH, "speech.wav")
x, fs = librosa.load(audio_path, sr=SAMPLE_RATE)

# Keep only the leading part of the recording.
# (The former `sf.write("tmp.wav", ...)` was dropped: the file was never read
# again and was left behind in the working directory.)
x = cut_audio(x)

In [58]:
# Embed an audio player so the (cropped) sample can be auditioned inline.
ipd.Audio(data=x, rate=fs)

In [59]:
# Connect to the hosted Space that serves the model.
client = Client("https://yuangongfdu-ltu-2.hf.space/")

# Write the clip into a throwaway directory instead of a NamedTemporaryFile:
# a NamedTemporaryFile that is still open cannot be reopened by name on
# Windows, which is exactly what `sf.write(f.name, ...)` would have to do.
# The directory (and the WAV file in it) is removed when the block exits.
with tempfile.TemporaryDirectory(dir=".") as tmp_dir:
    wav_path = os.path.join(tmp_dir, "clip.wav")
    sf.write(wav_path, x, fs)

    result = client.predict(
        wav_path,        # audio file to describe
        "",              # second input of the endpoint, left empty here
        PROMPT,          # question/instruction for the model
        "7B (Default)",  # presumably the model variant offered by the Space
        api_name="/predict",
    )

print(result)

Loaded as API: https://yuangongfdu-ltu-2.hf.space/ ✔
Labels: Human voice; Speech; Female speech and woman speaking; Speech synthesizer; Speech synthetic; Speech articulation and production; Speech recitation; Vocal music.
