# Audio Representation

## Imports

In [None]:
import numpy as np
import torch
import torchaudio as ta
import librosa
from matplotlib import pyplot as plt

import IPython
from IPython.display import display
import ipywidgets as widgets

## Audio File Settings

In [None]:
# Default audio source: file path, sample rate and the excerpt (start/end in
# seconds) that the rest of the notebook analyses.
audio_file_path = "../../Data/Audio/Gutenberg/Night_and_Day_by_Virginia_Woolf_48khz.wav"
audio_sample_rate = 48000
audio_range_sec = [ 10.0, 14.0 ]

# One input widget per setting. The range bounds are floats, so they use
# FloatText: an IntText field would truncate them to whole seconds and make
# fractional start/end times impossible to enter.
audio_file_gui = widgets.Text(value=audio_file_path, description="Audio File:", style={'description_width': 'initial'})
audio_sample_rate_gui = widgets.IntText(value=audio_sample_rate, description="Audio Sample Rate:", style={'description_width': 'initial'})
audio_range_sec_gui_1 = widgets.FloatText(value=audio_range_sec[0], description="Audio Range Start (sec):", style={'description_width': 'initial'})
audio_range_sec_gui_2 = widgets.FloatText(value=audio_range_sec[1], description="Audio Range End (sec):", style={'description_width': 'initial'})

display(audio_file_gui)
display(audio_sample_rate_gui)
display(audio_range_sec_gui_1)
display(audio_range_sec_gui_2)

In [None]:
# Copy the values currently entered in the widgets back into the settings.
audio_file_path = audio_file_gui.value
audio_sample_rate = audio_sample_rate_gui.value

# Update the existing list in place: element 0 is the start, element 1 the end.
audio_range_sec[:] = (
    audio_range_sec_gui_1.value,
    audio_range_sec_gui_2.value,
)

## Load Audio File

In [None]:
# Requested excerpt, in seconds. Reject a negative start or an empty/reversed
# range here: otherwise the result is an empty (or wrapped-around) waveform
# and the failure only surfaces later in the plotting / FFT cells.
audio_range_start_sec, audio_range_end_sec = audio_range_sec
if audio_range_start_sec < 0 or audio_range_end_sec <= audio_range_start_sec:
    raise ValueError(
        f"Invalid audio range {audio_range_sec}: expected 0 <= start < end (seconds)"
    )

# Decode only the requested excerpt instead of loading (and possibly
# resampling) the whole recording and slicing it afterwards.
audio_waveform, _ = librosa.load(
    audio_file_path,
    sr=audio_sample_rate,
    offset=audio_range_start_sec,
    duration=audio_range_end_sec - audio_range_start_sec,
)

# Later cells index the waveform as [channel, sample], so give a 1-D result
# a leading channel axis.
if audio_waveform.ndim == 1:
    audio_waveform = np.expand_dims(audio_waveform, 0)

## Play Audio

In [None]:
# Last expression of the cell, so the notebook renders the audio player for the loaded excerpt.
IPython.display.Audio(data=audio_waveform, rate=audio_sample_rate)

## Plot Audio Waveform

In [None]:
# Plot the samples stored at index 0 of the first axis of the loaded excerpt.
plt.title('Waveform')
plt.plot(audio_waveform[0])
# Render explicitly, like the other plotting cells, so the cell does not
# echo the list of Line2D objects returned by plt.plot.
plt.show()

## Audio Buffer

In [None]:
# Default number of samples in the analysis buffer; editable through the widget.
audio_buffer_size = 1024

audio_buffer_size_gui = widgets.IntText(
    value=audio_buffer_size,
    description="Audio Buffer Size:",
    style={'description_width': 'initial'},
)

display(audio_buffer_size_gui)

In [None]:
# Take over the buffer size currently entered in the widget.
audio_buffer_size = audio_buffer_size_gui.value

# The buffer is the first `audio_buffer_size` samples of row 0 of the waveform.
audio_buffer = audio_waveform[0][:audio_buffer_size]

plt.plot(audio_buffer)
plt.title('Audio Buffer')
plt.show()

## Amplitude Envelope (Hann Window)

In [None]:
# Hann window used to taper the audio buffer. Its length is taken from the
# buffer itself rather than from audio_buffer_size: the buffer can be shorter
# than the requested size (short excerpt, or the size was changed without
# re-slicing), and a length mismatch would break the element-wise multiply
# in the next cell.
audio_window = np.hanning(len(audio_buffer))

plt.title('Hanning Window')
plt.plot(audio_window)
plt.show()

## Windowed Audio Buffer

In [None]:
# Taper the buffer by multiplying it element-wise with the window.
audio_buffer_windowed = np.multiply(audio_buffer, audio_window)

plt.plot(audio_buffer_windowed)
plt.title('Audio Buffer Windowed')
plt.show()

## Calculate Audio Spectrum

In [None]:
# Convert the windowed buffer to a tensor. torch.as_tensor accepts both a
# NumPy array and an existing tensor, so this cell can be re-run safely;
# torch.from_numpy would raise a TypeError on the second run because the
# name has already been rebound to a tensor.
audio_buffer_windowed = torch.as_tensor(audio_buffer_windowed)

# Complex spectrum of the windowed buffer.
audio_spectrum = torch.fft.fft(audio_buffer_windowed)

## Plot Audio Spectrum

In [None]:
# Only the first half of the bins is plotted.
audio_spectrum_half = audio_spectrum[:audio_buffer_size // 2]

# One figure per view: magnitude first, then phase.
for _plot_title, _plot_values in (
    ('Audio Spectrum Magnitude', audio_spectrum_half.abs()),
    ('Audio Spectrum Phase', audio_spectrum_half.angle()),
):
    plt.title(_plot_title)
    plt.plot(_plot_values.numpy())
    plt.show()

## Reconstruct Audio Buffer from Spectrum

In [None]:
# Inverse FFT of the spectrum; only the real part of the result is kept.
audio_buffer_rec = torch.fft.ifft(audio_spectrum).real

plt.title('Reconstructed Audio Buffer')
plt.plot(audio_buffer_rec.numpy())
# Render explicitly, like the other plotting cells, so the cell does not
# echo the list of Line2D objects returned by plt.plot.
plt.show()

## Calculate Audio Spectrogram

In [None]:
"""
see: https://pytorch.org/audio/main/generated/torchaudio.transforms.Spectrogram.html
"""

nFFT = 1024

nFFT_gui = widgets.IntText(value=nFFT, description="Spectral Bin Count:", style={'description_width': 'initial'})

display(nFFT_gui)

In [None]:
# Take over the FFT size currently entered in the widget.
nFFT = nFFT_gui.value

# Build the transform first, then apply it to the waveform as a tensor.
spectrogram_transform = ta.transforms.Spectrogram(n_fft=nFFT)
audio_spectrogram = spectrogram_transform(torch.from_numpy(audio_waveform))

## Plot Audio Spectrogram

In [None]:
# Convert the spectrogram to decibels so the data matches the colorbar's dB
# label (the previous log2 scaling did not). The transform was built with
# torchaudio's default settings, which per its documentation yield a power
# spectrogram, hence 10 * log10. Values are floored at 1e-10 first so silent
# bins map to -100 dB instead of -inf.
audio_spectrogram_db = 10.0 * torch.log10(audio_spectrogram.squeeze().clamp(min=1e-10))

plt.figure(figsize=(10, 4))
# origin='lower' puts FFT bin 0 at the bottom of the image.
plt.imshow(audio_spectrogram_db.numpy(), aspect='auto', origin='lower')
plt.title('Audio Spectrogram (dB)')
plt.xlabel('Frame')
plt.ylabel('FFT bins')
plt.colorbar(format='%+2.0f dB')
plt.tight_layout()
plt.show()

## Calculate MEL Spectrogram

In [None]:
# Defaults for the Mel spectrogram: FFT size and number of Mel bands.
nFFT = 1024
nMels = 128

nFFT_gui = widgets.IntText(
    value=nFFT,
    description="Spectral Bin Count:",
    style={'description_width': 'initial'},
)
nMels_gui = widgets.IntText(
    value=nMels,
    description="Mel Band Count:",
    style={'description_width': 'initial'},
)

display(nFFT_gui)
display(nMels_gui)

In [None]:
# Take over the values currently entered in the widgets.
nFFT, nMels = nFFT_gui.value, nMels_gui.value

# Mel spectrogram transform; consecutive frames are half an FFT size apart.
mel_transform = ta.transforms.MelSpectrogram(sample_rate=audio_sample_rate, n_fft=nFFT, hop_length=nFFT // 2, n_mels=nMels)

# Apply the transform to the waveform as a tensor
# (result shape per the original note: [channels, n_mels, time]).
waveform_tensor = torch.from_numpy(audio_waveform)
mel_spec = mel_transform(waveform_tensor)

## Plot Mel Spectrogram

In [None]:
# Convert the Mel spectrogram to decibels so the data matches the colorbar's
# dB label (the previous log2 scaling did not). The transform was built
# without a `power` argument, which per torchaudio's documentation yields a
# power spectrogram, hence 10 * log10. Values are floored at 1e-10 first so
# silent bins map to -100 dB instead of -inf.
mel_spec_db = 10.0 * torch.log10(mel_spec.squeeze().clamp(min=1e-10))

plt.figure(figsize=(10, 4))
# origin='lower' puts Mel bin 0 at the bottom of the image.
plt.imshow(mel_spec_db.numpy(), aspect='auto', origin='lower')
plt.title('Mel Spectrogram (dB)')
plt.xlabel('Frame')
plt.ylabel('Mel bins')
plt.colorbar(format='%+2.0f dB')
plt.tight_layout()
plt.show()