In [None]:
import os
import essentia
import essentia.standard as es
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

# Start from matplotlib's built-in 'default' style sheet, then switch the
# grid on for every axes created afterwards.
plt.style.use('default')
plt.rcParams['axes.grid'] = True

In [None]:
# Helper functions

import numpy as np
import matplotlib.pyplot as plt

def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None):
    """Plot each channel of an audio signal against time, one subplot per channel.

    Parameters
    ----------
    waveform : array-like
        Audio samples, either 1-D ``(frames,)`` or 2-D ``(channels, frames)``.
        A 1-D input is promoted to a single channel.
    sample_rate : float
        Sampling rate in Hz; frame indices are divided by it to get seconds.
    title : str, optional
        Figure-level title.
    xlim, ylim : sequence of two floats, optional
        Limits forwarded to ``set_xlim`` / ``set_ylim`` on every subplot.
        ``None`` (the default) leaves autoscaling in place.
    """
    waveform = np.atleast_2d(waveform)
    num_channels, num_frames = waveform.shape
    time_axis = np.arange(num_frames) / sample_rate

    # squeeze=False always yields a 2-D array of axes, so the single-channel
    # case needs no special handling.
    figure, axes = plt.subplots(
        num_channels, 1, figsize=(8, 4), sharex=True, sharey=True, squeeze=False
    )
    for channel, ax in enumerate(axes[:, 0]):
        ax.plot(time_axis, waveform[channel], linewidth=1)
        ax.grid(True)
        if num_channels > 1:
            ax.set_ylabel(f'Channel {channel + 1}')
        # Compare against None: truth-testing a NumPy array of limits raises
        # "truth value of an array is ambiguous".
        if xlim is not None:
            ax.set_xlim(xlim)
        if ylim is not None:
            ax.set_ylim(ylim)
    # The x axis is shared, so labelling the bottom subplot is enough.
    axes[-1, 0].set_xlabel("Time [s]")
    figure.suptitle(title)
    figure.tight_layout()
    plt.show(block=False)

In [None]:
# Input audio: paths to the model prediction and to the reference target

TARGET = "../audio/target-tcn-1700-updated-stft-48k.wav"
PREDICTION = "../audio/pred-tcn-1700-updated-stft-48k.wav"

In [None]:
# Load the target audio.
# Essentia's AudioLoader outputs stereo samples, i.e. a (frames, 2) array, plus
# the number of channels actually present in the file. reshape(1, -1) on that
# array would interleave left/right samples into one double-length row, so
# transpose to (channels, frames) and keep only the real channels instead.
target_stereo, sample_rate, target_channels, *_ = es.AudioLoader(filename=TARGET)()
target = np.ascontiguousarray(target_stereo.T[:int(target_channels)])
print(f"sample rate: {int(sample_rate)}, audio shape {target.shape}")
plot_waveform(target, sample_rate, title="Target Waveform")

In [None]:
# Load the model prediction.
# Essentia's AudioLoader outputs stereo samples, i.e. a (frames, 2) array, plus
# the number of channels actually present in the file. reshape(1, -1) on that
# array would interleave left/right samples into one double-length row, so
# transpose to (channels, frames) and keep only the real channels instead.
pred_stereo, sample_rate, pred_channels, *_ = es.AudioLoader(filename=PREDICTION)()
pred = np.ascontiguousarray(pred_stereo.T[:int(pred_channels)])
print(f"sample rate: {int(sample_rate)}, audio shape {pred.shape}")
plot_waveform(pred, sample_rate, title="Prediction Waveform")

In [None]:
# Zoom window as (start, end). Despite the `_idx` suffix these are multiplied
# by the sample rate further down, i.e. they are expressed in seconds.
start_idx, end_idx = 0, 4

In [9]:
# Overlay the prediction and the target inside the zoom window.
fs = sample_rate
T = 1 / fs                                   # sampling interval [s]

# start_idx / end_idx are in seconds; convert them to sample offsets.
start = int(fs * start_idx)
stop = int(fs * end_idx)

# Slice along the frame (last) axis. Slicing axis 0 of a (channels, frames)
# array selects channels, not a time window. The loaded `pred` / `target`
# arrays are left untouched so this cell can be re-run safely.
pred_zoom = np.atleast_2d(pred)[:, start:stop]
targ_zoom = np.atleast_2d(target)[:, start:stop]

# Build each time vector from the actual slice length so x and y always match,
# even when a file is shorter than end_idx seconds.
t = (start + np.arange(pred_zoom.shape[-1])) * T
t_targ = (start + np.arange(targ_zoom.shape[-1])) * T

fig, ax = plt.subplots()
ax.plot(t, pred_zoom[0], alpha=0.8, label="Model")        # first channel only
ax.plot(t_targ, targ_zoom[0], alpha=0.8, label="Target")
ax.set_xlabel("Time (s)")
ax.set_ylabel("Amplitude")
ax.set_title(f"Model vs. target ({start_idx}-{end_idx} s)")
# grid(True) rather than a bare grid(): with the grid already enabled through
# rcParams, an argument-less call toggles it off.
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# STFT parameters
N = 1024                   # segment / FFT length in samples
hop_size = 256             # hop between successive segments in samples
window = np.hanning(N)     # specgram needs a callable or a length-NFFT array, not the string 'hann'
noverlap = N - hop_size    # specgram expects the overlap, not the hop

# Zoom window in samples (start_idx / end_idx are in seconds), cut along the
# frame axis of the loaded signals. `pred` / `target` themselves are not
# reassigned, so the cell stays re-runnable.
start = int(sample_rate * start_idx)
stop = int(sample_rate * end_idx)
pred_zoom = np.atleast_2d(pred)[:, start:stop]
targ_zoom = np.atleast_2d(target)[:, start:stop]

print(f"pred_zoom shape: {pred_zoom.shape}")
print(f"targ_zoom shape: {targ_zoom.shape}")

# First channel of each signal, zero-padded to a common length so both
# spectrograms have the same number of frames.
pred_sig = pred_zoom[0]
targ_sig = targ_zoom[0]
max_len = max(len(pred_sig), len(targ_sig))
pred_sig = np.pad(pred_sig, (0, max_len - len(pred_sig)))
targ_sig = np.pad(targ_sig, (0, max_len - len(targ_sig)))

# 1. Compute the STFT magnitude for both signals.
# plt.specgram returns the *linear* spectrum; `scale` only affects the image it
# draws, which is discarded here, so 'linear' avoids log-of-zero warnings.
Pxx_1, freqs_1, time_1, _ = plt.specgram(pred_sig, NFFT=N, Fs=sample_rate, window=window, noverlap=noverlap, scale='linear', mode='magnitude')
Pxx_2, freqs_2, time_2, _ = plt.specgram(targ_sig, NFFT=N, Fs=sample_rate, window=window, noverlap=noverlap, scale='linear', mode='magnitude')
plt.close()

# Convert segment times to absolute sample positions within the file
time_pred_samples = time_1 * sample_rate + start
time_targ_samples = time_2 * sample_rate + start

# 2. Target minus prediction in dB. The spectra are magnitudes, so the dB
# conversion is 20*log10 (10*log10 is for power); 1e-7 guards against log(0).
difference = 20 * np.log10(Pxx_2 + 1e-7) - 20 * np.log10(Pxx_1 + 1e-7)

# 3. Plot the difference
fig, ax = plt.subplots()
img = ax.imshow(difference, aspect='auto', origin='lower', cmap='coolwarm', extent=[time_pred_samples[0], time_pred_samples[-1], freqs_1[0], freqs_1[-1]])
ax.set_title("Difference in Frequency Domain")
ax.set_ylabel('Frequency [Hz]')
ax.set_xlabel('Samples')
ax.grid(True)

cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
cbar.set_label('Intensity [dB]')

fig.tight_layout()
plt.show()

In [None]:
# Spectrogram difference (target minus prediction) with the time axis in seconds.
# N and hop_size are the STFT parameters set in the previous cell.

# Zoom window in samples (start_idx / end_idx are in seconds); take the first
# channel of each loaded signal along the frame axis.
start = int(sample_rate * start_idx)
stop = int(sample_rate * end_idx)
pred_sig = np.atleast_2d(pred)[0, start:stop]
targ_sig = np.atleast_2d(target)[0, start:stop]

# Zero-pad to a common length so both spectrograms have the same frame count
max_len = max(len(pred_sig), len(targ_sig))
pred_sig = np.pad(pred_sig, (0, max_len - len(pred_sig)))
targ_sig = np.pad(targ_sig, (0, max_len - len(targ_sig)))

# 1. Compute the STFT magnitude for both signals.
# specgram needs a window array/callable and the *overlap* (N - hop), and it
# returns the linear spectrum regardless of `scale` (which only styles the
# throw-away image).
stft_window = np.hanning(N)
stft_noverlap = N - hop_size
Pxx_1, freqs_1, time_1, _ = plt.specgram(pred_sig, NFFT=N, Fs=sample_rate, window=stft_window, noverlap=stft_noverlap, scale='linear', mode='magnitude')
Pxx_2, freqs_2, time_2, _ = plt.specgram(targ_sig, NFFT=N, Fs=sample_rate, window=stft_window, noverlap=stft_noverlap, scale='linear', mode='magnitude')
plt.close()

# 2. Difference in dB. Magnitude spectra use 20*log10; 1e-7 guards against log(0).
difference = 20 * np.log10(Pxx_2 + 1e-7) - 20 * np.log10(Pxx_1 + 1e-7)

# 3. Plot the difference. The extent uses this cell's own time vector
# (time_1), shifted by the zoom start so the axis shows time within the file.
fig, ax = plt.subplots()
img = ax.imshow(difference, aspect='auto', origin='lower', cmap='coolwarm', extent=[time_1[0] + start_idx, time_1[-1] + start_idx, freqs_1[0], freqs_1[-1]])
ax.set_title("Difference in Frequency Domain")
ax.set_ylabel('Frequency [Hz]')
ax.set_xlabel('Time [sec]')
ax.grid(True)

cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
cbar.set_label('Intensity [dB]')

fig.tight_layout()
plt.show()

In [None]:
# STFT parameters (the sample rate itself comes from the audio-loading cells)
n_fft_value = 512
hop_length_value = 256  # hop between segments in samples; adjust for time resolution
stft_window = np.hanning(n_fft_value)
# specgram takes the overlap, not the hop; they only coincide when hop == NFFT / 2
stft_noverlap = n_fft_value - hop_length_value

# NOTE(review): y1 / y2 are not defined anywhere in the visible notebook; the
# waveform cell further down labels them 'Output' and 'Target'. Presumably they
# are 1-D signals at `sample_rate` - TODO define them explicitly in an earlier cell.

# Ensure both signals have the same length (truncate to the shorter one)
common_len = min(len(y1), len(y2))
y1, y2 = y1[:common_len], y2[:common_len]

# 1. Compute the STFT magnitude for both signals.
# plt.specgram returns the linear spectrum; `scale` only styles the image it
# draws, which is discarded here.
Pxx_1, freqs_1, t_1, _ = plt.specgram(y1, NFFT=n_fft_value, Fs=sample_rate, window=stft_window, noverlap=stft_noverlap, scale='linear', mode='magnitude')
Pxx_2, freqs_2, t_2, _ = plt.specgram(y2, NFFT=n_fft_value, Fs=sample_rate, window=stft_window, noverlap=stft_noverlap, scale='linear', mode='magnitude')
plt.close()

# Convert segment times to sample positions
t_o_samples = (t_1 * sample_rate).astype(int)
t_t_samples = (t_2 * sample_rate).astype(int)

# 2. y2 minus y1 in dB. Magnitude spectra use 20*log10 (10*log10 is for
# power); 1e-7 guards against log(0).
difference = 20 * np.log10(Pxx_2 + 1e-7) - 20 * np.log10(Pxx_1 + 1e-7)

# 3. Plot the difference
fig, ax = plt.subplots()
img = ax.imshow(difference, aspect='auto', origin='lower', cmap='coolwarm', extent=[t_o_samples[0], t_o_samples[-1], freqs_1[0], freqs_1[-1]])
ax.set_title("Difference in Frequency Domain")
ax.set_ylabel('Frequency [Hz]')
ax.set_xlabel('Samples')
ax.grid(True)

cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
cbar.set_label('Intensity [dB]')

fig.tight_layout()
plt.show()


In [None]:
# STFT parameters (the sample rate itself comes from the audio-loading cells)
n_fft_value = 512
hop_length_value = 256  # hop between segments in samples; adjust for time resolution
stft_window = np.hanning(n_fft_value)
# specgram takes the overlap, not the hop; they only coincide when hop == NFFT / 2
stft_noverlap = n_fft_value - hop_length_value

# NOTE(review): y1 / y2 are not defined anywhere in the visible notebook; the
# waveform cell further down labels them 'Output' and 'Target'. Presumably they
# are 1-D signals at `sample_rate` - TODO define them explicitly in an earlier cell.

# Ensure both signals have the same length (truncate to the shorter one)
common_len = min(len(y1), len(y2))
y1, y2 = y1[:common_len], y2[:common_len]

# 1. Compute the STFT magnitude for both signals.
# plt.specgram returns the linear spectrum; `scale` only styles the image it
# draws, which is discarded here.
Pxx_1, freqs_1, t_1, _ = plt.specgram(y1, NFFT=n_fft_value, Fs=sample_rate, window=stft_window, noverlap=stft_noverlap, scale='linear', mode='magnitude')
Pxx_2, freqs_2, t_2, _ = plt.specgram(y2, NFFT=n_fft_value, Fs=sample_rate, window=stft_window, noverlap=stft_noverlap, scale='linear', mode='magnitude')
plt.close()

# 2. Convert to dB once and subtract. Magnitude spectra use 20*log10
# (10*log10 is for power); 1e-7 guards against log(0).
spec_db_1 = 20 * np.log10(Pxx_1 + 1e-7)
spec_db_2 = 20 * np.log10(Pxx_2 + 1e-7)
difference = spec_db_2 - spec_db_1

# 3. Plot both spectrograms and their difference, one panel each
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(10, 8))
panels = [
    (spec_db_1, t_1, freqs_1, 'inferno', "Spectrogram of First File"),
    (spec_db_2, t_2, freqs_2, 'inferno', "Spectrogram of Second File"),
    (difference, t_1, freqs_1, 'coolwarm', "Difference in Frequency Domain"),
]
for ax, (panel_db, panel_t, panel_f, panel_cmap, panel_title) in zip(axs, panels):
    img = ax.imshow(panel_db, aspect='auto', origin='lower', cmap=panel_cmap, extent=[panel_t[0], panel_t[-1], panel_f[0], panel_f[-1]])
    ax.set_title(panel_title)
    ax.set_ylabel('Frequency [Hz]')
    ax.grid(True)
    cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
    cbar.set_label('Intensity [dB]')
axs[-1].set_xlabel('Time [s]')

fig.tight_layout()
plt.show()

In [None]:
# Plot both waveforms on one axes
# NOTE(review): y1 / y2 are not defined anywhere in the visible notebook;
# presumably they are the 1-D output and target signals - TODO confirm.
fig, ax = plt.subplots()

# Time axes in seconds. `sample_rate` is the rate used for y1 / y2 in the
# spectrogram cells (`FS` is never defined in this notebook). Each signal gets
# an axis of its own length so differing lengths cannot break the plot.
time = np.arange(len(y1)) / sample_rate
time_y2 = np.arange(len(y2)) / sample_rate

# Display the waveforms
ax.plot(time, y1, alpha=0.5, label='Output')
ax.plot(time_y2, y2, alpha=0.5, label='Target')

# To zoom into a region, set ax.set_xlim([start_s, end_s]) and
# ax.set_ylim([low, high]) here.

# Legend and labels
ax.legend(loc='upper right')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
ax.set_title('Waveforms of Output and Target')
ax.grid(True)
fig.tight_layout()
plt.show()

Frequency of bin $i$, where $SR$ is the sample rate and $N_{\text{bins}}$ is the number of frequency bins:

\begin{equation}
    f(i) = \frac{i \cdot SR}{2 \cdot N_{\text{bins}}}
\end{equation}

In [None]:
def nth_octave_smoothing(spectrum, n: int = 3, fs=None):
    """Smooth a spectrum with a moving average whose width is 1/n octave.

    For every bin ``k`` the output is the mean of the input bins in the
    half-open range ``[round(k * 2**(-1/(2n))), round(k * 2**(1/(2n))))``.
    The range always holds at least one bin and is clipped to the length of
    the spectrum, so the last bin takes part in the averages.

    Parameters
    ----------
    spectrum : array-like
        One value per frequency bin; smoothing runs along the first axis.
    n : int, optional
        Octave fraction (3 gives third-octave smoothing). Must be positive.
    fs : float, optional
        Sample rate in Hz used to build the frequency axis. When omitted, the
        notebook-level ``sample_rate`` variable is used.

    Returns
    -------
    y : numpy.ndarray
        Smoothed spectrum with the same shape as ``spectrum``. Float and
        complex inputs keep their dtype; integer inputs are promoted to
        float64 so that averages are not truncated.
    freq_bins : numpy.ndarray
        ``len(spectrum)`` evenly spaced frequencies from 0 to ``int(fs / 2)``.

    Raises
    ------
    ValueError
        If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive octave fraction, got {n!r}")
    if fs is None:
        fs = sample_rate  # notebook-level global set when the audio is loaded

    spectrum = np.asarray(spectrum)
    num_bins = len(spectrum)
    freq_bins = np.linspace(0, int(fs / 2), num_bins)

    # Keep float/complex dtypes, but never average into an integer array.
    if np.issubdtype(spectrum.dtype, np.inexact):
        out_dtype = spectrum.dtype
    else:
        out_dtype = np.float64
    y = np.zeros(spectrum.shape, dtype=out_dtype)

    # Half-window ratios: 2**(+-1/(2n)) spans 1/n octave around bin k.
    lower_ratio = 2 ** (-1 / (2 * n))
    upper_ratio = 2 ** (1 / (2 * n))

    for k in range(num_bins):
        a = int(np.round(k * lower_ratio))
        b = int(np.round(k * upper_ratio))
        # [a, b) is an exclusive slice: force at least one bin, then clip to
        # the array length (not length - 1, which would drop the last bin and
        # can leave an empty window).
        b = min(max(b, a + 1), num_bins)
        y[k] = np.sum(spectrum[a:b], axis=0) / (b - a)
    return y, freq_bins