# Mel-Spectrogram

Quelques écoutes pour avoir une idée de la taille des spectrogrammes en E/S. Reconstruction audio par G&L

In [1]:
import numpy as np
from scipy.io import wavfile
import scipy
import pandas as pd
import librosa
import librosa.display
import time

import matplotlib.pyplot as plt
%matplotlib widget
from mpl_toolkits.axes_grid1 import make_axes_locatable
from IPython.display import Audio

from synth import dexed

In [2]:
dexed_preset_db = dexed.PresetDatabase()
print("{} presets available".format(dexed_preset_db.get_nb_presets()))
chosen_algo = 5
preset_indexes = dexed_preset_db.get_preset_indexes_for_algorithm(chosen_algo)
print("{} presets using algorithm {}".format(len(preset_indexes), chosen_algo))

30293 presets available
3671 presets using algorithm 5


### Choix des paramètres du spectrogramme
* 22,050kHz par défaut dans librosa, d'autres ont choisi ça (on peut rester là-dessus...)
* Défault librosa
    * fft largeur 2048 (9,3ms) hopsize 512
* FlowSynth 2019
    * fft largeur 2048, hop 1024, 128 mel-bins de 30 à 11kHz
        * 128 mel-bins ça permet difficilement de reconstruire un son de DX7... (déjà sans AE)
    * log, normalized
    * corpus-wide zero-mean, unit-variance normalization on each spectrogram (based on the train dataset)
* Roche 2020
    * Dataset NSynth : fft largeur 1024, hop 512, 513 lin freq bins de la STFT basique
    * Dataset Arturia : fft 2048 (hop toujours 50% ?)
    * log, energy-normalized, threshold -100dB
* Wavenet (modèle baseline avec spectrogramme) 2017
    * fft largeur 1024, fft hop 256
    * STFT linéaire ??? (à confirmer)
    * log magnitude du power spectrum
    * Constant-Q transform pour *l'affichage* des résultats
    * *Ou alors, encoder wavenet sur waveform directement*
    
#### Problème de la taille du spectrogramme

### Utiliser un spectrogramme plutôt qu'un mel-spectrogramme ?
* $+$ représentation beaucoup plus fine du son, surtout pour un synthé comme le DX7
* $-$ pas d'échelle log pour les harmoniques, peut être un problème pour un CNN
* $+/-$ image à traiter est plus grande

In [3]:
t0 = time.time()
# Test pour voir si rendu temps-réel est viable. 5ms ouverture + 13ms pour 5s d'audio
for i in range(20):  # + rapide si petite boucle... (14ms total)
    dexed_renderer = dexed.Dexed()

    #idx = preset_indexes[1363]
    idx = 1767
    preset = dexed_preset_db.get_preset_values(idx, plugin_format=True)
    dexed_renderer.assign_preset(preset)
    dexed_renderer.set_all_oscillators_on()
    dexed_renderer.set_default_general_filter_and_tune_params()
    dexed_renderer.prevent_SH_LFO()
    x_wav = dexed_renderer.render_note(60, 100)
print("Temps moyen pour ouvrir dexed (nouvelle classe) et générer l'audio = {:.1f}ms\n".format(1000.0*(time.time() - t0)/20.0))

Fs = dexed_renderer.Fs

print("Input audio")
Audio(x_wav, rate=Fs)

Temps moyen pour ouvrir dexed (nouvelle classe) et générer l'audio = 13.9ms

Input audio


In [4]:
# PyTorch STFT test - for comparison
import torch
import torch.fft

# Default values for librosa (mel)spectrogram: 2048
fft_width = 1024
print("fft_width : {:.1f}ms ({} linear freq bins per time step).".format(1000.0*fft_width/Fs, fft_width//2+1))
print("Freq resolution = {:.1f}Hz".format(Fs / fft_width))
fft_hop = 512
x_dB_min_th = -200.0  # Valeur min du log-spectre

dynamic_range_dB = 100

# Real FFT, oneside is enough
spectrogram_norm_factor = torch.fft.rfft(torch.hann_window(fft_width, periodic=False)).abs().max().item()
print("Spectrogram norm factor (Hann window) = {}".format(spectrogram_norm_factor))

#x_test = np.sin(2*np.pi*0.1 * np.arange(0, x_wav.shape[0]))

spectrogram = torch.stft(torch.tensor(x_wav, dtype=torch.float32), n_fft=fft_width, hop_length=fft_hop,
                         window=torch.hann_window(fft_width, periodic=False), center=True,
                         pad_mode='constant', onesided=True, return_complex=True).abs()
spectrogram = spectrogram / spectrogram_norm_factor
spectrogram = torch.maximum(spectrogram, torch.ones(spectrogram.size()) * 10**(x_dB_min_th/20.0))
spectrogram = 20.0 * torch.log10(spectrogram)
spectrogram = torch.maximum(spectrogram, torch.ones(spectrogram.size()) * (torch.max(spectrogram) - dynamic_range_dB))  # Dynamic range

print("PyTorch Spectrogram size = {}".format(spectrogram.size()))

fig, ax = plt.subplots(1, 1)
ax.set(title="PyTorch STFT Spectrogram (dB). Min={:.1f} max={:.1f}".format(spectrogram.numpy().min(), spectrogram.numpy().max()),
       xlabel='Time [s]', ylabel='Freq [Hz]')
librosa.display.specshow(spectrogram.numpy(), shading='flat', ax=ax)

fft_width : 46.4ms (513 linear freq bins per time step).
Freq resolution = 21.5Hz
Spectrogram norm factor (Hann window) = 511.5
PyTorch Spectrogram size = torch.Size([513, 217])


  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.collections.QuadMesh at 0x7f9a902eeeb0>

In [5]:
# SciPy/Librosa spectrograms
fft_width = 2048  # easy config for mel-inverse
# STFT array: last axis correspond to segment times
print("Original signal length = {}".format(x_wav.shape[0]))
f_stft, t_stft, x_stft = scipy.signal.stft(x_wav, fs=Fs, nperseg=fft_width, noverlap=fft_width-fft_hop, padded=True)
print("spectrogram fft hop  : {:.1f}ms. {} time steps for {:.1f}s audio (padded)".format(1000.0*fft_hop/Fs, len(t_stft), x_wav.shape[0]/Fs))
print("STFT size : {}".format(x_stft.shape))
log_x_stft = np.maximum(20.0 * np.log10(np.abs(x_stft)), x_dB_min_th)

# Mel-spectrogram (and filter bank alone, for visualization)
n_mels = 256
f_mel_min = 30  # flow synth
f_mel_max = 11e3
melfb = librosa.filters.mel(Fs, fft_width, n_mels=n_mels, fmin=f_mel_min, fmax=f_mel_max)
mel_spectrogram = librosa.feature.melspectrogram(x_wav, sr=Fs, n_fft=fft_width, hop_length=fft_hop,
                                                 n_mels=n_mels, fmin=f_mel_min, fmax=f_mel_max)
log_mel_spectrogram = np.maximum(20.0 * np.log10(np.abs(mel_spectrogram)), x_dB_min_th)

# Reconstruction à partir du mel-spectro - prend un temps énorme. Ne converge pas toujours !
x_stft_from_mel = librosa.feature.inverse.mel_to_stft(mel_spectrogram, sr=Fs, n_fft=fft_width,
                                                      fmin=f_mel_min, fmax=f_mel_max)
log_x_stft_from_mel = np.maximum(20.0 * np.log10(np.abs(x_stft_from_mel)), x_dB_min_th)
print("Reconstructed STFT size : {}".format(x_stft_from_mel.shape))
# Reconstruction Audio via G&L - init avec la vraie phase à retrouver
x_stft_from_mel_with_phase = x_stft_from_mel * np.exp(1j * np.angle(x_stft))
print("Reconstructed STFT w/phase size : {}".format(x_stft_from_mel_with_phase.shape))
x_wav_from_mel = librosa.griffinlim(x_stft_from_mel_with_phase)
print("Mel-reconstructed signal length = {}".format(x_wav_from_mel.shape[0]))

fig, axes = plt.subplots(3, 2, figsize=(9, 8))
axes[0][0].set(title="Input Audio", xlabel="Time [s]")
axes[0][0].plot(np.arange(0, x_wav.shape[0]) / Fs, x_wav)
axes[1][0].set(title="STFT Spectrogram (log module)", xlabel='Time [s]', ylabel='Freq [Hz]')
im = axes[1][0].pcolormesh(t_stft, f_stft, log_x_stft, shading='gouraud')
fig.colorbar(im, ax=axes[1][0], orientation='vertical', format='%.0f dB')
im = librosa.display.specshow(melfb, x_axis='linear', ax=axes[2][0], cmap='nipy_spectral')
axes[2][0].set(ylabel='output index', title='Mel filter bank')
fig.colorbar(im, ax=axes[2][0], orientation='vertical')
im = librosa.display.specshow(log_mel_spectrogram, x_axis='time', y_axis='mel', sr=Fs, ax=axes[2][1], fmin=f_mel_min, fmax=f_mel_max)
fig.colorbar(im, ax=axes[2][1], orientation='vertical', format='%.0f dB')
axes[2][1].set(title="Mel Spectrogram - {} bins".format(n_mels))
axes[1][1].set(title="Reconstructed STFT Spectrogram (??)", xlabel='Time [s]', ylabel='Freq [Hz]')
im = axes[1][1].pcolormesh(t_stft, f_stft, log_x_stft_from_mel, shading='gouraud')
fig.colorbar(im, ax=axes[1][1], orientation='vertical', format='%.0f dB')
axes[0][1].set(title="Reconstructed Audio", xlabel="Time [s]")
axes[0][1].plot(np.arange(0, x_wav.shape[0]) / Fs, x_wav_from_mel)
fig.tight_layout()

Original signal length = 110592
spectrogram fft hop  : 23.2ms. 217 time steps for 5.0s audio (padded)
STFT size : (1025, 217)


  log_x_stft_from_mel = np.maximum(20.0 * np.log10(np.abs(x_stft_from_mel)), x_dB_min_th)


Reconstructed STFT size : (1025, 217)
Reconstructed STFT w/phase size : (1025, 217)
Mel-reconstructed signal length = 110592


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  scaler(mode, **kwargs)
  scaler(mode, **kwargs)


In [6]:
print("Reconstructed audio")
Audio(x_wav_from_mel, rate=Fs)

Reconstructed audio


## Dataset presets, audio and spectrograms

Ceux qui risquent de poser problème :
* Tous les SFX (ou alors, ça peut aider le NN à bien comprendre comment fonctionne Dexed ?)
* Presque zéro :
    * 54035
    * 2555 (sorte de petit kick puis bruit...)
* Gros n'imp, limite inécoutable (par UID) :
    * 71429


In [7]:
from data import dataset
import matplotlib.pyplot as plt
import importlib
import config
importlib.reload(dataset)

dexed_dataset = dataset.DexedDataset(note_duration=config.model.note_duration, n_fft=config.model.stft_args[0],
                                     fft_hop=config.model.stft_args[1], n_mel_bins=config.model.mel_bins)
print(dexed_dataset)


Dataset of 30293/30293 Dexed presets. 144 learnable synth params, 11 fixed params.


In [8]:
# TODO function propre de fig - en package utils.figu
# Partial display of a dataloader batch
from torch.utils.data import DataLoader

import utils.figures
importlib.reload(utils.figures)

dataloader = DataLoader(dexed_dataset, config.train.minibatch_size, shuffle=True, num_workers=8, pin_memory=False)
dataloader_iter = iter(dataloader)

sample = next(dataloader_iter)
spectrogram, params, sample_info = sample


_ = utils.figures.plot_spectrograms(spectrogram, specs_recons=spectrogram*0.5, presets_UIDs=sample_info[:, 0],
                                    print_info=False, plot_error=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [9]:
# Corresponding audio - insert UID
UID = sample_info[0, 0].item()  # Change first index to retrieve the sound corresponding to a spectrogram
print("Preset {}".format(UID))
x_wav, Fs = dataset.DexedDataset.get_wav_file(UID, dexed_dataset.midi_note, dexed_dataset.midi_velocity)
Audio(x_wav, rate=Fs)

Preset 73797


In [10]:
import torch
import os
# dataloader test
dexed_dataloader = torch.utils.data.DataLoader(dexed_dataset, batch_size=128, shuffle=False,
                                               num_workers=8)
sample = None
for i, sample in enumerate(dexed_dataloader):
    if i%50 == 0:
        print("batch {}/{}".format(i, len(dexed_dataloader)))
print(sample[0].size())

batch 0/237
batch 50/237
batch 100/237
batch 150/237
batch 200/237
torch.Size([85, 1, 257, 347])


# Stats on all data spectrograms

In [11]:
import pandas as pd
import json

with open(dexed_dataset._get_spectrogram_stats_file(), 'r') as f:
    print("Whole-dataset spectrogram stats: {} dB".format(json.load(f)))

spectrogram_stats_df = pd.read_csv(dexed_dataset._get_spectrogram_full_stats_file())
fig, axes = plt.subplots(4, 1, figsize=(9, 8), sharex=True)
n_bins = 50
for i, col in enumerate(['min', 'max', 'mean', 'std']):
    spectrogram_stats_df.hist(column=col, ax=axes[i], bins=n_bins)
    axes[i].set(title='spectrogram {} (dB)'.format(col))
fig.tight_layout()

Whole-dataset spectrogram stats: {'min': -190.982666015625, 'max': -31.16400146484375, 'mean': -125.90191813641988, 'std': 21.830508919641183} dB


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …