# In [4]:
import librosa
import numpy as np
from IPython.lib.display import Audio


# Number of cepstral coefficients requested from librosa.feature.mfcc.
N_MFCC = 100
# Hop between successive analysis frames, in samples; passed as hop_length
# to the MFCC computation and to the excitation STFT.
HOP_LENGTH = 500


def invlogamplitude(S):
    """Undo a 10*log10 (decibel) scaling.

    librosa.logamplitude is actually 10*log10, so this maps such values
    back to the linear domain by computing 10 ** (S / 10).

    S may be a scalar or a NumPy array; arrays are processed element-wise.
    """
    decades = S / 10.0
    return pow(10.0, decades)


# Load input audio and report the sample rate librosa returned.
filename = 'mp3s/orig_48.wav'
y, sr = librosa.load(filename)
print(sr)


# Calculate mfccs.
# NOTE: Y is not used anywhere else in this cell; it is kept so that any
# later cell relying on it keeps working.
Y = librosa.stft(y)
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC, hop_length=HOP_LENGTH)
print('mfccs shape: {}'.format(mfccs.shape))


# Build reconstruction mappings.
# n_mel and n_fft are not passed to the mfcc call above, so they have to
# agree with whatever settings that call used for its mel spectrogram.
n_mfcc = mfccs.shape[0]
n_mel = 128
dctm = librosa.filters.dct(n_mfcc, n_mel)
n_fft = 2048
# Keyword arguments, and an explicit band count, so the mel basis is
# guaranteed to have the same number of bands as the DCT basis.
mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mel)
# Empirical scaling of channels to get ~flat amplitude mapping.
# The floor of 0.0005 keeps the reciprocal finite for bins that the mel
# filters barely cover.
bin_scaling = 1.0 / np.maximum(0.0005, np.sum(np.dot(mel_basis.T, mel_basis), axis=0))


# Reconstruct the approximate STFT squared-magnitude from the MFCCs:
# transposed DCT basis -> undo the dB scaling -> transposed mel basis,
# then apply the per-bin scaling computed above.
recon_stft = bin_scaling[:, np.newaxis] * np.dot(
    mel_basis.T, invlogamplitude(np.dot(dctm.T, mfccs)))


# Impose reconstructed magnitude on white noise STFT.
excitation = np.random.randn(y.shape[0])
E = librosa.stft(excitation, n_fft=n_fft, hop_length=HOP_LENGTH)
# Keep only the phase of the noise.  Going through np.angle instead of
# E / np.abs(E) avoids a 0/0 -> NaN for any bin whose magnitude is zero.
phase = np.exp(1j * np.angle(E))
# The inverse transform must use the same hop as the analysis above;
# otherwise the frames are overlap-added on a different time grid and the
# result is time-stretched.
recon = librosa.istft(phase * np.sqrt(recon_stft), hop_length=HOP_LENGTH)
print(recon)
# Listen to the reconstruction
Audio(recon, rate=sr)

# Output:
# 22050
# mfccs shape: (100, 391)
# [ 0.00721941  0.00669242  0.00608404 ...,  0.00073945  0.0005043
#   0.00085108]
