In [1]:
# @misc{fayek2016,
#   title   = "Speech Processing for Machine Learning: Filter banks, Mel-Frequency Cepstral Coefficients (MFCCs) and What's In-Between",
#   author  = "Haytham M. Fayek",
#   year    = "2016",
#   url     = "https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html"
# }

Setup

In [2]:
import numpy
import scipy.io.wavfile
from scipy.fftpack import dct

# Read the sample rate and the raw samples from the WAV file stored under ./audio
# (the path is resolved relative to the current working directory).
sample_rate, signal = scipy.io.wavfile.read('./audio/OSR_us_000_0010_8k.wav')
# Truncate the recording to its first 3.5 seconds.
signal = signal[:int(3.5 * sample_rate)]

Parameters

In [3]:
# Tunable constants for the whole feature-extraction pipeline.
pre_emphasis = 0.97  # Weight of the previous sample subtracted in the pre-emphasis step
frame_size = 0.025  # Frame duration in seconds (converted to samples in the framing step)
frame_stride = 0.01  # Hop between consecutive frame starts, in seconds
NFFT = 512  # Number of FFT points passed to numpy.fft.rfft
nfilt = 40  # Number of triangular filters (rows) in the mel filter bank
num_ceps = 12  # Number of cepstral coefficients kept after the DCT
cep_lifter = 22  # Parameter of the sinusoidal lifter applied to the MFCCs

Pre-Emphasis

In [4]:
# Pre-emphasis: y[t] = x[t] - pre_emphasis * x[t-1]; the very first sample has no
# predecessor, so it is carried over unchanged.
first_sample = signal[0]
filtered_tail = signal[1:] - pre_emphasis * signal[:-1]
emphasized_signal = numpy.append(first_sample, filtered_tail)

Framing

In [5]:
# Convert the frame size and stride from seconds to whole numbers of samples.
frame_length = int(round(frame_size * sample_rate))
frame_step = int(round(frame_stride * sample_rate))
signal_length = len(emphasized_signal)

# Number of frames: there is always one frame starting at sample 0, plus one more
# for every hop needed until the last frame reaches (or passes) the end of the signal.
# The previous formula, ceil(|signal_length - frame_length| / frame_step), was one
# frame short: it dropped the tail of the signal and produced zero frames when the
# signal was exactly one frame long.
extra_hops = int(numpy.ceil(float(signal_length - frame_length) / frame_step))
num_frames = 1 + max(0, extra_hops)

# Zero-pad so the last frame is complete; by construction the padded length is
# never shorter than the signal, so no original sample is truncated.
pad_signal_length = (num_frames - 1) * frame_step + frame_length
z = numpy.zeros((pad_signal_length - signal_length))
pad_signal = numpy.append(emphasized_signal, z)

# indices[i, j] is the position in pad_signal of sample j of frame i.
frame_starts = frame_step * numpy.arange(num_frames)
indices = numpy.arange(frame_length)[numpy.newaxis, :] + frame_starts[:, numpy.newaxis]
frames_ret = pad_signal[indices]  # shape: (num_frames, frame_length)

Window

In [6]:
# Multiply every frame, sample by sample, by a Hamming window of the same length.
window = numpy.hamming(frame_length)
frames = frames_ret * window
# Explicit form of the same window, with n = numpy.arange(frame_length):
#   0.54 - 0.46 * numpy.cos((2 * numpy.pi * n) / (frame_length - 1))

Fourier-Transform and Power Spectrum

In [7]:
# One-sided FFT of every frame, using NFFT points per transform.
spectrum = numpy.fft.rfft(frames, NFFT)
mag_frames = numpy.abs(spectrum)  # Magnitude spectrum
pow_frames = (1.0 / NFFT) * numpy.square(mag_frames)  # Power spectrum: |FFT|^2 / NFFT

Filter Banks

In [8]:
# Mel-scale limits of the filter bank: 0 mel up to half the sample rate expressed in mel.
low_freq_mel = 0
high_freq_mel = 2595 * numpy.log10(1 + (sample_rate / 2) / 700)  # Hz -> mel

# nfilt triangular filters need nfilt + 2 edge points, evenly spaced on the mel axis.
mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
hz_points = 700 * (10 ** (mel_points / 2595) - 1)  # mel -> Hz
# FFT bin associated with every edge point.
# NOTE(review): `bin` shadows the Python builtin of the same name; the name is kept
# here because other cells may read it.
bin = numpy.floor((NFFT + 1) * hz_points / sample_rate)

# One row per filter, one column per one-sided FFT bin.
fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
for row in range(nfilt):
    left, center, right = bin[row], bin[row + 1], bin[row + 2]
    # Rising edge: 0 at the left bin, approaching 1 at the centre bin.
    for k in range(int(left), int(center)):
        fbank[row, k] = (k - left) / (center - left)
    # Falling edge: 1 at the centre bin, approaching 0 at the right bin.
    for k in range(int(center), int(right)):
        fbank[row, k] = (right - k) / (right - center)

# Apply the filters to the power spectrum of every frame.
filter_banks = pow_frames.dot(fbank.T)
# Replace exact zeros by machine epsilon so the logarithm below stays finite.
filter_banks[filter_banks == 0] = numpy.finfo(float).eps
# NOTE(review): filter_banks is derived from the power spectrum, for which dB is
# conventionally 10 * log10; the factor 20 is kept as in the original - confirm intent.
filter_banks = 20 * numpy.log10(filter_banks)  # dB

Mel-frequency Cepstral Coefficients (MFCCs)

In [9]:
# Orthonormal type-II DCT along the filter axis, then drop coefficient 0 and keep the
# next num_ceps columns (the 2nd to the 13th when num_ceps is 12).
cepstra = dct(filter_banks, type=2, axis=1, norm='ortho')
mfcc = cepstra[:, 1:num_ceps + 1]

In [10]:
# Sinusoidal liftering: scale coefficient n by 1 + (cep_lifter / 2) * sin(pi * n / cep_lifter).
nframes, ncoeff = mfcc.shape
n = numpy.arange(ncoeff)
half_lifter = cep_lifter / 2
lift = 1 + half_lifter * numpy.sin(numpy.pi * n / cep_lifter)
# NOTE(review): the multiplication is done in place, so re-running this cell without
# recomputing mfcc applies the lifter a second time.
mfcc *= lift

Mean Normalization

In [11]:
# Subtract from every filter (column) its mean over all frames, plus a 1e-8 offset; done in place.
filter_bank_offset = filter_banks.mean(axis=0) + 1e-8
filter_banks -= filter_bank_offset

In [12]:
# Subtract from every coefficient (column) its mean over all frames, plus a 1e-8 offset; done in place.
mfcc_offset = mfcc.mean(axis=0) + 1e-8
mfcc -= mfcc_offset


---
# Gráficos


In [13]:
import matplotlib.pyplot as plt, IPython.display as ipd
import matplotlib.cm as cm
import numpy as np
import scipy.signal
import librosa.display

### Funções para a plotagem dos gráficos

In [14]:
def plotSound(audio, sr, title='Sinal no domínio do tempo'):
    """Plot a waveform against time and show an audio player for it.

    Parameters
    ----------
    audio : sequence of samples to plot and play.
    sr : sample rate, in samples per second.
    title : text used as the figure title.
    """
    N = len(audio)
    # Sample i occurs at i / sr seconds. The previous linspace(0, N / sr, N) included
    # the end point, which stretched the spacing to T / (N - 1) instead of 1 / sr.
    t = np.arange(N) / sr
    plt.figure(figsize=(12, 2.5))
    plt.plot(t, audio)
    plt.title(title, fontsize=18)
    plt.xlabel('tempo (s)', fontsize=15)
    plt.ylabel('Amplitude', fontsize=15)
    plt.grid(which='both')
    plt.show()
    ipd.display(ipd.Audio(audio, rate=sr, autoplay=False))

def PreEnfase(signal,emphasized_signal, sr):
    """Show the original and the pre-emphasised signal one after the other.

    Both signals are drawn (and made playable) by plotSound with the sample rate sr.
    """
    panels = (
        (signal, 'Sinal original no domínio do tempo'),
        (emphasized_signal, 'Sinal enfatizado no domínio do tempo'),
    )
    for samples, heading in panels:
        plotSound(samples, sr, title=heading)

def plotWindow(frame_length):
    """Plot a Hamming window that is frame_length samples long."""
    window = numpy.hamming(frame_length)
    plt.figure(figsize=(7, 5))
    plt.plot(window)
    plt.title('Janela Hamming', fontsize=18)
    plt.xlabel('amostras', fontsize=15)
    plt.ylabel('Amplitude', fontsize=15)
    plt.grid(which='both', linestyle='dotted')
    plt.show()
    
def plotFrame(frame, title='Fragmento do sinal'):
    """Plot the samples of one frame against their sample index.

    Parameters
    ----------
    frame : sequence of samples belonging to a single frame.
    title : text used as the figure title.
    """
    plt.figure(figsize=(7, 5))
    plt.plot(frame)
    plt.title(title, fontsize=18)
    plt.xlabel('amostras', fontsize=15)
    plt.ylabel('Amplitude', fontsize=15)
    plt.grid(which='both', linestyle='dotted')
    plt.show()

def plotWindoned(frames, frame_length, frame_index=100):
    """Plot the Hamming window and then one windowed frame.

    Parameters
    ----------
    frames : indexable collection of windowed frames.
    frame_length : number of samples of the Hamming window to draw.
    frame_index : position of the frame to plot (default 100, the value that
        used to be hard-coded).

    Raises
    ------
    IndexError
        If frame_index is outside the range of frames.
    """
    plotWindow(frame_length)
    # The frame number used to be read from an IntText widget that was never displayed,
    # so it was always 100 and needed `widgets` to be imported first; a plain argument
    # gives the same default without that dependency.
    frame_index = int(frame_index)
    plotFrame(frames[frame_index], title=f'Frame n° {frame_index} janelado')

def plotFrames(frames, frame_index=100):
    """Plot one frame of the framed (not yet windowed) signal.

    Parameters
    ----------
    frames : indexable collection of frames.
    frame_index : position of the frame to plot (default 100, the value that
        used to be hard-coded).

    Raises
    ------
    IndexError
        If frame_index is outside the range of frames.
    """
    # The frame number used to be read from an IntText widget that was never displayed,
    # so it was always 100 and needed `widgets` to be imported first.
    frame_index = int(frame_index)
    # choosePlot calls this with the frames taken before windowing, so the title must
    # not describe the frame as "janelado" (windowed).
    plotFrame(frames[frame_index], title=f'Frame n° {frame_index}')
    
def plotSpectogram(frames):
    """Draw a 2-D array as a colour image, transposed and flipped vertically.

    NOTE(review): the axes extent is hard-coded to [0, 4] on both axes, whatever the
    real duration or frequency range of the data is - confirm this is intended.
    """
    # This code is not mine.
    image = np.flipud(frames.T)
    plt.figure(figsize=(12, 2.5))
    plt.imshow(image, cmap=cm.jet, aspect=0.2, extent=[0, 4, 0, 4])
    plt.title('Spectrogram of the Signal')
    plt.ylabel('Frequency (kHz)', fontsize=16)
    plt.xlabel('Time (s)', fontsize=16)
    # To save the figure, call this before plt.show():
    # plt.savefig('filter_banks_raw2.png', bbox_inches='tight', dpi=200)
    plt.show()


def plotFBank(fbank, sample_rate, low_freq):
    """Plot every filter of a filter bank over a linear frequency axis.

    Parameters
    ----------
    fbank : 2-D array with one row per filter and one column per frequency bin.
    sample_rate : sample rate; the frequency axis ends at sample_rate / 2.
    low_freq : value used as the left end of the frequency axis.
        NOTE(review): choosePlot passes low_freq_mel here, which is 0 in this
        notebook - confirm the intended unit.
    """
    # Take the number of frequency points from the filter bank itself; the previous
    # hard-coded 257 only matched NFFT = 512.
    n_bins = fbank.shape[1]
    freqs = np.linspace(low_freq, (sample_rate / 2), n_bins)
    plt.figure(figsize=(12, 2.5))
    plt.title('Banco de filtros MEL', fontsize=18)
    plt.plot(freqs, fbank.T)
    plt.xlabel('Frequências', fontsize=15)
    plt.ylabel('Amplitude', fontsize=15)
    plt.grid(which='both', linestyle='dotted')
    plt.show()
    # plt.savefig('mel_filters2.png', bbox_inches='tight', dpi=200)

def plotMFCC(num_ceps, mfcc):
    """Draw the MFCC matrix as a colour image, transposed and flipped vertically.

    Parameters
    ----------
    num_ceps : upper limit of the vertical axis (the lower limit is 1).
    mfcc : 2-D array of coefficients.

    NOTE(review): the horizontal extent is hard-coded to [0, 4], whatever the real
    duration of the signal is - confirm this is intended.
    """
    image = np.flipud(mfcc.T)
    plt.figure(figsize=(12, 2.5))
    plt.imshow(image, cmap=cm.jet, aspect=0.08, extent=[0, 4, 1, num_ceps])
    plt.title('MFCCs', fontsize=18)
    plt.xlabel('tempo (s)', fontsize=15)
    plt.ylabel('Coeficientes', fontsize=15)
    plt.show()
    # plt.savefig('mfcc_raw2.png', bbox_inches='tight', dpi=200)

def plotMFCC_librosa(mfcc):
    """Display an MFCC matrix with librosa's specshow, with a colour bar.

    NOTE(review): presumably mfcc must be laid out the way specshow expects
    (coefficients x frames); the mfcc array built earlier in this notebook is
    frames x coefficients - confirm before passing it in directly.
    """
    plt.figure(figsize=(15, 5))
    librosa.display.specshow(
        mfcc,
        x_axis='time',
        y_axis='frames',
    )
    plt.colorbar()
    plt.title('MFCC')
    plt.tight_layout()
    plt.show()

def choosePlot(option=None):
    """Show the plot that belongs to the selected pipeline stage.

    Parameters
    ----------
    option : name of the stage to visualise, or None to print a prompt instead.
        Unknown names and stages without a plot are silently ignored.

    The data to plot is read from the notebook's global variables.
    """
    if option is None:
        print("Escolha uma etapa para visualisar.")
    elif option == 'Pré-Enfase':
        PreEnfase(signal, emphasized_signal, sample_rate)
    elif option == 'Fragmentação':
        plotFrames(frames_ret)
    elif option == 'Janelamento':
        plotWindoned(frames, frame_length)
    elif option == 'FFT':
        plotSpectogram(mag_frames)
    elif option == 'Banco de filtros MEL':
        plotFBank(fbank, sample_rate, low_freq_mel)
    elif option == 'MFCC':
        plotMFCC(num_ceps, mfcc)
    elif option in ('Espectro de Potência', 'Delta MFCC', 'Delta Delta MFCC', 'MFCC normalizada'):
        # These stages have no plot yet.
        pass

In [15]:
# import ctypes  # An included library with Python install.
# def Mbox(title, text, style):
#     return ctypes.windll.user32.MessageBoxW(0, text, title, style)
# Mbox('Banco de Filtros MEL', 'Filtros utilizados parar descatacar as variações nas baixas frequências.\n\nO ouvido humano tem uma percepção muito mais apurada das baixas frequências', 0)

In [16]:
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

# Pipeline stages offered in the drop-down, in processing order.
Lista = [
    'Pré-Enfase',
    'Fragmentação',
    'Janelamento',
    'FFT',
    'Espectro de Potência',
    'Banco de filtros MEL',
    'MFCC',
    'Delta MFCC',
    'Delta Delta MFCC',
    'MFCC normalizada',
]

# Drop-down whose selected value is handed to choosePlot as `option`
# (no explicit initial value is set).
etapa_dropdown = widgets.Dropdown(options=Lista, description='Etapa:', disabled=False)
# NOTE(review): continuous_update is passed to interact itself, and choosePlot has no
# parameter with that name - presumably it has no effect here; confirm.
interact(choosePlot, option=etapa_dropdown, continuous_update=False);

interactive(children=(Dropdown(description='Etapa:', options=('Pré-Enfase', 'Fragmentação', 'Janelamento', 'FF…