IMPLEMENT MFCC FOR SOUND PROCESSING USING FAST FOURIER TRANSFORM
Mel Frequency Cepstral Coefficents (MFCCs) are a feature widely used in automatic speech and speaker recognition

In [1]:
import soundfile as sf
import numpy as np
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import IPython.display as idp
def MFCC(TRAIN_PATH,file):
    audio, sample_rate, = sf.read(TRAIN_PATH + file)
    def normalize_audio(audio):
        audio = audio / np.max(np.abs(audio))
        return audio
    audio = normalize_audio(audio)
    def frame_audio(audio, FFT_size=2048, hop_size=10, sample_rate=44100):
        # hop_size in ms
        
        audio = np.pad(audio, int(FFT_size / 2), mode='reflect')
        frame_len = np.round(sample_rate * hop_size / 1000).astype(int)
        frame_num = int((len(audio) - FFT_size) / frame_len) + 1
        frames = np.zeros((frame_num,FFT_size))
        
        for n in range(frame_num):
            frames[n] = audio[n*frame_len:n*frame_len+FFT_size]
        
        return frames
    hop_size = 15 #ms
    FFT_size = 2048
    audio_framed = frame_audio(audio, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
    window = get_window("hann", FFT_size, fftbins=True)
    audio_win = audio_framed * window
    audio_winT = np.transpose(audio_win)
    audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')

    for n in range(audio_fft.shape[1]):
        audio_fft[:, n] = fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]

    audio_fft = np.transpose(audio_fft)
    audio_power = np.square(np.abs(audio_fft))
    freq_min = 0
    freq_high = sample_rate / 2
    mel_filter_num = 10
    def freq_to_mel(freq):
        return 2595.0 * np.log10(1.0 + freq / 700.0)

    def met_to_freq(mels):
        return 700.0 * (10.0**(mels / 2595.0) - 1.0)
    def get_filter_points(fmin, fmax, mel_filter_num, FFT_size, sample_rate=44100):
        fmin_mel = freq_to_mel(fmin)
        fmax_mel = freq_to_mel(fmax)
        
        mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num+2)
        freqs = met_to_freq(mels)
        
        return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs
    filter_points, mel_freqs = get_filter_points(freq_min, freq_high, mel_filter_num, FFT_size, sample_rate=44100)
    def get_filters(filter_points, FFT_size):
        filters = np.zeros((len(filter_points)-2,int(FFT_size/2+1)))
        
        for n in range(len(filter_points)-2):
            filters[n, filter_points[n] : filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
            filters[n, filter_points[n + 1] : filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
        
        return filters
    filters = get_filters(filter_points, FFT_size)
    enorm = 2.0 / (mel_freqs[2:mel_filter_num+2] - mel_freqs[:mel_filter_num])
    filters *= enorm[:, np.newaxis]
    audio_filtered = np.dot(filters, np.transpose(audio_power))
    audio_log = 10.0 * np.log10(audio_filtered)
    def dct(dct_filter_num, filter_len):
        basis = np.empty((dct_filter_num,filter_len))
        basis[0, :] = 1.0 / np.sqrt(filter_len)
        
        samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)

        for i in range(1, dct_filter_num):
            basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_len)
            
        return basis
    dct_filter_num = 40

    dct_filters = dct(dct_filter_num, mel_filter_num)
    audio_log=np.nan_to_num(audio_log,posinf=0,neginf=0)

    cepstral_coefficents = np.dot(dct_filters, audio_log)
    return cepstral_coefficents

In [2]:
import pandas as pd
import os
import random
import csv
TRAIN_PATH='train-clean-wav/'
p1=pd.read_csv('newdata.csv',sep=';')
p1['ID   ']=p1['ID   '].astype(int)
listdata=[]
files=os.listdir(TRAIN_PATH)
list1=files[0:int(len(files)/2)]
for list in list1:
    num=""
    for char in list:
        if char=="-" :
            break
        num+=char
    num=int(num)
    p1[p1['ID   ']==num].index
    A=MFCC(TRAIN_PATH,list)
    indices = np.arange(A.shape[1])

    data=dict()
    data['cepstrum']=np.transpose(A[:,np.random.choice(indices, size=50, replace=False)])
    data['speaker']=p1.iloc[p1[p1['ID   ']==num].index][' NAME'].values[0]
    data['sex']=p1.iloc[p1[p1['ID   ']==num].index]['SEX'].values[0]
    listdata.append(data)        


  audio_log = 10.0 * np.log10(audio_filtered)


In [3]:
import pickle
listdata
with open('my_dictRNN.pkl', 'wb') as f:
    pickle.dump(listdata, f)
with open('my_dictRNN.pkl', 'rb') as f:
    my_dict = pickle.load(f)
my_dict

[{'cepstrum': array([[-60.34642114,  12.99511005,  10.19702579, ...,   7.87303922,
           10.19702579,  12.99511005],
         [-12.07203761,   4.00651096,  -1.74278718, ...,   1.75472333,
           -1.74278718,   4.00651096],
         [  0.87762234,  11.82514593,  -5.67607052, ..., -12.58024509,
           -5.67607052,  11.82514593],
         ...,
         [-19.91163944,  19.86462488,  -9.27978592, ...,  -5.59323065,
           -9.27978592,  19.86462488],
         [-61.1046534 ,   4.06116005,  17.88434638, ...,   3.91944577,
           17.88434638,   4.06116005],
         [-80.73036826,  17.21409714,  16.56078585, ...,   6.82741477,
           16.56078585,  17.21409714]]),
  'speaker': ' Karen Savage',
  'sex': ' F '},
 {'cepstrum': array([[ -78.93510564,   13.74410558,   10.59806104, ...,    6.92518837,
            10.59806104,   13.74410558],
         [ -45.90703747,   23.52324831,    1.48549262, ...,  -14.65168523,
             1.48549262,   23.52324831],
         [-106.049312