# ResNet data preprocessing

In [1]:
import os

import numpy as np
import seaborn as sns
from tqdm import tqdm
import librosa

from python_speech_features import mfcc
import scipy.io.wavfile as wav
from matplotlib import cm

In [2]:
# !pip install librosa
# !pip install numba

Preprocessing and hyperparameters are inspired by the [original paper implementation](https://github.com/castorini/honk/blob/c3aae750c428520ba340961bddd526f9c999bb93/utils/manage_audio.py#L30) (`utils/manage_audio.py` in the Honk repository).

In [3]:
def wav_to_mfcc(reduce_noise=True, data_dir="speech_commands", sample_rate=16000,
                n_mfcc=40, hop_length=160):
    """Compute MFCC features for every ``.wav`` file below ``data_dir``.

    ``data_dir`` is expected to contain one sub-directory per keyword; the
    sub-directory name is used as the label of every clip inside it.

    Parameters
    ----------
    reduce_noise : bool
        When True the mel filterbank is limited to 20-4000 Hz; when False the
        full band is used (``fmin=0`` and librosa's default ``fmax``).
    data_dir : str
        Root directory of the dataset.
    sample_rate : int or None
        Rate the audio is loaded at. The rate returned by ``librosa.load`` is
        the one handed to the MFCC computation, so both always agree. ``None``
        keeps each file's native rate.
    n_mfcc : int
        Number of MFCC coefficients per frame.
    hop_length : int
        Hop between successive frames, in samples.

    Returns
    -------
    (list of np.ndarray, list of str)
        One array of shape ``(1, frames, n_mfcc)`` per clip and the matching
        keyword labels. ``frames`` depends on clip length, ``sample_rate`` and
        ``hop_length``, so any fixed padding length used downstream should be
        checked against it.
    """
    # Band-limit the filterbank only when noise reduction is requested.
    lowfreq = 20 if reduce_noise else 0
    highfreq = 4000 if reduce_noise else None

    mfcc_features = []
    labels = []

    # Only sub-directories hold keyword recordings; stray files in the dataset
    # root would make the inner os.listdir fail, so they are skipped here.
    keyword_dirs = [
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    ]

    # Iterate over all keyword wav files
    for keyword_dir in tqdm(keyword_dirs):
        keyword_path = os.path.join(data_dir, keyword_dir)
        for file in os.listdir(keyword_path):
            if not file.endswith(".wav"):
                continue

            # Load at an explicit rate: without sr= librosa resamples to its own
            # default, which would not match the rate given to the MFCC call.
            signal, sr = librosa.load(os.path.join(keyword_path, file), sr=sample_rate)

            # The audio is passed by keyword (required by newer librosa releases).
            coeffs = librosa.feature.mfcc(
                y=signal,
                sr=sr,
                n_mfcc=n_mfcc,
                fmin=lowfreq,
                fmax=highfreq,
                hop_length=hop_length,
            )

            # librosa returns (n_mfcc, frames). Transpose to (frames, n_mfcc) and
            # add a leading axis; a plain reshape would mix values from different
            # frames into the same row.
            mfcc_features.append(coeffs.T[np.newaxis, ...])
            labels.append(keyword_dir)

    return mfcc_features, labels

# Run the extraction over the whole dataset. This is slow: the run recorded
# below took roughly 66 minutes for 35 keyword directories.
mfcc_features, labels = wav_to_mfcc()

100%|██████████| 35/35 [1:06:01<00:00, 113.18s/it]


In [7]:
# Number of extracted samples (one entry is appended per .wav file read).
len(mfcc_features)

105829

In [5]:
# counter = 0
# for index, f in enumerate(mfcc_features):
#     if f.shape[0] != 98:
# #         print(index)
#         counter+=1
# print(counter)

3801


In [14]:
# mfcc_copy = mfcc_features.copy()

In [4]:
# Make sure all samples have the same size using zero post-padding

mfcc_shape = 138  # target length of axis 1 for every sample
# `total` lets tqdm render a real progress bar; enumerate() alone has no length.
# The loop variable is not called `mfcc` so the imported `mfcc` function is not overwritten.
for i, sample in tqdm(enumerate(mfcc_features), total=len(mfcc_features)):
    padding_length = mfcc_shape - sample.shape[1]
    if padding_length > 0:
        # Zero-pad only at the end of axis 1. np.pad keeps the sample's dtype
        # (np.zeros would default to float64 and upcast padded samples only)
        # and does not need the feature width hard-coded.
        mfcc_features[i] = np.pad(
            sample,
            ((0, 0), (0, padding_length), (0, 0)),
            mode="constant",
        )

105829it [00:00, 462952.65it/s]


In [5]:
# Sanity check: how many samples do not have 138 entries along axis 1?
counter = sum(1 for f in mfcc_features if f.shape[1] != 138)
print(counter)

0


In [17]:
# Convert the list of per-file arrays into one ndarray. Assuming every entry
# has the same shape (1, 138, 40) after padding, the result has shape
# (n_samples, 1, 138, 40).
# NOTE(review): this rebinds `mfcc_features` from a list to an ndarray, so the
# padding cell (which assigns into the list) must run before this one.
mfcc_features = np.array(mfcc_features)

In [4]:
# Peek at the last label; labels are the keyword directory names.
labels[-1]

'five'

In [8]:
# Write the features and their labels to .npy files in the working directory
for target, payload in (
    ('resnet_mfcc_features_test.npy', mfcc_features),
    ('resnet_labels_test.npy', labels),
):
    np.save(target, payload)

In [59]:
# Single-file experiment with 40 mel bands and a 480-sample FFT window.
# n_mfcc is not set here, so the number of coefficients is librosa's default.
signal, sr = librosa.load('speech_commands/marvin/a2cc5444_nohash_0.wav')

# Pass the audio by keyword (required by newer librosa releases) and use the
# rate that load() actually returned, so fmin/fmax in Hz match the samples.
mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mels=40, n_fft=480, fmin=40, fmax=4000, hop_length=160)

# librosa returns (coefficients, frames). Transpose to (frames, coefficients)
# and add a leading axis; a plain reshape would mix values from different frames.
mfcc.T[np.newaxis, ...].shape

(1, 138, 20)

In [51]:
# NOTE(review): `compute_mfcc` is not defined anywhere in the visible notebook —
# presumably it came from a cell that was later removed. This cell will raise
# NameError on a fresh kernel; TODO restore the definition or drop the cell.
compute_mfcc(signal).shape


(1, 138, 40)