# ResNet data preprocessing

In [1]:
import os

import numpy as np
import seaborn as sns
from tqdm import tqdm

from python_speech_features import mfcc
import scipy.io.wavfile as wav
from matplotlib import cm

The hyperparameters for noise reduction, window size, and window shift are taken from the paper.

**TODO:** cepstrum = 13

In [2]:
# from tqdm import tqdm
# wav_files = []
# labels = []

def wav_to_mfcc(reduce_noise=True, data_dir="speech_commands"):
    """Compute MFCC features for every ``.wav`` file below ``data_dir``.

    ``data_dir`` is expected to contain one sub-directory per keyword; the
    sub-directory name is used as the label of every clip found inside it.

    Args:
        reduce_noise: When true, ``lowfreq=20`` and ``highfreq=4000`` are
            passed to ``mfcc``; otherwise ``lowfreq=0`` and ``highfreq=None``
            are passed.
        data_dir: Root directory of the dataset. Defaults to the
            ``"speech_commands"`` folder next to the notebook.

    Returns:
        A ``(mfcc_features, labels)`` tuple of two equally long lists:
        the value returned by ``mfcc`` for each clip and the name of the
        directory the clip was read from.
    """
    lowfreq = 20 if reduce_noise else 0
    highfreq = 4000 if reduce_noise else None

    mfcc_features = []
    labels = []

    # Sort so the sample order does not depend on the filesystem, and keep
    # only directories: plain files in the dataset root (README, file lists,
    # ...) would otherwise make the inner os.listdir() raise.
    keyword_dirs = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )

    # Iterate over all keyword wav files
    for keyword_dir in tqdm(keyword_dirs):
        keyword_path = os.path.join(data_dir, keyword_dir)
        for file_name in sorted(os.listdir(keyword_path)):
            if not file_name.endswith(".wav"):
                continue

            # Read wav
            rate, sig = wav.read(os.path.join(keyword_path, file_name))

            # Compute and store mfcc features
            mfcc_feat = mfcc(sig, rate, winlen=0.03, lowfreq=lowfreq, highfreq=highfreq)
            mfcc_features.append(mfcc_feat)
            labels.append(keyword_dir)

    return mfcc_features, labels

# Extract features for the whole dataset with the noise-reduction band limits enabled.
mfcc_features, labels = wav_to_mfcc(reduce_noise=True)

100%|██████████| 35/35 [04:53<00:00,  8.39s/it]


In [9]:
# Count the samples whose first-axis length differs from 98.
counter = sum(1 for f in mfcc_features if f.shape[0] != 98)
print(counter)

10425


In [8]:
# Length of the longest feature sequence in the dataset.
max(map(len, mfcc_features))

98

In [3]:
# Make sure all samples have the same size using zero post-padding.
# The loop variable is deliberately NOT called `mfcc`: that name is the
# feature-extraction function imported at the top of the notebook, and
# rebinding it here would break any later re-run of wav_to_mfcc().

longest_mfcc = max(len(feat) for feat in mfcc_features)
# Wrap the list (not the enumerate object) so tqdm knows the total.
for i, feat in enumerate(tqdm(mfcc_features)):
    padding_length = longest_mfcc - len(feat)
    if padding_length > 0:
        # Append all-zero rows; the column count is taken from the sample
        # itself instead of being hard-coded.
        zero_rows = np.zeros((padding_length, feat.shape[1]))
        mfcc_features[i] = np.concatenate((feat, zero_rows), axis=0)

105829it [00:00, 877475.74it/s]


In [4]:
# Export data: write the features and their labels to separate .npy files.
for filename, payload in (
    ('resnet_mfcc_features.npy', mfcc_features),
    ('resnet_labels.npy', labels),
):
    np.save(filename, payload)