### pre processing


In [7]:
import numpy as np
import librosa
import librosa.display
from scipy.io import wavfile
import os
import csv
from collections import Counter

In [2]:
train_dir = './data/train/audio/' #download files from kaggle

classes = ['yes', 'no', 
           'up', 'down', 
           'left', 'right', 
           'on', 'off', 
           'stop', 'go', 
           'silence', 'unknown']

run this cell to move the background noises folder out of the audio directory. We will create silence samples from these files after.

In [5]:
%%bash
mv data/train/audio/_background_noise_ data/train
ls data/train

audio
_background_noise_
LICENSE
README.md
testing_list.txt
validation_list.txt


Split all the audio files from \_background\_noises\_ folder in 1-sec chunks

In [19]:
def split_arr(arr):
    """
    split an array into chunks of length 16000
    Returns:
        list of arrays
    """
    return np.split(arr, np.arange(16000, len(arr), 16000))

In [43]:
def create_silence():
    """
    reads wav files in background noises folder, splits them and saves to silence folder in train_dir
    """
    for file in os.listdir('data/train/_background_noise_/'):
        if 'wav' in file:
            sig, rate = librosa.load('data/train/_background_noise_/' + file, sr = 16000)        
            sig_arr = split_arr(sig)
            if not os.path.exists(train_dir+'silence/'):
                os.makedirs(train_dir+'silence/')
            for ind, arr in enumerate(sig_arr):
                filename = 'frag%d' %ind + '_%s' %file # example: frag0_running_tap.wav
                librosa.output.write_wav(train_dir+'silence/'+filename, arr, 16000)
  

In [27]:
create_silence()

three lists with file names. one for training set, one for validation set, one for all. Plus a dictionary with file counts per class.

In [37]:
folders = os.listdir(train_dir)

In [38]:
with open('./data/train/validation_list.txt') as val_list:
    validation_list = [row[0] for row in csv.reader(val_list)]
assert len(validation_list) == 6798, 'file not loaded'

"""
#if you want to add the files in testing_list.txt to the validation list:

with open('./data/train/testing_list.txt') as test_list:
    testing_list = [row[0] for row in csv.reader(test_list)]
assert len(testing_list) == 6835, 'file not loaded'

#combine into validation set
validation_list.extend(testing_list)
"""
#add silence files to validation_list
for i, file in enumerate(os.listdir(train_dir + 'silence/')):
    if i%10==0:
        validation_list.append('silence/'+file)

training_list = []
all_files_list = []
class_counts = {}

for folder in folders:
    files = os.listdir(train_dir + folder)
    for i, f in enumerate(files):
        all_files_list.append(folder + '/' + f)
        path = folder + '/' + f
        if path not in validation_list:
            training_list.append(folder + '/' + f)        
        class_counts[folder] = i

#remove filenames from validation_list that don't exist anymore (due to eda)
validation_list = list(set(validation_list).intersection(all_files_list))

In [39]:
assert len(validation_list)+len(training_list)==len(all_files_list), 'error'

In [40]:
# check random file name
print(training_list[345], 'size training set: ',len(training_list), 'size validation set: ', len(validation_list))

yes/15f04ff8_nohash_0.wav size training set:  58284 size validation set:  6839


In [42]:
print(class_counts)

{'bird': 1730, 'marvin': 1745, 'sheila': 1733, 'down': 2358, 'wow': 1744, 'five': 2356, 'stop': 2379, 'tree': 1732, 'left': 2352, 'cat': 1732, 'eight': 2351, 'right': 2366, 'happy': 1741, 'zero': 2375, 'bed': 1712, 'three': 2355, 'one': 2369, 'dog': 1745, 'four': 2371, 'go': 2371, 'no': 2374, 'up': 2374, 'house': 1749, 'six': 2368, 'yes': 2376, 'two': 2372, 'silence': 401, 'off': 2356, 'nine': 2363, 'on': 2366, 'seven': 2376}


### turn all wav files into spectrograms

In [44]:
def make_spec(file, file_dir = train_dir, flip = False, ps = False, st = 4):
    """
    create a melspectrogram from the amplitude of the sound
    
    Args:
        file (str): filename
        file_dir (str): directory path
        flip (bool): reverse time axis
        ps (bool): pitch shift
        st (int): half-note steps for pitch shift
    Returns:
        np.array with shape (122,85) (time, freq)
    """
    sig, rate = librosa.load(file_dir + file, sr = 16000)
    if len(sig) < 16000: # pad shorter than 1 sec audio with zeros
        sig = np.pad(sig, (0,16000-len(sig)), 'constant', constant_values = 0)
    if ps:
        sig = librosa.effects.pitch_shift(sig, rate, st)
    D = librosa.amplitude_to_db(librosa.stft(sig[:16000], n_fft = 512, 
                                             hop_length = 128, 
                                             center = False), ref = np.max)
    S = librosa.feature.melspectrogram(S=D, n_mels = 85).T
    if flip:
        S = np.flipud(S)
    return S.astype(np.float32)