Code assumes that data is in same directory as capstone_repo directory:  
```
capstone_repo  
│   README.md  
│   gsc_mfcc_extraction.ipynb    
│  
speech_commands_v0.02  
└───backward  
│   │   file01.wav  
│   │   file02.wav (etc)  
└───bed  
│   |..etc  
```

In [66]:
fs = 1600

#Functions for a

def add_white_noise(audio, noise_var_coeff):
        """Adds zero mean Gaussian noise to image with specified variance.
        """
        coeff = noise_var_coeff * np.mean(np.abs(audio))
        noisy_audio = audio + coeff * np.random.randn(len(audio))
        return noisy_audio


def shift(audio, shift_sec, fs):
        """Shifts audio.
        """
        shift_count = int(shift_sec * fs)
        return np.roll(audio, shift_count)

    
def stretch(audio, rate=1):
        """Stretches audio with specified ratio.
        """
        input_length = 16000
        audio2 = librosa.effects.time_stretch(audio, rate)
        if len(audio2) > input_length:
            audio2 = audio2[:input_length]
        else:
            audio2 = np.pad(audio2, (0, max(0, input_length - len(audio2))), "constant")

        return audio2

def augment_shift(audio, fs = 16000, verbose=False):
        """Augments audio by adding random shift
        """
        random_shift_time = np.random.uniform(-0.1,0.1)
        shifted_audio = shift(audio, random_shift_time, fs)
        return shifted_audio
    
def augment_stretch(audio, fs = 16000, verbose=False):
        """Augments audio by adding random stretch"""
        
        random_strech_coeff = np.random.uniform(0.8, 1.3)
        stretched_audio = tsm.wsola(audio, random_strech_coeff)
        return stretched_audio
        
def augment_noise(audio, fs = 16000, verbose=False):
        """ Augments auido by adding random white noise"""
        
        random_noise_var_coeff = np.random.uniform(0,1)
        noisy_audio = add_white_noise(audio, random_noise_var_coeff)
        return noisy_audio



def __parse_augmentation(self, augmentation):
        self.augmentation = augmentation
        if augmentation:
            if 'aug_num' not in augmentation:
                print('No key `aug_num` in input augmentation dictionary! ',
                      'Using 0.')
                self.augmentation['aug_num'] = 0
            elif self.augmentation['aug_num'] != 0:
                if 'noise_var' not in augmentation:
                    print('No key `noise_var` in input augmentation dictionary! ',
                          'Using defaults: [Min: 0., Max: 1.]')
                    self.augmentation['noise_var'] = {'min': 0., 'max': 1.}
                if 'shift' not in augmentation:
                    print('No key `shift` in input augmentation dictionary! '
                          'Using defaults: [Min:-0.1, Max: 0.1]')
                    self.augmentation['shift'] = {'min': -0.1, 'max': 0.1}
                if 'strech' not in augmentation:
                    print('No key `strech` in input augmentation dictionary! '
                          'Using defaults: [Min: 0.8, Max: 1.3]')
                    self.augmentation['strech'] = {'min': 0.8, 'max': 1.3}



                    
def __init__(self, root, classes, d_type, t_type, transform=None, quantization_scheme=None,
                 augmentation=None, download=False, save_unquantized=False):

        self.root = root
        self.classes = classes
        self.d_type = d_type
        self.t_type = t_type
        self.transform = transform
        self.save_unquantized = save_unquantized

        self.__parse_quantization(quantization_scheme)
        self.__parse_augmentation(augmentation)

        if not self.save_unquantized:
            self.data_file = 'dataset2.pt'
        else:
            self.data_file = 'unquantized.pt'

        if download:
            self.__download()

        self.data, self.targets, self.data_type = torch.load(os.path.join(
            self.processed_folder, self.data_file))

        print(f'\nProcessing {self.d_type}...')
        self.__filter_dtype()
        self.__filter_classes()
    

augmentation = {'aug_num': 0}

In [67]:
#import libraries
import numpy as np
import os
import librosa
import time
import pandas as pd
import errno
import hashlib
import os
import tarfile
import time
import urllib
import warnings

import numpy as np
import torch
from torch.utils.model_zoo import tqdm
from torchvision import transforms

import librosa
import pytsmod as tsm

In [68]:
data_path = '../speech_commands_v0.02/'

In [69]:
# get list of word directories in speech commands dataset
list(os.walk(data_path))[0]

('../speech_commands_v0.02/',
 ['backward',
  'bed',
  'bird',
  'cat',
  'dog',
  'down',
  'eight',
  'five',
  'follow',
  'forward',
  'four',
  'go',
  'happy',
  'house',
  'learn',
  'left',
  'marvin',
  'nine',
  'no',
  'off',
  'on',
  'one',
  'right',
  'seven',
  'sheila',
  'six',
  'stop',
  'three',
  'tree',
  'two',
  'up',
  'visual',
  'wow',
  'yes',
  'zero',
  '_background_noise_'],
 ['.DS_Store',
  'LICENSE',
  'README.md',
  'testing_list.txt',
  'validation_list.txt'])

In [70]:
all_words = ['backward',
  'bed',
  'bird',
  'cat',
  'dog',
  'down',
  'eight',
  'five',
  'follow',
  'forward',
  'four',
  'go',
  'happy',
  'house',
  'learn',
  'left',
  'marvin',
  'nine',
  'no',
  'off',
  'on',
  'one',
  'right',
  'seven',
  'sheila',
  'six',
  'stop',
  'three',
  'tree',
  'two',
  'up',
  'visual',
  'wow',
  'yes',
  'zero']

In [71]:
#only these 20 words will act as keywords
word_list = ['up','down','left', 'right', 'stop', 'go', 'yes', 'no', 'on', 'off', 'one', 'two', 
             'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'zero']
len(word_list)

20

In [72]:
#the other 15 words will be combined into one class of rejection words
rejection_words = [i for i in all_words if i not in word_list]
print(rejection_words)
len(rejection_words)

['backward', 'bed', 'bird', 'cat', 'dog', 'follow', 'forward', 'happy', 'house', 'learn', 'marvin', 'sheila', 'tree', 'visual', 'wow']


15

In [73]:
#create base MFCC directory
if not os.path.exists('mfccs'):
    os.makedirs('mfccs')

In [74]:
#create training, validation, and test directories
if not os.path.exists('mfccs/training'):
    os.makedirs('mfccs/training')
    
if not os.path.exists('mfccs/validation'):
    os.makedirs('mfccs/validation')
    
if not os.path.exists('mfccs/test'):
    os.makedirs('mfccs/test')

In [75]:
#create subdirectories that will contain MFCCs for each word
for directory in list(os.walk('mfccs'))[0][1]:
    for i in all_words:
        word_dir = 'mfccs/' + directory + '/' + i
        if not os.path.exists(word_dir):
            os.makedirs(word_dir)

In [76]:
#read in list of validation files
val_path = data_path + 'validation_list.txt'
val_list = np.loadtxt(val_path, dtype = 'str')
display(val_list[:5])
print('Number of validation samples:', len(val_list))

array(['right/a69b9b3e_nohash_0.wav', 'right/439c84f4_nohash_1.wav',
       'right/409c962a_nohash_1.wav', 'right/dbaf8fc6_nohash_2.wav',
       'right/a6d586b7_nohash_1.wav'], dtype='<U30')

Number of validation samples: 9981


In [77]:
#read in list of test files
test_path = data_path + 'testing_list.txt'
test_list = np.loadtxt(test_path, dtype = 'str')
display(test_list[:5])
print('Number of testing samples:', len(test_list))

array(['right/bb05582b_nohash_3.wav', 'right/97f4c236_nohash_2.wav',
       'right/f2e59fea_nohash_3.wav', 'right/fdb5155e_nohash_2.wav',
       'right/dc75148d_nohash_0.wav'], dtype='<U30')

Number of testing samples: 11005


In [81]:
#loop through all files, calculate MFCCs, save them to the appropriate directory
start = time.time()

#GSC is sampled at 16000Hz
sample_rate = 16000

fs = 16000

for i in all_words:
    #move to the directory for the ith word
    working_path = data_path + i
    
    #loop through each file in the word's directory
    for j in os.listdir(working_path):
        #define paths we'll need
        word_path = i + '/' + j
        file_path = data_path + word_path
        
        #load audio file
        y, sr = librosa.load(file_path, sr = sample_rate)
        
        #pad shorter audio clips that don't have 16000 data points
        if (len(y) < sample_rate):
            y = np.pad(y, (0, (sample_rate - len(y))))
            
            
        #augment training set
        if (word_path not in test_list) and (word_path not in val_list):
            #Augment Audio 3 different ways
            aug_audio1 = augment_shift(y, fs)
            aug_audio2 = augment_stretch(y, fs)
            aug_audio3 = augment_noise(y, fs)
            
            #Calculate MFCCs for augmented audio + normal audio
            mfcc = librosa.feature.mfcc(y = y, sr = sr)
            mfcc_aug1 = librosa.feature.mfcc(y = aug_audio1, sr = sr)
            mfcc_aug2 = librosa.feature.mfcc(y = aug_audio2, sr = sr)
            mfcc_aug3 = librosa.feature.mfcc(y = aug_audio3, sr = sr)
            
            #define save location directory
            word_dir = i + '_mfcc'
            
            #Set training save path
            save_path = os.path.join('mfccs', 'training', i, os.path.splitext(j)[0])
            np.save(save_path, mfcc)
            save_path = os.path.join('mfccs', 'training', i, os.path.splitext(j)[0])
            np.save(save_path, mfcc_aug1)
            save_path = os.path.join('mfccs', 'training', i, os.path.splitext(j)[0])
            np.save(save_path, mfcc_aug2)
            save_path = os.path.join('mfccs', 'training', i, os.path.splitext(j)[0])
            np.save(save_path, mfcc_aug3)

            
            
        else:
            #calculate MFCC
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
        
            #define save location directory
            word_dir = i + '_mfcc'
        
            #set save path to either train, test, or validation
            if word_path in test_list:
                save_path = os.path.join('mfccs', 'test', i, os.path.splitext(j)[0])
            elif word_path in val_list:
                save_path = os.path.join('mfccs', 'validation', i, os.path.splitext(j)[0])
            
            #save MFCC
            np.save(save_path, mfcc)
            

end = time.time()
print(end - start)

0 MFCCs in validation folder
0 MFCCs in testing folder
1 MFCCs in training folder


TypeError: unsupported operand type(s) for +: 'int' and 'str'

<module 'ntpath' from 'C:\\Users\\hdfer\\anaconda3\\lib\\ntpath.py'>

In [99]:
#let's make sure there are the right number of MFCCs in validation, test, and train
val_count = 0
test_count = 0
train_count = 0
for i in range(1,36):
    val_count += len(list(os.walk('mfccs/validation'))[i][2])
    test_count += len(list(os.walk('mfccs/test'))[i][2]) 
    train_count += len(list(os.walk('mfccs/training'))[i][2])
print(val_count, 'MFCCs in validation folder')
print(test_count, 'MFCCs in testing folder')
print(train_count, 'MFCCs in training folder')

9981 MFCCs in validation folder
11005 MFCCs in testing folder
84843 MFCCs in training folder
