This notebook assumes the `speech_commands_v0.02` dataset directory sits next to the `capstone_repo` directory (i.e. both share the same parent directory):  
```
capstone_repo  
│   README.md  
│   gsc_mfcc_extraction.ipynb    
│  
speech_commands_v0.02  
└───backward  
│   │   file01.wav  
│   │   file02.wav (etc)  
└───bed  
│   |..etc  
```

In [31]:
#import libraries
import torch
import numpy as np
import os
import IPython.display as ipd
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
import librosa
import time
import pandas as pd

In [32]:
# Root of the Speech Commands v0.02 dataset, relative to this notebook.
# The trailing slash matters: other cells build file paths by plain string concatenation.
data_path = "../speech_commands_v0.02/"

In [33]:
# get list of word directories in speech commands dataset
# os.walk is a lazy generator: next() reads only the top-level
# (dirpath, dirnames, filenames) entry instead of first traversing
# every word directory just to keep element [0]
next(os.walk(data_path))

('../speech_commands_v0.02/',
 ['backward',
  'bed',
  'bird',
  'cat',
  'dog',
  'down',
  'eight',
  'five',
  'follow',
  'forward',
  'four',
  'go',
  'happy',
  'house',
  'learn',
  'left',
  'marvin',
  'nine',
  'no',
  'off',
  'on',
  'one',
  'right',
  'seven',
  'sheila',
  'six',
  'stop',
  'three',
  'tree',
  'two',
  'up',
  'visual',
  'wow',
  'yes',
  'zero',
  '_background_noise_'],
 ['.DS_Store',
  'LICENSE',
  'README.md',
  'testing_list.txt',
  'validation_list.txt'])

In [3]:
# The 35 spoken-word classes of the dataset, in alphabetical order.
# The '_background_noise_' directory is not included.
all_words = [
    'backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight',
    'five', 'follow', 'forward', 'four', 'go', 'happy', 'house',
    'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on',
    'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three',
    'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero',
]

In [8]:
#the 20 keyword classes: ten command words followed by the ten digits
word_list = [
    'up', 'down', 'left', 'right', 'stop', 'go', 'yes', 'no', 'on', 'off',
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'zero',
]
len(word_list)

20

In [9]:
#every word that is not a keyword is pooled into a single rejection class;
#the order of all_words is preserved
rejection_words = list(filter(lambda word: word not in word_list, all_words))
print(rejection_words)
len(rejection_words)

['backward', 'bed', 'bird', 'cat', 'dog', 'follow', 'forward', 'happy', 'house', 'learn', 'marvin', 'sheila', 'tree', 'visual', 'wow']


15

In [13]:
#create base MFCC directory
#exist_ok=True makes the cell safe to re-run and removes the separate
#exists-then-create check
os.makedirs('mfccs', exist_ok=True)

In [127]:
#create training, validation, and test directories
#one loop instead of three copy-pasted stanzas; exist_ok=True keeps the
#cell idempotent when it is re-run
for split in ('training', 'validation', 'test'):
    os.makedirs(os.path.join('mfccs', split), exist_ok=True)

In [128]:
#create subdirectories that will contain MFCCs for each word
#the three split names are listed explicitly rather than discovered with
#os.walk, so stray folders inside 'mfccs' are ignored and an already
#populated output tree is not traversed again on a re-run
for directory in ('training', 'validation', 'test'):
    for i in all_words:
        #makedirs also creates any missing parent; exist_ok keeps this idempotent
        os.makedirs(os.path.join('mfccs', directory, i), exist_ok=True)

In [91]:
#load the validation split: the file lists one relative 'word/file.wav' path per line
val_path = f'{data_path}validation_list.txt'
val_list = np.loadtxt(val_path, dtype=str)
display(val_list[:5])
print('Number of validation samples:', len(val_list))

array(['right/a69b9b3e_nohash_0.wav', 'right/439c84f4_nohash_1.wav',
       'right/409c962a_nohash_1.wav', 'right/dbaf8fc6_nohash_2.wav',
       'right/a6d586b7_nohash_1.wav'], dtype='<U30')

Number of validation samples: 9981


In [92]:
#load the test split: the file lists one relative 'word/file.wav' path per line
test_path = f'{data_path}testing_list.txt'
test_list = np.loadtxt(test_path, dtype=str)
display(test_list[:5])
print('Number of testing samples:', len(test_list))

array(['right/bb05582b_nohash_3.wav', 'right/97f4c236_nohash_2.wav',
       'right/f2e59fea_nohash_3.wav', 'right/fdb5155e_nohash_2.wav',
       'right/dc75148d_nohash_0.wav'], dtype='<U30')

Number of testing samples: 11005


In [130]:
#loop through all files, calculate MFCCs, save them to the appropriate directory
start = time.time()

#`x in ndarray` compares x against every element, and it would run once or
#twice per audio file; plain sets make each membership test constant-time
test_files = set(test_list.tolist())
val_files = set(val_list.tolist())

for i in all_words:
    #directory holding every recording of the ith word
    working_path = data_path + i

    #loop through each file in the word's directory
    for j in os.listdir(working_path):
        #skip anything that is not audio (e.g. macOS '.DS_Store' files),
        #which librosa could not load
        if not j.lower().endswith('.wav'):
            continue

        #relative path in the same 'word/file.wav' form used by the split lists
        word_path = i + '/' + j
        file_path = data_path + word_path

        #choose the split: test, validation, or (by default) training
        if word_path in test_files:
            split = 'test'
        elif word_path in val_files:
            split = 'validation'
        else:
            split = 'training'

        #load audio file, GSC is sampled at 16000Hz
        y, sr = librosa.load(file_path, sr = 16000)

        #calculate MFCC (no extra parameters, so librosa's defaults apply)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)

        #save MFCC under mfccs/<split>/<word>/<file stem>; np.save adds '.npy'
        save_path = os.path.join('mfccs', split, i, os.path.splitext(j)[0])
        np.save(save_path, mfcc)

end = time.time()
#elapsed wall-clock time in seconds
print(end - start)

823.337000131607


In [160]:
#let's make sure there are the right number of MFCCs in validation, test, and train
#each split is walked exactly once and every file below it is counted, so the
#check no longer rebuilds the full os.walk list on every loop iteration and no
#longer hard-codes the number of word directories
val_count = sum(len(files) for _root, _dirs, files in os.walk('mfccs/validation'))
test_count = sum(len(files) for _root, _dirs, files in os.walk('mfccs/test'))
train_count = sum(len(files) for _root, _dirs, files in os.walk('mfccs/training'))
print(val_count, 'MFCCs in validation folder')
print(test_count, 'MFCCs in testing folder')
print(train_count, 'MFCCs in training folder')

9981 MFCCs in validation folder
11005 MFCCs in testing folder
84843 MFCCs in training folder
