Code assumes that data is in same directory as capstone_repo directory:  
```
capstone_repo  
│   README.md  
│   gsc_mfcc_extraction.ipynb    
│  
speech_commands_v0.02  
└───backward  
│   │   file01.wav  
│   │   file02.wav (etc)  
└───bed  
│   |..etc  
```

In [1]:
#import libraries
import torch
import numpy as np
import os
import IPython.display as ipd
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
import time
import pandas as pd
import librosa

In [2]:
# data_path = '/Users/Shared/back up fix/speech_commands_v0.02/'
data_path = '../speech_commands_v0.02/'

In [3]:
# get list of word directories in speech commands dataset
list(os.walk(data_path))[0]

('../speech_commands_v0.02/',
 ['backward',
  'bed',
  'bird',
  'cat',
  'dog',
  'down',
  'eight',
  'five',
  'follow',
  'forward',
  'four',
  'go',
  'happy',
  'house',
  'learn',
  'left',
  'marvin',
  'nine',
  'no',
  'off',
  'on',
  'one',
  'right',
  'seven',
  'sheila',
  'six',
  'stop',
  'three',
  'tree',
  'two',
  'up',
  'visual',
  'wow',
  'yes',
  'zero',
  '_background_noise_'],
 ['.DS_Store',
  'LICENSE',
  'README.md',
  'testing_list.txt',
  'validation_list.txt'])

In [4]:
all_words = ['backward',
  'bed',
  'bird',
  'cat',
  'dog',
  'down',
  'eight',
  'five',
  'follow',
  'forward',
  'four',
  'go',
  'happy',
  'house',
  'learn',
  'left',
  'marvin',
  'nine',
  'no',
  'off',
  'on',
  'one',
  'right',
  'seven',
  'sheila',
  'six',
  'stop',
  'three',
  'tree',
  'two',
  'up',
  'visual',
  'wow',
  'yes',
  'zero']

In [5]:
#only these 20 words will act as keywords
word_list = ['up','down','left', 'right', 'stop', 'go', 'yes', 'no', 'on', 'off', 'one', 'two', 
             'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'zero']
len(word_list)

20

In [6]:
#the other 15 words will be combined into one class of rejection words
rejection_words = [i for i in all_words if i not in word_list]
print(rejection_words)
len(rejection_words)

['backward', 'bed', 'bird', 'cat', 'dog', 'follow', 'forward', 'happy', 'house', 'learn', 'marvin', 'sheila', 'tree', 'visual', 'wow']


15

In [7]:
#test to see if individual audio files will work correctly
y, sr = librosa.load((data_path + '/bird/0a7c2a8d_nohash_0.wav'), sr = 16000)
y1, sr1 = librosa.load((data_path + '/down/00f0204f_nohash_0.wav'), sr = 16000)

In [8]:
# def quantize_data(data, num_bits = 8, data_min=-1, data_max=1):
#     '''
#     Quantizes audio data based on number of bits and desired minimum and maximum
    
#     Parameters:
#     ------------
#     data - array containing time series of audio
#     num_bits - integer specifying number of bits to use, default 8 for MAX78000
#     data_min - desired minimum of quantized data
#     data_max - desired maximum of quantized data
    
#     Returns:
#     ------------
#     q_data - array with quantized data between data_min and data_max
#     '''
    
#     step_size = 2.0/(2**num_bits)
#     max_val = 2**num_bits -1
#     q_data = np.round(data / step_size)
#     q_data = np.clip(q_data, np.round(-max_val / 2), np.round(max_val/2))
#     q_data = q_data / (2**num_bits / 2)
#     q_data = np.clip(q_data, data_min, data_max)
#     return q_data

In [9]:
def quantize_data(data, num_bits=8):
    step_size = 2.0/2**num_bits
    max_val = 2**num_bits -1
    q_data = np.round((data - (-1))/ step_size)
    q_data = np.clip(q_data, 0, max_val)
    q_data /= 256.
    q_data = np.round(((q_data - 0.5) * 256))
    q_data = np.clip(q_data, -128, 127) / 128.
    return q_data

In [10]:
#let's test if quantize audio works as intended:
print(len(np.unique(y)))
yq = quantize_data(y)
print(len(np.unique(yq)))

4867
75


Converts test file y from 4867 values to 75 unique values. Is this what we should expect? Since our quantize function works with a uniform spacing of step_size, our y test file should have a range of about 75 / 256 before quantization.

In [11]:
percent = np.round(((y.max() - y.min()) / 2),3)
print(percent, 'percent of possible values are represented in original audio file')
print('So we should expect approximately', percent * 256, 'unique values in the transformed file')

0.289 percent of possible values are represented in original audio file
So we should expect approximately 73.984 unique values in the transformed file


Let's try with another file y1

In [12]:
expected = np.round(((y1.max() - y1.min()) / 2),3) * 256
yq1 = quantize_data(y1)
print(expected, 'values expected')
print(len(np.unique(yq1)), 'unique values after')

45.568 values expected
45 unique values after


Looks good!

In [14]:
#loop through all files, calculate MFCCs, save them to the appropriate directory
start = time.time()

base_dir = os.path.join('..','speech_commands_quantized')
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

for i in all_words:
    #move to the directory for the ith word
    working_path = data_path + "/" + i
    print(i)

    save_dir = os.path.join(base_dir, i)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    #loop through each file in the word's directory
    for j in os.listdir(working_path):
        #define paths we'll need
        word_path = i + '/' + j
        file_path = data_path + word_path
        
        #load audio file, GSC is sampled at 16000Hz
        y, sr = librosa.load(file_path, sr = 16000)
        quant = quantize_data(y, num_bits=8)
        
        save_path = os.path.join(base_dir, i, os.path.splitext(j)[0])
        
        np.save(save_path, quant)
        
end = time.time()
print(end - start)

backward
bed
bird
cat
dog
down
eight
five
follow
forward
four
go
happy
house
learn
left
marvin
nine
no
off
on
one
right
seven
sheila
six
stop
three
tree
two
up
visual
wow
yes
zero
824.4822390079498
