The code assumes that the dataset directory (`speech_commands_v0.02`) sits next to the `capstone_repo` directory:  
```
capstone_repo  
│   README.md  
│   gsc_mfcc_extraction.ipynb    
│  
speech_commands_v0.02  
└───backward  
│   │   file01.wav  
│   │   file02.wav (etc)  
└───bed  
│   |..etc  
```

In [2]:
#! pip install torch

Collecting torch
  Downloading torch-1.11.0-cp39-none-macosx_10_9_x86_64.whl (129.9 MB)
[K     |████████████████████████████████| 129.9 MB 22.8 MB/s eta 0:00:01
Installing collected packages: torch
Successfully installed torch-1.11.0


In [4]:
#! pip install torchaudio

Collecting torchaudio
  Downloading torchaudio-0.11.0-cp39-cp39-macosx_10_15_x86_64.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 3.3 MB/s eta 0:00:01
Installing collected packages: torchaudio
Successfully installed torchaudio-0.11.0


In [6]:
#import libraries
import torch
import numpy as np
import os
import IPython.display as ipd
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
import time
import pandas as pd

In [34]:
# Root directory of the speech commands dataset. Set the GSC_DATA_PATH
# environment variable to point at a different copy; when it is unset the
# original location is used. os.path.join(..., '') guarantees a trailing
# separator, so the value stays safe to use with plain string concatenation.
data_path = os.path.join(
    os.environ.get('GSC_DATA_PATH', '/Users/Shared/back up fix/speech_commands_v0.02/'),
    '',
)

In [8]:
# Get the dataset's top-level listing: (root, word directories, loose files).
# next() consumes only the first os.walk() entry, so the word sub-directories
# are not traversed just to have their results thrown away.
next(os.walk(data_path))

('/Users/Shared/back up fix/speech_commands_v0.02',
 ['right',
  'eight',
  'cat',
  'tree',
  'backward',
  'learn',
  'bed',
  'happy',
  'go',
  'dog',
  'no',
  'wow',
  'follow',
  'nine',
  'left',
  'stop',
  'three',
  '_background_noise_',
  'sheila',
  'one',
  'bird',
  'zero',
  'seven',
  'up',
  'visual',
  'marvin',
  'two',
  'house',
  'down',
  'six',
  'yes',
  'on',
  'five',
  'forward',
  'off',
  'four'],
 ['.DS_Store',
  'validation_list.txt',
  'LICENSE',
  'README.md',
  'testing_list.txt'])

In [9]:
# The 35 spoken words of the dataset in alphabetical order
# (the '_background_noise_' directory is not included).
all_words = [
    'backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight',
    'five', 'follow', 'forward', 'four', 'go', 'happy', 'house',
    'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on',
    'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three',
    'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero',
]

In [10]:
# The 20 words that act as keywords: ten commands followed by the ten digits.
word_list = (
    'up down left right stop go yes no on off '
    'one two three four five six seven eight nine zero'
).split()
len(word_list)

20

In [11]:
# Every dataset word that is not a keyword is pooled into one class of
# rejection words (dataset order is preserved).
rejection_words = [word for word in all_words if word not in word_list]
print(rejection_words)
len(rejection_words)

['backward', 'bed', 'bird', 'cat', 'dog', 'follow', 'forward', 'happy', 'house', 'learn', 'marvin', 'sheila', 'tree', 'visual', 'wow']


15

In [12]:
#! pip install librosa

Collecting librosa
  Downloading librosa-0.9.1-py3-none-any.whl (213 kB)
[K     |████████████████████████████████| 213 kB 4.1 MB/s eta 0:00:01
[?25hCollecting audioread>=2.1.5
  Downloading audioread-2.1.9.tar.gz (377 kB)
[K     |████████████████████████████████| 377 kB 22.9 MB/s eta 0:00:01
[?25hCollecting resampy>=0.2.2
  Downloading resampy-0.2.2.tar.gz (323 kB)
[K     |████████████████████████████████| 323 kB 46.1 MB/s eta 0:00:01
Collecting pooch>=1.0
  Downloading pooch-1.6.0-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 9.9 MB/s  eta 0:00:01
Collecting soundfile>=0.10.2
  Downloading SoundFile-0.10.3.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-macosx_10_5_x86_64.macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.whl (613 kB)
[K     |████████████████████████████████| 613 kB 141.2 MB/s eta 0:00:01
Building wheels for collected packages: audioread, resampy
  Building wheel for audioread (setup.py) ... [?25ldone
[?25h 

In [13]:
import librosa

In [14]:
# Smoke test: load one clip to check that audio decoding works.
# The path is derived from data_path so it follows the configured dataset
# location instead of repeating the absolute directory.
y, sr = librosa.load(
    os.path.join(data_path, 'bird', '0a7c2a8d_nohash_0.wav'),
    sr=16000,
)

In [18]:
# The quantization step needs a torch tensor, not the numpy time series that
# librosa returns. torch.as_tensor converts an ndarray without copying and
# hands back an existing tensor unchanged, so re-running this cell is safe
# (torch.from_numpy raises TypeError once y is already a tensor).
y = torch.as_tensor(y)

In [21]:
def quantize_data(data, num_bits=8):
    """Quantize audio samples to ``2**num_bits`` levels, rescaled to [-1, 1).

    The samples are shifted from [-1, 1] to [0, 2], mapped to unsigned
    integer codes ``0 .. 2**num_bits - 1``, then re-centred on zero and
    divided by ``2**(num_bits - 1)``. Without the shift every negative
    sample would be clamped to code 0 and silence would come out as -1.0.

    Args:
        data: torch tensor of audio samples. Values are assumed to lie in
            [-1.0, 1.0] (TODO confirm for inputs that do not come from
            librosa); anything outside that range saturates.
        num_bits: number of quantization bits (default 8).

    Returns:
        A new tensor with the same shape as ``data`` whose values are
        multiples of ``1 / 2**(num_bits - 1)`` in
        ``[-1, 1 - 1 / 2**(num_bits - 1)]``. ``data`` is not modified.

    Raises:
        ValueError: if ``num_bits`` is smaller than 1.
    """
    if num_bits < 1:
        raise ValueError("num_bits must be at least 1")

    levels = 2 ** num_bits          # number of distinct codes (256 for 8 bits)
    half_levels = levels / 2.0      # code that represents 0.0 (128 for 8 bits)
    step_size = 2.0 / levels        # width of one code over the [-1, 1] range
    max_val = levels - 1

    # Shift [-1, 1] up to [0, 2] first so negative samples get their own
    # codes instead of all collapsing onto 0 in the clamp below.
    q_data = torch.round((data + 1.0) / step_size)
    q_data = torch.clamp(q_data, 0, max_val)

    # Re-centre the unsigned codes on zero and scale back to [-1, 1).
    return (q_data - half_levels) / half_levels

In [44]:
# Loop through every word directory, load each clip and quantize it.
# NOTE: no MFCCs are computed here and nothing is written to disk yet;
# the loop currently only exercises loading + quantization and times it.
start = time.perf_counter()

for word in all_words:
    # directory that holds every recording of this word
    working_path = os.path.join(data_path, word)

    # sorted() makes the processing order reproducible, and with it the
    # y / quant values left in the namespace after the loop
    for file_name in sorted(os.listdir(working_path)):
        # skip anything that is not a recording (e.g. macOS '.DS_Store'),
        # which librosa would fail to decode
        if not file_name.lower().endswith('.wav'):
            continue

        file_path = os.path.join(working_path, file_name)

        # load audio file, GSC is sampled at 16000Hz
        y, sr = librosa.load(file_path, sr=16000)

        # librosa returns a numpy array; quantize_data needs a torch tensor
        y = torch.from_numpy(y)
        quant = quantize_data(y, num_bits=8)

        # TODO: persist `quant` (one file per clip, e.g. under a
        # '<word>_quant' directory); saving is not implemented yet.

end = time.perf_counter()
print(end - start)

52.561039686203


In [45]:
# Inspect the raw waveform tensor of the last clip the loop above loaded
# (the quantized result is held in `quant`, not in `y`).
y

tensor([-3.0518e-05,  6.1035e-05, -3.0518e-05,  ..., -3.0518e-05,
        -3.0518e-05, -6.1035e-05])