In [1]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
import librosa
import librosa.display

In [2]:
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report

## Load Data

In [3]:
from SimpleSpeechCommands import get_word_dict, read_list, load_data
from SimpleSpeechCommands import append_examples,partition_directory

In [4]:
import os

# Lookup tables between word strings and integer class labels (both directions),
# as returned by the project helper.
word_to_label,label_to_word = get_word_dict()

# Root folder of the audio files. Set the SPEECH_COMMANDS_AUDIO_DIR environment
# variable to run on another machine; the original location is the fallback.
path_dataset = os.environ.get(
    'SPEECH_COMMANDS_AUDIO_DIR',
    '/home/edoardobucheli/TFSpeechCommands/train/audio',
)

# Both values are forwarded to the loading helpers below.
sr = 16000           # presumably the sample rate in Hz - confirm in SimpleSpeechCommands
file_length = 16000  # presumably samples per clip (one second at sr) - confirm

In [5]:
# Read the three split manifests that live next to the audio folders.
training_files, validation_files, testing_files = (
    read_list(path_dataset, manifest)
    for manifest in ('training_files.txt', 'validation_files.txt', 'testing_files.txt')
)

In [6]:
# Load every split with identical settings; load_data hands back an
# (examples, labels) pair for each file list.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    load_data(file_list, sr, file_length, path_dataset, word_to_label)
    for file_list in (training_files, validation_files, testing_files)
)

100%|██████████| 25112/25112 [00:08<00:00, 2993.26it/s]
100%|██████████| 3424/3424 [00:01<00:00, 2981.59it/s]
100%|██████████| 3430/3430 [00:01<00:00, 3015.75it/s]


In [7]:
# Collect clips from the '_background_noise_' folder via the project helper.
# Presumably each returned clip is file_length samples long at rate sr -
# TODO confirm in SimpleSpeechCommands.partition_directory.
backgrounds = partition_directory(path_dataset,'_background_noise_',sr,file_length)

In [8]:
# Add the background clips to each split under label 11 (presumably the
# background/silence class - confirm against word_to_label).
# The slices do not overlap: first 300 clips -> train, next 20 -> validation,
# everything from index 320 on -> test.
# NOTE(review): every call overwrites its own inputs, so re-running this cell
# feeds the already-extended arrays back into append_examples.
x_train,y_train = append_examples(x_train,y_train,backgrounds[:300],11)
x_val,y_val = append_examples(x_val,y_val,backgrounds[300:320],11)
x_test,y_test = append_examples(x_test,y_test,backgrounds[320:],11)

In [9]:
# Sanity check: one shape per line, examples then labels for each split.
print(
    *(arr.shape for arr in (x_train, y_train, x_val, y_val, x_test, y_test)),
    sep='\n',
)

(25412, 16000)
(25412,)
(3444, 16000)
(3444,)
(3508, 16000)
(3508,)


## Preprocess Data

In [10]:
from ProcessAudio import normalize_waveforms, normalize_2D
from ProcessAudio import power_spect_set, mel_spec_set, mfcc_set

Normalization can be applied to (1) the raw waveform or (2) the spectrogram; the cells below apply both, in that order.

In [11]:
# Spectrogram settings passed to power_spect_set in the cells below.
# presumably the FFT window size in samples - confirm in ProcessAudio
n_fft = 512
# presumably the step between consecutive frames in samples; equal to n_fft here
hop_length = 512

In [12]:
# Training split: normalise the waveforms in place, then run them through
# power_spect_set and normalise the resulting 2-D representation.
x_train = normalize_waveforms(x_train)
x_train_2 = normalize_2D(power_spect_set(x_train, sr, n_fft, hop_length))

100%|██████████| 25412/25412 [00:10<00:00, 2492.04it/s]


In [13]:
# Validation split: same two-stage pipeline as the training split
# (waveform normalisation, then power_spect_set followed by normalize_2D).
x_val = normalize_waveforms(x_val)
x_val_2 = normalize_2D(power_spect_set(x_val, sr, n_fft, hop_length))

100%|██████████| 3444/3444 [00:01<00:00, 2520.45it/s]


In [14]:
# Test split: same two-stage pipeline as the other splits
# (waveform normalisation, then power_spect_set followed by normalize_2D).
x_test = normalize_waveforms(x_test)
x_test_2 = normalize_2D(power_spect_set(x_test, sr, n_fft, hop_length))

100%|██████████| 3508/3508 [00:01<00:00, 2547.71it/s]


In [15]:
def average_dob(x,freq_res,frames):
    """Halve the row count of each 2-D array in ``x`` by averaging adjacent rows.

    Output row ``j`` is the mean of input rows ``2*j`` and ``2*j + 1``. If an
    input has an odd number of rows, its last (unpaired) row is copied through
    unchanged instead of being dropped. Output rows for which the input has no
    data at all are left at zero.

    Parameters
    ----------
    x : sequence of 2-D array-likes
        One array per example, each with ``frames`` columns.
    freq_res : int
        Number of rows in each output array.
    frames : int
        Number of columns in each input and output array.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(x), freq_res, frames)``.
    """
    x_2 = np.zeros((len(x),freq_res,frames))

    for i,spec in enumerate(x):
        spec = np.asarray(spec)
        n_rows = spec.shape[0]

        # Number of complete (even, odd) row pairs that fit in the output.
        n_pairs = min(freq_res, n_rows // 2)
        if n_pairs:
            evens = spec[0:2*n_pairs:2]
            odds = spec[1:2*n_pairs:2]
            x_2[i,:n_pairs,:] = (evens + odds)/2

        # An odd row count leaves one trailing row without a partner; keep it
        # as-is when the output still has a free row for it.
        if n_pairs < freq_res and n_rows % 2 == 1:
            x_2[i,n_pairs,:] = spec[n_rows - 1]

    return x_2

In [16]:
# Run average_dob over every split with the same target size:
# 129 rows by 32 frames per example.
x_train_2, x_val_2, x_test_2 = (
    average_dob(spec_set, 129, 32)
    for spec_set in (x_train_2, x_val_2, x_test_2)
)

## Other Processing

In [17]:
from Utilities import make_oh

In [18]:
# Example count of each split, taken from the leading axis.
N_train = x_train_2.shape[0]
N_val = x_val_2.shape[0]
N_test = x_test_2.shape[0]

# Frame count read from the data itself (third axis) rather than recomputed
# from sr and hop_length, so it always agrees with the arrays that get
# reshaped later in the notebook.
frames = x_train_2.shape[2]

# Number of distinct labels present in the training targets.
n_classes = len(np.unique(y_train))

In [19]:
# Convert the integer label vector of each split with make_oh
# (one-hot encoding, going by the helper's name and the *_oh outputs).
y_train_oh, y_val_oh, y_test_oh = (
    make_oh(labels) for labels in (y_train, y_val, y_test)
)

In [20]:
# Sanity check: one shape per line, features then encoded labels for each split.
print(
    *(
        arr.shape
        for arr in (x_train_2, y_train_oh, x_val_2, y_val_oh, x_test_2, y_test_oh)
    ),
    sep='\n',
)

(25412, 129, 32)
(25412, 12)
(3444, 129, 32)
(3444, 12)
(3508, 129, 32)
(3508, 12)


In [21]:
# Append a trailing axis of length 1 to every split, giving arrays of shape
# (examples, 129, frames, 1).
x_train_2, x_val_2, x_test_2 = (
    np.reshape(spec_set, (count, 129, frames, 1))
    for spec_set, count in (
        (x_train_2, N_train),
        (x_val_2, N_val),
        (x_test_2, N_test),
    )
)