# Wavelet Transform Concatenate into 1D

In this experiment we concatenate the sub-bands of the wavelet transform (WT) into a single 1D vector and feed that vector to the network.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
import librosa
import librosa.display
import pywt

In [2]:
from SimpleSpeechCommands import get_word_dict, read_list, load_data
from SimpleSpeechCommands import append_examples,partition_directory

In [3]:
# Label lookup tables in both directions, as returned by the project helper.
word_to_label,label_to_word = get_word_dict()
# NOTE(review): absolute, machine-specific path; it must be edited before the
# notebook can run on another machine.
path_dataset = '/home/edoardobucheli/TFSpeechCommands/train/audio'
#path_dataset = '/Users/edoardobucheli/Documents/MCC/Tesis/Kaggle_SpeechCommands/train/audio'
# Passed to load_data / partition_directory below; presumably the sample rate
# in Hz and the clip length in samples -- confirm against SimpleSpeechCommands.
sr = 16000
file_length = 16000

In [4]:
# Read the three split manifests in a fixed order: train, validation, test.
training_files, validation_files, testing_files = (
    read_list(path_dataset, manifest_name)
    for manifest_name in (
        'training_files.txt',
        'validation_files.txt',
        'testing_files.txt',
    )
)

In [5]:
# Load each split with identical settings; the order (train, validation, test)
# matches the order of the progress bars printed below.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    load_data(split_files, sr, file_length, path_dataset, word_to_label)
    for split_files in (training_files, validation_files, testing_files)
)

100%|██████████| 25112/25112 [00:08<00:00, 3080.25it/s]
100%|██████████| 3424/3424 [00:01<00:00, 3062.72it/s]
100%|██████████| 3430/3430 [00:01<00:00, 3059.48it/s]


In [6]:
# partition_directory is a project helper; presumably it cuts the recordings
# under '_background_noise_' into file_length-sample clips at rate sr -- TODO confirm.
backgrounds = partition_directory(path_dataset,'_background_noise_',sr,file_length)

In [7]:
# Split the background clips three ways: the first 300 go to training, the
# next 20 to validation and the remainder to test.
bg_train = backgrounds[:300]
bg_val = backgrounds[300:320]
bg_test = backgrounds[320:]

# The final argument (11) is presumably the label given to the appended
# clips -- confirm against append_examples.
# NOTE: this cell rebinds x_*/y_*, so running it twice appends the clips twice.
x_train, y_train = append_examples(x_train, y_train, bg_train, 11)
x_val, y_val = append_examples(x_val, y_val, bg_val, 11)
x_test, y_test = append_examples(x_test, y_test, bg_test, 11)

In [8]:
# Sanity check: print features then labels for train, validation and test.
for split_array in (x_train, y_train, x_val, y_val, x_test, y_test):
    print(split_array.shape)

(25412, 16000)
(25412,)
(3444, 16000)
(3444,)
(3508, 16000)
(3508,)


### Preprocess Data

In [9]:
from pywt import wavedec
from tqdm import tqdm

In [10]:
# Decompose one training clip to discover how many coefficient bands 'db4'
# yields and how long each band is.
test = wavedec(x_train[0], 'db4')

In [11]:
# Flatten the list of coefficient bands into one flat list, band after band.
new_test = [coeff for band in test for coeff in band]

In [12]:
# levels: number of coefficient bands; res: total coefficients once concatenated.
levels, res = len(test), len(new_test)
print(levels)
print(res)

12
16070


In [13]:
# Feature matrix: one row per training clip, holding all wavelet bands
# concatenated, each band scaled by its own peak magnitude.
x_train_2 = np.zeros((len(x_train), res))

for i, wave in enumerate(tqdm(x_train)):
    wt_this = wavedec(wave, 'db4')
    end = 0
    for band in wt_this:
        start = end
        end += len(band)
        peak = np.max(np.absolute(band))
        # An all-zero band (e.g. a digitally silent clip) has peak 0; dividing
        # would write NaN into the row and poison training. Such a band is
        # left at the zeros the row was initialised with.
        if peak > 0:
            x_train_2[i, start:end] = band / peak

100%|██████████| 25412/25412 [00:11<00:00, 2264.13it/s]


In [14]:
# Feature matrix: one row per validation clip, holding all wavelet bands
# concatenated, each band scaled by its own peak magnitude.
x_val_2 = np.zeros((len(x_val), res))

for i, wave in enumerate(tqdm(x_val)):
    wt_this = wavedec(wave, 'db4')
    end = 0
    for band in wt_this:
        start = end
        end += len(band)
        peak = np.max(np.absolute(band))
        # An all-zero band (e.g. a digitally silent clip) has peak 0; dividing
        # would write NaN into the row. Such a band is left at the zeros the
        # row was initialised with.
        if peak > 0:
            x_val_2[i, start:end] = band / peak

100%|██████████| 3444/3444 [00:01<00:00, 2265.17it/s]


## Other Processing

In [15]:
# From here on file_length means the concatenated coefficient count (res), not
# the raw clip length set in the config cell; the model input shapes below are
# built from it. Re-running earlier cells after this one would pick up the new value.
file_length = res

In [16]:
from Utilities import make_oh

In [17]:
# Read the row counts from shape[0] rather than unpacking into `_`: assigning
# to `_` in IPython overwrites the "last output" variable.
N_train = x_train_2.shape[0]
N_val = x_val_2.shape[0]
#N_test = x_test_2.shape[0]

# Number of distinct labels present in the training targets.
n_classes = len(np.unique(y_train))

In [18]:
# One-hot encode the train and validation labels with the project helper.
y_train_oh, y_val_oh = (make_oh(labels) for labels in (y_train, y_val))
#y_test_oh = make_oh(y_test)

In [19]:
# Confirm that features and one-hot labels line up for train and validation.
for checked in (x_train_2, y_train_oh, x_val_2, y_val_oh):
    print(checked.shape)

(25412, 16070)
(25412, 12)
(3444, 16070)
(3444, 12)


## CRNN

In [20]:
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, CuDNNGRU
from tensorflow.keras.layers import Conv1D, MaxPool1D, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.backend import expand_dims
from tensorflow.keras.optimizers import Adam

In [21]:
def CRNN1_1D(input_shape, n_classes):
    """Build the 1D convolutional-recurrent classifier.

    Three Conv1D + MaxPool1D stages shorten the sequence, two CuDNNGRU layers
    (each followed by 10% dropout) process it, and a dense head produces
    softmax class probabilities.

    Args:
        input_shape: shape tuple handed to ``Input`` (batch axis excluded).
        n_classes: number of units in the softmax output layer.

    Returns:
        The uncompiled ``Model`` mapping the input tensor to the softmax output.
    """
    inputs = Input(input_shape)

    # Append a trailing axis of size 1 so the flat vector becomes a
    # one-channel sequence for the convolutions.
    net = Lambda(lambda q: expand_dims(q, -1), name='expand_dims')(inputs)

    # (filters, pool size) per stage; every convolution uses kernel width 9.
    for n_filters, pool_size in ((16, 8), (32, 8), (32, 6)):
        net = Conv1D(n_filters, 9, activation='relu', padding='valid')(net)
        net = MaxPool1D(pool_size)(net)

    # Two stacked recurrent layers that keep the full output sequence.
    for _stage in range(2):
        net = CuDNNGRU(32, return_sequences=True)(net)
        net = Dropout(0.1)(net)
    net = Flatten()(net)

    net = Dense(64, activation='relu')(net)
    net = Dropout(0.5)(net)

    probabilities = Dense(n_classes, activation='softmax')(net)

    return Model(inputs=inputs, outputs=probabilities)

In [22]:
# One flat vector per example; file_length was rebound to the concatenated
# coefficient count (res) earlier in the notebook.
input_shape = (file_length,)
# Learning rate handed to Adam in the compile cell.
lr = 0.001

In [23]:
# Build the CRNN for the concatenated-coefficient input and print its layer table.
crnn1D = CRNN1_1D(input_shape, n_classes)
crnn1D.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 16070)             0         
_________________________________________________________________
expand_dims (Lambda)         (None, 16070, 1)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 16062, 16)         160       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 2007, 16)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1999, 32)          4640      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 249, 32)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 241, 32)           9248      
__________

In [24]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
crnn1D.compile(
    optimizer=Adam(lr),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [25]:
# Keras documents validation_data as a tuple (x_val, y_val); a list is not part
# of the documented contract, so pass a tuple.
crnn1D.fit(x_train_2, y_train_oh,
           batch_size=256, epochs=50,
           validation_data=(x_val_2, y_val_oh),
           shuffle=True)

Train on 25412 samples, validate on 3444 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f4fcc4f4f98>

## CNN 1D

In [26]:
from tensorflow.keras.layers import Conv1D, MaxPool1D, Dropout, Activation
from tensorflow.keras.layers import GlobalMaxPool1D, Dense, Input, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.backend import expand_dims

In [27]:
def conv1d_v1(input_shape,n_classes):
    """Build the purely convolutional 1D classifier.

    Three blocks of (Conv1D, Conv1D, MaxPool1D, Dropout) are followed by two
    256-filter convolutions, a global max pool over time, two dense layers and
    a softmax output.

    Args:
        input_shape: shape tuple handed to ``Input`` (batch axis excluded).
        n_classes: number of units in the softmax output layer.

    Returns:
        The uncompiled ``Model`` mapping the input tensor to the softmax output.
    """
    inputs = Input(shape=input_shape)

    # Append a trailing axis of size 1 so the flat vector becomes a
    # one-channel sequence for the convolutions.
    net = Lambda(lambda q: expand_dims(q, -1), name='expand_dims')(inputs)

    # (filters, kernel width, pool size) for each double-convolution block.
    for n_filters, kernel, pool_size in ((16, 9, 16), (32, 3, 4), (32, 3, 4)):
        net = Conv1D(n_filters, kernel, activation='relu', padding='valid')(net)
        net = Conv1D(n_filters, kernel, activation='relu', padding='valid')(net)
        net = MaxPool1D(pool_size)(net)
        net = Dropout(0.1)(net)

    # Final wide convolutions, then collapse the time axis.
    net = Conv1D(256, 3, activation='relu', padding='valid')(net)
    net = Conv1D(256, 3, activation='relu', padding='valid')(net)
    net = GlobalMaxPool1D()(net)

    net = Dense(64, activation='relu')(net)
    net = Dense(128, activation='relu')(net)

    probabilities = Dense(n_classes, activation='softmax')(net)

    return Model(inputs=inputs, outputs=probabilities)

In [28]:
# Same settings as the CRNN section, restated so this section reads on its own.
# file_length was rebound to the concatenated coefficient count (res) earlier.
input_shape = (file_length,)
# Learning rate handed to Adam in the compile cell.
lr = 0.001

In [29]:
# Build the 1D CNN for the concatenated-coefficient input and print its layer table.
cnn1d = conv1d_v1(input_shape,n_classes)
cnn1d.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 16070)             0         
_________________________________________________________________
expand_dims (Lambda)         (None, 16070, 1)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 16062, 16)         160       
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 16054, 16)         2320      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1003, 16)          0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 1003, 16)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1001, 32)          1568      
__________

In [30]:
# Adam is not re-imported in this section; it comes from the CRNN section's imports.
cnn1d.compile(
    optimizer=Adam(lr),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [31]:
# Keras documents validation_data as a tuple (x_val, y_val); a list is not part
# of the documented contract, so pass a tuple.
cnn1d.fit(x_train_2, y_train_oh,
          batch_size=256, epochs=50,
          validation_data=(x_val_2, y_val_oh))

Train on 25412 samples, validate on 3444 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f4fcb355908>

## attRNN

In [32]:
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import Lambda, Dot, Softmax
from tensorflow.keras.layers import Conv1D, Reshape, Permute
from tensorflow.keras.layers import Bidirectional, CuDNNLSTM, MaxPool1D
from tensorflow.keras.activations import relu, softmax
from tensorflow.keras.models import Model
from tensorflow.keras.backend import squeeze,stack, expand_dims

In [33]:
def AttRNNSpeechModelWave(input_shape, n_classes, query_index=16):
    """Build the attention-RNN classifier over a 1D input vector.

    A three-stage Conv1D + MaxPool1D front end shortens the sequence, two
    bidirectional CuDNNLSTM layers encode it, and a dot-product attention
    step pools the encoded sequence into one vector for the dense head.

    Args:
        input_shape: shape tuple handed to ``Input`` (batch axis excluded).
        n_classes: number of units in the softmax output layer.
        query_index: timestep of the encoded sequence whose vector is
            projected into the attention query. Defaults to 16, the value
            previously hard-coded; it must be a valid index into the
            sequence left after the convolution/pooling stages.

    Returns:
        The uncompiled ``Model`` mapping the input tensor to the softmax output.
    """
    X_input = Input(input_shape)

    # Append a trailing axis of size 1 so the flat vector becomes a
    # one-channel sequence for the convolutions.
    X = Lambda(lambda q: expand_dims(q, -1), name='expand_dims') (X_input)

    X = Conv1D(16,9, activation='relu', padding='valid')(X)
    X = MaxPool1D(8)(X)

    X = Conv1D(32,9,activation='relu',padding='valid')(X)
    X = MaxPool1D(8)(X)

    X = Conv1D(32,9,activation='relu',padding='valid')(X)
    X = MaxPool1D(6)(X)

    X = Bidirectional(CuDNNLSTM(64, return_sequences = True)) (X)
    X = Dropout(0.5)(X)
    X = Bidirectional(CuDNNLSTM(64, return_sequences = True)) (X)
    X = Dropout(0.5)(X)

    # Take the encoder output at one timestep and project it into the query.
    xFirst = Lambda(lambda q: q[:,query_index]) (X)
    query = Dense(128) (xFirst)
    query = Dropout(0.5)(query)

    # One score per timestep: the query dotted with each encoder vector.
    attScores = Dot(axes=[1,2])([query, X])
    attScores = Softmax(name='attSoftmax')(attScores)

    # Attention-weighted sum of the encoder vectors over time.
    attVector = Dot(axes=[1,1])([attScores, X])

    X = Dense(64, activation = 'relu')(attVector)
    X = Dropout(0.5)(X)
    X = Dense(32)(X)
    X = Dropout(0.5)(X)

    X = Dense(n_classes, activation = 'softmax', name='output')(X)

    model = Model(inputs = X_input, outputs = X)

    return model

In [34]:
# Same settings as the earlier model sections, restated so this section reads on its own.
# file_length was rebound to the concatenated coefficient count (res) earlier.
input_shape = (file_length,)
# Learning rate handed to Adam in the compile cell.
lr = 0.001

In [35]:
# Build the attention RNN for the concatenated-coefficient input and print its layer table.
attRNN = AttRNNSpeechModelWave(input_shape,n_classes)
attRNN.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 16070)        0                                            
__________________________________________________________________________________________________
expand_dims (Lambda)            (None, 16070, 1)     0           input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 16062, 16)    160         expand_dims[0][0]                
__________________________________________________________________________________________________
max_pooling1d_6 (MaxPooling1D)  (None, 2007, 16)     0           conv1d_11[0][0]                  
__________________________________________________________________________________________________
conv1d_12 

In [36]:
# Adam is not re-imported in this section; it comes from the CRNN section's imports.
attRNN.compile(
    optimizer=Adam(lr),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [37]:
# Keras documents validation_data as a tuple (x_val, y_val); a list is not part
# of the documented contract, so pass a tuple.
attRNN.fit(x_train_2, y_train_oh,
           batch_size=256, epochs=50,
           validation_data=(x_val_2, y_val_oh))

Train on 25412 samples, validate on 3444 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f4fcb032f98>