In [15]:
from tensorflow.keras import layers
from tensorflow.keras.layers import TimeDistributed, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
import kapre
from kapre.composed import get_melspectrogram_layer
import tensorflow as tf
import os

In [16]:
import tensorflow as tf
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import os
from scipy.io import wavfile
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from glob import glob
import argparse
import warnings

In [28]:
# Run configuration consumed by train().
args = dict(
    src_root='cleanwavs',    # root folder scanned for class sub-folders and .wav files
    batch_size=16,           # samples per generated batch
    delta_time=1.0,          # clip length in seconds (model input is sample_rate * delta_time samples)
    model_type='conv2d',     # key selecting the architecture inside train(): 'conv2d' or 'lstm'
    sample_rate=16000,       # sample rate handed to the mel-spectrogram front-end
    fn='3a3d0279',           # NOTE(review): not read by any code visible in this notebook
)

In [18]:
def Conv2D(N_LABELS=2, SR=16000, DT=1.0):
    """Build and compile a 2-D convolutional classifier on decibel-scaled mel spectrograms.

    Args:
        N_LABELS: number of output classes (units of the final softmax layer).
        SR: sample rate forwarded to the mel-spectrogram front-end.
        DT: clip duration in seconds; the model input shape is ``(int(SR*DT), 1)``.

    Returns:
        A compiled Keras ``Model`` named ``'2d_convolution'`` (Adam optimizer,
        categorical cross-entropy loss, accuracy metric).
    """
    mel_frontend = get_melspectrogram_layer(
        input_shape=(int(SR*DT), 1),
        n_mels=128,
        pad_end=True,
        n_fft=512,
        win_length=400,
        hop_length=160,
        sample_rate=SR,
        return_decibel=True,
        input_data_format='channels_last',
        output_data_format='channels_last',
    )

    # Despite the layer name, this is a LayerNormalization, not a batch norm.
    x = LayerNormalization(axis=2, name='batch_norm')(mel_frontend.output)

    # (filters, kernel, activation, conv layer name, following pool layer name or None)
    conv_stack = (
        (8, (7, 7), 'tanh', 'conv2d_tanh', 'max_pool_2d_1'),
        (16, (5, 5), 'relu', 'conv2d_relu_1', 'max_pool_2d_2'),
        (16, (3, 3), 'relu', 'conv2d_relu_2', 'max_pool_2d_3'),
        (32, (3, 3), 'relu', 'conv2d_relu_3', 'max_pool_2d_4'),
        (32, (3, 3), 'relu', 'conv2d_relu_4', None),  # last conv has no pooling
    )
    for filters, kernel, activation, conv_name, pool_name in conv_stack:
        x = layers.Conv2D(filters, kernel_size=kernel, activation=activation,
                          padding='same', name=conv_name)(x)
        if pool_name is not None:
            x = layers.MaxPooling2D(pool_size=(2, 2), padding='same',
                                    name=pool_name)(x)

    # Classification head.
    x = layers.Flatten(name='flatten')(x)
    x = layers.Dropout(rate=0.2, name='dropout')(x)
    x = layers.Dense(64, activation='relu',
                     activity_regularizer=l2(0.001), name='dense')(x)
    probabilities = layers.Dense(N_LABELS, activation='softmax',
                                 name='softmax')(x)

    model = Model(inputs=mel_frontend.input, outputs=probabilities,
                  name='2d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [19]:
def LSTM(N_LABELS=2, SR=16000, DT=1.0):
    """Build and compile a bidirectional-LSTM classifier on decibel-scaled mel spectrograms.

    Args:
        N_LABELS: number of output classes (units of the final softmax layer).
        SR: sample rate forwarded to the mel-spectrogram front-end.
        DT: clip duration in seconds; the model input shape is ``(int(SR*DT), 1)``.

    Returns:
        A compiled Keras ``Model`` named ``'long_short_term_memory'`` (Adam
        optimizer, categorical cross-entropy loss, accuracy metric).
    """
    input_shape = (int(SR*DT), 1)
    # NOTE(review): the front-end is named '2d_convolution', which looks like a
    # copy-paste from the conv model; kept as-is so layer names in previously
    # saved checkpoints still match.
    i = get_melspectrogram_layer(input_shape=input_shape,
                                 n_mels=128,
                                 pad_end=True,
                                 n_fft=512,
                                 win_length=400,
                                 hop_length=160,
                                 sample_rate=SR,
                                 return_decibel=True,
                                 input_data_format='channels_last',
                                 output_data_format='channels_last',
                                 name='2d_convolution')
    # Despite the layer name, this is a LayerNormalization, not a batch norm.
    n = LayerNormalization(axis=2, name='batch_norm')(i.output)
    # Flatten each time step into a single feature vector.
    n = TimeDistributed(layers.Reshape((-1,)), name='reshape')(n)
    k = TimeDistributed(layers.Dense(64, activation='tanh'),
                        name='td_dense_tanh')(n)
    n = layers.Bidirectional(layers.LSTM(32, return_sequences=True),
                             name='bidirectional_lstm')(k)
    # Skip connection: concatenate the LSTM input with its output on the feature axis.
    n = layers.concatenate([k, n], axis=2, name='skip_connection')
    n = layers.Dense(64, activation='relu', name='dense_1_relu')(n)
    n = layers.MaxPooling1D(name='max_pool_1d')(n)
    n = layers.Dense(32, activation='relu', name='dense_2_relu')(n)
    n = layers.Flatten(name='flatten')(n)
    n = layers.Dropout(rate=0.2, name='dropout')(n)
    n = layers.Dense(32, activation='relu',
                     activity_regularizer=l2(0.001),
                     name='dense_3_relu')(n)
    o = layers.Dense(N_LABELS, activation='softmax', name='softmax')(n)
    # The leftover debug print of the input tensor was removed here; it wrote to
    # stdout on every model build.
    model = Model(inputs=i.input, outputs=o, name='long_short_term_memory')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [20]:
import tensorflow as tf
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import os
from scipy.io import wavfile
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from glob import glob
import argparse
import warnings

In [26]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of fixed-length waveforms and one-hot labels.

    Each batch is a pair ``(X, Y)`` where ``X`` has shape
    ``(batch_size, int(sr*dt), 1)`` and ``Y`` has shape ``(batch_size, n_labels)``,
    both ``float32``.

    Args:
        wav_paths: indexable collection of paths to .wav files.
        labels: indexable collection of integer class ids, aligned with ``wav_paths``.
        sr: sample rate used (with ``dt``) to size each clip.
        dt: clip duration in seconds.
        n_labels: number of classes for the one-hot encoding.
        batch_size: number of clips per batch.
        shuffle: whether to reshuffle the sample order at the end of every epoch.
    """

    def __init__(self, wav_paths, labels, sr, dt,n_labels,
                 batch_size=32, shuffle=True):
        self.wav_paths = wav_paths
        self.labels = labels
        self.sr = sr
        self.dt = dt
        self.n_labels = n_labels
        self.batch_size = batch_size
        # Honour the caller's choice (this used to be hard-coded to True).
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return int(np.floor(len(self.wav_paths) / self.batch_size))

    def __getitem__(self, index):
        """Load batch ``index`` from disk and return it as ``(X, Y)``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        n_samples = int(self.sr*self.dt)

        # Zero-initialised so clips shorter than n_samples end up zero-padded
        # instead of containing uninitialised memory.
        X = np.zeros((self.batch_size, n_samples, 1), dtype=np.float32)
        Y = np.zeros((self.batch_size, self.n_labels), dtype=np.float32)

        for i, k in enumerate(indexes):
            # The file's own sample rate is not checked against self.sr.
            _, wav = wavfile.read(self.wav_paths[k])
            # Truncate to the model's input length and copy in a single
            # vectorised assignment (replaces a per-sample Python loop that
            # raised IndexError on short clips).
            # assumes mono audio; reshape(-1, 1) would interleave the channels
            # of a multi-channel file - TODO confirm the dataset is mono
            samples = wav.reshape(-1, 1)[:n_samples]
            X[i, :len(samples)] = samples
            Y[i,] = to_categorical(self.labels[k], num_classes=self.n_labels)

        return X, Y

    def on_epoch_end(self):
        """Rebuild the index order, shuffling it when ``self.shuffle`` is true."""
        self.indexes = np.arange(len(self.wav_paths))
        if self.shuffle:
            np.random.shuffle(self.indexes)

In [22]:
def train(args):
    """Train the selected model on the .wav files found under ``args['src_root']``.

    The class of each file is the name of its parent directory. Ten percent of
    the files are held out for validation (fixed ``random_state``). The training
    history is written to ``logs/<model_type>_history.csv`` and the model with
    the best validation loss to ``vers_fin/<model_type>.h5``.

    Args:
        args: mapping with the keys ``'src_root'``, ``'sample_rate'``,
            ``'delta_time'``, ``'batch_size'`` and ``'model_type'``
            (``'conv2d'`` or ``'lstm'``).

    Raises:
        AssertionError: if ``model_type`` is unknown or there are fewer training
            samples than ``batch_size``.
    """
    src_root = args['src_root']
    sr = args['sample_rate']
    dt = args['delta_time']
    batch_size = args['batch_size']
    model_type = args['model_type']

    # Map to constructors rather than instances so only the requested model is
    # built, and validate the key before doing any expensive work.
    builders = {'conv2d': Conv2D,
                'lstm': LSTM}
    assert model_type in builders.keys(), '{} is unavailable'.format(model_type)

    # One class per sub-directory; stray files in src_root are ignored so they
    # do not inflate the number of output units.
    classes = sorted(d for d in os.listdir(src_root)
                     if os.path.isdir(os.path.join(src_root, d)))
    params = {'N_LABELS': len(classes),
              'SR': sr,
              'DT': dt}

    # Make sure the output locations exist before the callbacks try to write.
    os.makedirs('logs', exist_ok=True)
    os.makedirs('vers_fin', exist_ok=True)
    csv_path = os.path.join('logs', '{}_history.csv'.format(model_type))

    wav_paths = glob('{}/**'.format(src_root), recursive=True)
    # Normalise separators so the label extraction below can split on '/'.
    wav_paths = [n.replace(os.sep, '/') for n in wav_paths if n.endswith('.wav')]
    le = LabelEncoder()
    le.fit(classes)
    labels = [os.path.split(n)[0].split('/')[-1] for n in wav_paths]
    labels = le.transform(labels)
    wav_train, wav_val, label_train, label_val = train_test_split(wav_paths,
                                                                  labels,
                                                                  test_size=0.1,
                                                                  random_state=10)

    assert len(label_train) >= batch_size, 'Nbr of train samples should be higher than the batch_size'
    if len(set(label_train)) != params['N_LABELS']:
        warnings.warn('Found {}/{} labels in training data. Increase the size or change random_state.'.format(
            len(set(label_train)), params['N_LABELS']))
    if len(set(label_val)) != params['N_LABELS']:
        warnings.warn('Found {}/{} labels in validation data. Increase the size or change random_state.'.format(
            len(set(label_val)), params['N_LABELS']))

    tg = DataGenerator(wav_train, label_train, sr, dt,
                       params['N_LABELS'], batch_size=batch_size)
    vg = DataGenerator(wav_val, label_val, sr, dt,
                       params['N_LABELS'], batch_size=batch_size)
    model = builders[model_type](**params)
    cp = ModelCheckpoint('vers_fin/{}.h5'.format(model_type), monitor='val_loss',
                         save_best_only=True, save_weights_only=False,
                         mode='auto', save_freq='epoch', verbose=1)
    csv_logger = CSVLogger(csv_path, append=False)
    model.fit(tg, validation_data=vg,
              epochs=30, verbose=1,
              callbacks=[csv_logger, cp])

In [27]:
# NOTE(review): the recorded output of this cell saves checkpoints to lstm.h5, so it
# was run with args['model_type'] == 'lstm'; the args cell as saved says 'conv2d'
# (it was re-executed afterwards). Set model_type explicitly before re-running.
train(args)

Tensor("stft_5_input:0", shape=(None, 16000, 1), dtype=float32)
Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.16463, saving model to vers_fin\lstm.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.16463 to 0.09015, saving model to vers_fin\lstm.h5
Epoch 3/30
Epoch 00003: val_loss improved from 0.09015 to 0.08445, saving model to vers_fin\lstm.h5
Epoch 4/30
Epoch 00004: val_loss did not improve from 0.08445
Epoch 5/30
Epoch 00005: val_loss improved from 0.08445 to 0.05930, saving model to vers_fin\lstm.h5
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.05930
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.05930
Epoch 8/30
Epoch 00008: val_loss did not improve from 0.05930
Epoch 9/30
Epoch 00009: val_loss improved from 0.05930 to 0.03655, saving model to vers_fin\lstm.h5
Epoch 10/30
Epoch 00010: val_loss improved from 0.03655 to 0.02623, saving model to vers_fin\lstm.h5
Epoch 11/30
Epoch 00011: val_loss improved from 0.02623 to 0.02447, saving model to vers_fi

Epoch 29/30
Epoch 00029: val_loss improved from 0.01575 to 0.01381, saving model to vers_fin\lstm.h5
Epoch 30/30
Epoch 00030: val_loss did not improve from 0.01381


In [29]:
# Recorded run with args['model_type'] == 'conv2d': the log below shows the
# best-val_loss checkpoints being written to conv2d.h5.
train(args)

Tensor("stft_7_input:0", shape=(None, 16000, 1), dtype=float32)
Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.24530, saving model to vers_fin\conv2d.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.24530 to 0.06703, saving model to vers_fin\conv2d.h5
Epoch 3/30
Epoch 00003: val_loss improved from 0.06703 to 0.06112, saving model to vers_fin\conv2d.h5
Epoch 4/30
Epoch 00004: val_loss improved from 0.06112 to 0.04171, saving model to vers_fin\conv2d.h5
Epoch 5/30
Epoch 00005: val_loss improved from 0.04171 to 0.03780, saving model to vers_fin\conv2d.h5
Epoch 6/30
Epoch 00006: val_loss improved from 0.03780 to 0.03300, saving model to vers_fin\conv2d.h5
Epoch 7/30
Epoch 00007: val_loss improved from 0.03300 to 0.02751, saving model to vers_fin\conv2d.h5
Epoch 8/30
Epoch 00008: val_loss did not improve from 0.02751
Epoch 9/30
Epoch 00009: val_loss improved from 0.02751 to 0.02266, saving model to vers_fin\conv2d.h5
Epoch 10/30
Epoch 00010: val_loss improved from 0.02266 to 0.0

Epoch 28/30
Epoch 00028: val_loss did not improve from 0.00832
Epoch 29/30
Epoch 00029: val_loss did not improve from 0.00832
Epoch 30/30
Epoch 00030: val_loss did not improve from 0.00832
