In [6]:
import tensorflow as tf
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import os
from scipy.io import wavfile
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from glob import glob
import argparse
import warnings

In [7]:
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import TimeDistributed, LayerNormalization
from tensorflow.keras.layers import Conv2D, BatchNormalization, ReLU, GlobalAveragePooling2D, Dense, Softmax, Dropout, Flatten, LSTM, Reshape, Conv1D, Bidirectional, MaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
import kapre
from kapre.composed import get_melspectrogram_layer
import tensorflow as tf
import os

In [15]:
args={'src_root':'cleanwavs',
      'batch_size':16,
     'delta_time':1.0,
     'model_type':'conv2d',
     'sample_rate':16000,
      'fn':'3a3d0279'
     }

In [9]:
def Conv2D(N_LABELS=2, SR=16000, DT=1.0):
    input_shape = (int(SR*DT), 1)
    i = get_melspectrogram_layer(input_shape=input_shape,
                                 n_mels=128,
                                 pad_end=True,
                                 n_fft=512,
                                 win_length=400,
                                 hop_length=160,
                                 sample_rate=SR,
                                 return_decibel=True,
                                 input_data_format='channels_last',
                                 output_data_format='channels_last')
    
    # même size
    k = tf.keras.layers.experimental.preprocessing.Resizing(150, 150)(i.output)

    k = tf.keras.layers.Conv2D(3, (3, 3), padding="same")(k)
    # print(i.shape)
    pretrained_model = tf.keras.applications.InceptionV3(input_shape=(150, 150, 3),
                                                         include_top=False,
                                                         weights='imagenet')

    pretrained_model.trainable = False 
    last_layer = pretrained_model.get_layer('mixed7')
    x = pretrained_model(k)

    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(rate=0.2, name='dropout')(x)
    x = layers.Dense(64, activation='relu',
                     activity_regularizer=l2(0.001))(x)
    o = layers.Dense(N_LABELS, activation='softmax', name='softmax')(x)
    model = Model(inputs=i.input, outputs=o, name='2d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [10]:
def LSTM(N_LABELS=2, SR=16000, DT=1.0):
    # shape de notre data (n, time, feat)
    model_lstm = Sequential()
    input_shape = (int(SR*DT), 1)
    
    i = get_melspectrogram_layer(input_shape=input_shape,
                                 n_mels=128,
                                 pad_end=True,
                                 n_fft=512,
                                 win_length=400,
                                 hop_length=160,
                                 sample_rate=SR,
                                 return_decibel=True,
                                 input_data_format='channels_last',
                                 output_data_format='channels_last',
                                 name='2d_convolution')

    # model = Sequential()
    
    model_lstm.add(i)
    model_lstm.add(LayerNormalization(axis=2, name='layer_norm'))

    model_lstm.add(TimeDistributed(Reshape((-1,)), name='td_reshape'))
    model_lstm.add(TimeDistributed(Dense(64, activation='tanh'),
                        name='td_ds_tanh'))
    model_lstm.add(Bidirectional(layers.GRU(32, return_sequences=True),
                             name='bd_lstm'))

    model_lstm.add(Bidirectional(layers.GRU(32, return_sequences=True),
                             name='bd_lstm_2'))
    model_lstm.add(Conv1D(64, 3, activation='relu', name='ds_relu_1'))
    model_lstm.add(MaxPooling1D(name='maxp_1d'))
    model_lstm.add(Dense(32, activation='relu', name='ds_relu_2'))
    model_lstm.add(Flatten(name='flatten'))
    model_lstm.add(Dropout(rate=0.2, name='dropout'))
    model_lstm.add(Dense(32, activation='relu',
                     activity_regularizer=l2(0.001),
                     name='ds_relu_3'))


    model_lstm.add(Dense(2, activation='softmax'))
    model_lstm.summary()
    model_lstm.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['acc'])
    return model_lstm

In [11]:
import tensorflow as tf
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import os
from scipy.io import wavfile
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from glob import glob
import argparse
import warnings

In [12]:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, wav_paths, labels, sr, dt,n_labels,
                 batch_size=32, shuffle=True):
        self.wav_paths = wav_paths
        self.labels = labels
        self.sr = sr
        self.dt = dt
        self.n_labels = n_labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    # retourne le nbr de batch dans le dataset

    def __len__(self):
        return int(np.floor(len(self.wav_paths) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        wav_paths = [self.wav_paths[k] for k in indexes]
        labels = [self.labels[k] for k in indexes]

        # genere un batch de time data
        X = np.empty((self.batch_size, int(self.sr*self.dt), 1),
                     dtype=np.float32)
        Y = np.empty((self.batch_size, self.n_labels), dtype=np.float32)

        for i, (path, label) in enumerate(zip(wav_paths, labels)):
            rate, wav = wavfile.read(path)
            X[i,] = X[i,][:16000]
            # X[i,] = wav.reshape(-1, 1)
            for j in range(len(X[i,])):
                X[i,][j] = wav.reshape(-1, 1)[j]
            Y[i,] = to_categorical(label, num_classes=self.n_labels)

        return X, Y

    # mélange les chemins et les étiquettes des fichiers audio à la fin de chaque époque si shufflec'est True.
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.wav_paths))
        if self.shuffle:
            np.random.shuffle(self.indexes)

In [13]:
def train(args):
    src_root = args['src_root']
    sr = args['sample_rate']
    dt = args['delta_time']
    batch_size = args['batch_size']
    model_type = args['model_type']
    params = {'N_LABELS': len(os.listdir(src_root)), 'SR': sr, 'DT': dt}
    models = {'conv2d': Conv2D(**params), 'lstm': LSTM(**params)}
    if model_type not in models.keys():
        raise ValueError(f"{model_type} not an available model")
    csv_path = os.path.join('logs', f'{model_type}_history.csv')

    wav_paths = [x.replace(os.sep, '/') for x in glob('{}/**'.format(src_root), recursive=True) if '.wav' in x]
    classes = sorted(os.listdir(src_root))
    le = LabelEncoder()
    le.fit(classes)
    labels = [os.path.split(x)[0].split('/')[-1] for x in wav_paths]
    labels = le.transform(labels)
    wav_train, wav_val, label_train, label_val = train_test_split(wav_paths, labels, test_size=0.1, random_state=10)

    if len(label_train) < batch_size:
        raise ValueError('Nbr of train audios must be superior than batch_size')
    if len(set(label_train)) != params['N_LABELS']:
        warnings.warn(f'Found {len(set(label_train))}/{params["N_LABELS"]} classes in training data. Increase the size of data or change random_state.')
    if len(set(label_val)) != params['N_LABELS']:
        warnings.warn(f'Found {len(set(label_val))}/{params["N_LABELS"]} classes in validation data. Increase the size of data  or change random_state.')

    tg = DataGenerator(wav_train, label_train, sr, dt, params['N_LABELS'], batch_size=batch_size)
    vg = DataGenerator(wav_val, label_val, sr, dt, params['N_LABELS'], batch_size=batch_size)
    model = models[model_type]
    cp = ModelCheckpoint(f'models/{model_type}.h5', monitor='val_loss', save_best_only=True, save_weights_only=False, mode='auto', save_freq='epoch', verbose=1)
    csv_logger = CSVLogger(csv_path, append=False)
    model.fit(tg, validation_data=vg, epochs=30, verbose=1, callbacks=[csv_logger, cp])

In [19]:
#lstm
train(args)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
2d_convolution (Sequential)  (None, 100, 128, 1)       0         
_________________________________________________________________
layer_norm (LayerNormalizati (None, 100, 128, 1)       256       
_________________________________________________________________
td_reshape (TimeDistributed) (None, 100, 128)          0         
_________________________________________________________________
td_ds_tanh (TimeDistributed) (None, 100, 64)           8256      
_________________________________________________________________
bd_lstm (Bidirectional)      (None, 100, 64)           18816     
_________________________________________________________________
bd_lstm_2 (Bidirectional)    (None, 100, 64)           18816     
_________________________________________________________________
ds_relu_1 (Conv1D)           (None, 98, 64)            1

Epoch 24/30
Epoch 00024: val_loss did not improve from 0.02284
Epoch 25/30
Epoch 00025: val_loss did not improve from 0.02284
Epoch 26/30
Epoch 00026: val_loss did not improve from 0.02284
Epoch 27/30
Epoch 00027: val_loss did not improve from 0.02284
Epoch 28/30
Epoch 00028: val_loss did not improve from 0.02284
Epoch 29/30
Epoch 00029: val_loss did not improve from 0.02284
Epoch 30/30
Epoch 00030: val_loss did not improve from 0.02284


In [16]:
train(args)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
2d_convolution (Sequential)  (None, 100, 128, 1)       0         
_________________________________________________________________
layer_norm (LayerNormalizati (None, 100, 128, 1)       256       
_________________________________________________________________
td_reshape (TimeDistributed) (None, 100, 128)          0         
_________________________________________________________________
td_ds_tanh (TimeDistributed) (None, 100, 64)           8256      
_________________________________________________________________
bd_lstm (Bidirectional)      (None, 100, 64)           18816     
_________________________________________________________________
bd_lstm_2 (Bidirectional)    (None, 100, 64)           18816     
_________________________________________________________________
ds_relu_1 (Conv1D)           (None, 98, 64)           

KeyboardInterrupt: 