In [None]:
#

### Processing Data

In [None]:
import os
import numpy as np
import tensorflow as tf # 2.3.0

import librosa
from pathlib import Path
import pickle

# Fix TensorFlow's global random seed so runs are repeatable.
tf.random.set_seed(1234)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Seed shared by the NumPy shuffles and the tf.data shuffle buffers below.
SHUFFLE_SEED = 43
# Fraction of the collected files held out for validation.
VALID_SPLIT = 0.1
# Sample rate (Hz) passed to librosa when decoding audio.
SAMPLING_RATE = 16000
BATCH_SIZE = 32  # alternative value left by the author: 128
DURATION = 5.0  # clip length in seconds (alternative value left by the author: 20)

In [None]:
# Root of the training audio. Its entries are listed to obtain the class names,
# and each class directory is scanned for .mp3 files by get_audio_metadata().
DATASET_AUDIO_PATH = '../your/pathto/train_audio'

In [None]:
def has_enough_samples(audio_path):
    """Check whether a decoded file holds at least DURATION seconds of samples.

    The file is decoded at SAMPLING_RATE (the same rate decode_mp3 uses) so the
    sample count is comparable with the clip length the pipeline expects.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to decode.

    Returns
    -------
    bool
        True when the number of decoded samples is at least
        int(SAMPLING_RATE * DURATION).
    """
    # The second return value (the sample rate) is not needed here.
    audio, _ = librosa.load(audio_path, sr=SAMPLING_RATE)
    return len(audio) >= int(SAMPLING_RATE * DURATION)

def is_long_audio(audio_path, threshold=DURATION):
    """Tell whether an audio file is long enough to be used.

    A file qualifies when its reported duration exceeds `threshold`, or when it
    equals `threshold` exactly and has_enough_samples() confirms the sample count.
    """
    clip_seconds = librosa.get_duration(filename=audio_path)
    if clip_seconds > threshold:
        return True
    # Borderline case: only decode the file when the duration matches exactly.
    return bool(clip_seconds == threshold and has_enough_samples(audio_path))

def get_audio_metadata():
    """Collect usable audio paths and their integer labels from the dataset tree.

    Reads the module-level `class_names`; the label of a file is the index of
    its class in that list. Only `.mp3` files accepted by is_long_audio() are
    kept.

    Returns
    -------
    tuple[list[str], list[int]]
        Parallel lists of file paths and labels.
    """
    audio_paths = []
    labels = []
    for label, name in enumerate(class_names):
        print(f"Processing class {name}")
        dir_path = Path(DATASET_AUDIO_PATH) / name
        # os.listdir returns entries in arbitrary order; sorting keeps the file
        # order (and so the seeded shuffle/split downstream) reproducible.
        mp3_paths = (
            os.path.join(dir_path, filename)
            for filename in sorted(os.listdir(dir_path))
            if filename.endswith(".mp3")
        )
        class_sample_paths = [path for path in mp3_paths if is_long_audio(path)]
        audio_paths.extend(class_sample_paths)
        labels.extend([label] * len(class_sample_paths))

    print(
        f"Found {len(audio_paths)} files belonging to {len(class_names)} classes."
        )
    return audio_paths, labels

def tf_decode_mp3(mp3_path):
    """Graph-friendly wrapper around decode_mp3.

    Runs decode_mp3 through tf.py_function and attaches the static shape
    [int(DURATION * SAMPLING_RATE), 1] to the float32 result.
    """
    expected_samples = int(DURATION * SAMPLING_RATE)
    decoded = tf.py_function(decode_mp3, [mp3_path], [tf.float32])
    # py_function returns a one-element list because Tout is a one-element list.
    [waveform] = decoded
    waveform.set_shape([expected_samples, 1])
    return waveform

def decode_mp3(mp3_path):
    """Read an mp3 file and return exactly DURATION seconds of audio.

    Parameters
    ----------
    mp3_path : tf.Tensor
        Scalar string tensor holding the file path (as passed by tf.py_function).

    Returns
    -------
    np.ndarray
        Array of shape (int(DURATION * SAMPLING_RATE), 1) with the decoded samples.
    """
    mp3_path = mp3_path.numpy().decode("utf-8")
    audio, _ = librosa.load(mp3_path, duration=DURATION, sr=SAMPLING_RATE)
    # tf_decode_mp3 declares a fixed static shape and the dataset is batched, so
    # every clip must have exactly this many samples. Zero-pad a clip that came
    # out slightly short and trim one that came out long.
    target_len = int(DURATION * SAMPLING_RATE)
    if len(audio) < target_len:
        audio = np.pad(audio, (0, target_len - len(audio)), mode="constant")
    audio = audio[:target_len]
    return np.expand_dims(audio, axis=1)

def paths_and_labels_to_dataset(audio_paths, labels):
    """Build a tf.data.Dataset yielding (decoded audio, label) pairs."""
    from_slices = tf.data.Dataset.from_tensor_slices
    # Each path is decoded lazily by tf_decode_mp3 when the dataset is iterated.
    waveforms = from_slices(audio_paths).map(tf_decode_mp3)
    targets = from_slices(labels)
    return tf.data.Dataset.zip((waveforms, targets))

def audio_to_fft(audio):
    """Return the magnitude of the positive-frequency half of each clip's FFT.

    tf.signal.fft works on the innermost dimension, so the trailing axis is
    squeezed away before the transform and added back afterwards.
    """
    waveform = tf.squeeze(audio, axis=-1)
    as_complex = tf.complex(real=waveform, imag=tf.zeros_like(waveform))
    spectrum = tf.signal.fft(tf.cast(as_complex, tf.complex64))
    spectrum = tf.expand_dims(spectrum, axis=-1)
    # Keep only the first half of the bins (the positive frequencies).
    positive_bins = int(DURATION * SAMPLING_RATE) // 2
    return tf.math.abs(spectrum[:, :positive_bins, :])

In [None]:
# One class per sub-directory of the dataset root. os.listdir returns entries in
# arbitrary order, so sort them to keep the label <-> class-name mapping stable
# between runs, and skip stray files that are not class directories.
class_names = sorted(
    entry
    for entry in os.listdir(DATASET_AUDIO_PATH)
    if os.path.isdir(os.path.join(DATASET_AUDIO_PATH, entry))
)
print("Our class names: {}".format(class_names,))

In [None]:
import pickle

# Persist the class list so label indices can be mapped back to names later.
with open('class_names.pkl', 'wb') as pkl_file:
    pickle.dump(class_names, pkl_file)

In [None]:
# NOTE(review): this removes the class_names.pkl written by the cell above;
# skip this cell if the pickle is still needed.
!rm './class_names.pkl'

In [None]:
#TMP
# Get the list of audio file paths along with their corresponding labels.
# NOTE(review): a later cell calls get_audio_metadata() again before repeating
# the shuffle/split, so running both cells scans the dataset twice.

audio_paths, labels = get_audio_metadata()


In [None]:
# Shuffle paths and labels in place with two identically seeded generators:
# both lists have the same length, so they receive the same permutation and
# stay aligned.
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(audio_paths)
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(labels)

# Split into training and validation by an explicit boundary index.
# Negative slicing (`[:-n]` / `[-n:]`) is wrong when n == 0: `[:-0]` is empty
# and `[-0:]` is the whole list, which would swap the two sets.
num_val_samples = int(VALID_SPLIT * len(audio_paths))
num_train_samples = len(audio_paths) - num_val_samples

print("Using {} files for training.".format(num_train_samples))
train_audio_paths = audio_paths[:num_train_samples]
train_labels = labels[:num_train_samples]

print("Using {} files for validation.".format(num_val_samples))
valid_audio_paths = audio_paths[num_train_samples:]
valid_labels = labels[num_train_samples:]

In [None]:
# Get the list of audio file paths along with their corresponding labels
# (slow: every .mp3 is probed by is_long_audio).

audio_paths, labels = get_audio_metadata()

# Shuffle paths and labels in place with two identically seeded generators:
# both lists have the same length, so they receive the same permutation and
# stay aligned.
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(audio_paths)
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(labels)

# Split into training and validation by an explicit boundary index.
# Negative slicing (`[:-n]` / `[-n:]`) is wrong when n == 0: `[:-0]` is empty
# and `[-0:]` is the whole list, which would swap the two sets.
num_val_samples = int(VALID_SPLIT * len(audio_paths))
num_train_samples = len(audio_paths) - num_val_samples

print("Using {} files for training.".format(num_train_samples))
train_audio_paths = audio_paths[:num_train_samples]
train_labels = labels[:num_train_samples]

print("Using {} files for validation.".format(num_val_samples))
valid_audio_paths = audio_paths[num_train_samples:]
valid_labels = labels[num_train_samples:]

In [None]:
# Free the full lists; only the train/valid slices are used from here on.
del audio_paths, labels

In [None]:
# Create two datasets, one for training and one for validation. Both use
# BATCH_SIZE (the validation pipeline previously hard-coded 32, which would
# silently diverge if BATCH_SIZE were changed).
train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels)
train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8,
                            seed=SHUFFLE_SEED).batch(BATCH_SIZE)
valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
valid_ds = valid_ds.shuffle(buffer_size=BATCH_SIZE * 8,
                            seed=SHUFFLE_SEED).batch(BATCH_SIZE)

# Transform the batched audio waves to the frequency domain with `audio_to_fft`
# (it expects the leading batch dimension, so it is mapped after `.batch`).
train_ds = train_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)

valid_ds = valid_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
valid_ds = valid_ds.prefetch(tf.data.experimental.AUTOTUNE)


In [None]:
# Write the training dataset to disk (GZIP-compressed), zip the resulting
# directory at maximum compression (-9), then delete the unzipped directory.
tf.data.experimental.save(train_ds, "./train_ds-tfrecord", compression='GZIP')
!zip -9 -r 'train_ds-tfrecord.zip' 'train_ds-tfrecord'
!rm -rf ./train_ds-tfrecord
# Same for the validation dataset.
# NOTE(review): unlike the training set, this zip uses the default compression
# level and the unzipped directory is left in place -- confirm that is intended.
tf.data.experimental.save(valid_ds, "./valid_ds-tfrecord", compression='GZIP')
!zip -r 'valid_ds-tfrecord.zip' 'valid_ds-tfrecord'

#from IPython.display import FileLink
#FileLink(r'./valid_ds-tfrecord.zip')

### Loading the TFRecord datasets

In [None]:
# Structure of one saved element: a batch of FFT magnitudes shaped
# (batch, 40000, 1) as float32, and a batch of int32 labels shaped (batch,).
element_specs = (
    tf.TensorSpec(shape=(None, 40000, 1), dtype=tf.float32),
    tf.TensorSpec(shape=(None,), dtype=tf.int32),
)
# No shuffle is applied after loading; elements are read as they were saved.
train_ds = tf.data.experimental.load(
    "../path/to/train_ds-tfrecord-3-9-2020/train_ds-tfrecord-3-9-2020",
    element_specs,
    'GZIP',
)
valid_ds = tf.data.experimental.load(
    "../path/to/valid_ds-tfrecord-3-9-2020/valid_ds-tfrecord",
    element_specs,
    'GZIP',
)

In [None]:
# Restore the class-name list that was pickled when the datasets were built.
# SECURITY: pickle.load can execute arbitrary code -- only load files you
# created yourself or otherwise trust.
with open('../path/to/class_names-3-9-2020.pkl', 'rb') as f:
    class_names = pickle.load(f)

### Model

In [None]:
# Maximum number of training epochs passed to model.fit.
EPOCHS = 50

In [None]:
def residual_block(x, filters, conv_num=1, activation="relu"):
    """Apply a 1D residual block followed by 2x max-pooling.

    The main path stacks `conv_num` kernel-3 convolutions (an activation after
    each one except the last); a kernel-1 convolution of the input forms the
    shortcut. Both paths are added, activated and then down-sampled.
    """
    layers = tf.keras.layers
    shortcut = layers.Conv1D(filters, 1, padding="same")(x)
    for _ in range(conv_num - 1):
        x = layers.Conv1D(filters, 3, padding="same")(x)
        x = layers.Activation(activation)(x)
    x = layers.Conv1D(filters, 3, padding="same")(x)
    merged = layers.Add()([x, shortcut])
    activated = layers.Activation(activation)(merged)
    return layers.MaxPool1D(pool_size=2, strides=2)(activated)


def build_model(input_shape, num_classes):
    """Assemble the 1D residual classifier.

    Five residual blocks are followed by average pooling, two dense layers and
    a softmax output with `num_classes` units.
    """
    layers = tf.keras.layers
    inputs = layers.Input(shape=input_shape, name="input")

    # (filters, conv_num) for each residual block, in order.
    block_configs = ((16, 2), (32, 2), (64, 3), (128, 3), (128, 3))
    x = inputs
    for filters, conv_num in block_configs:
        x = residual_block(x, filters, conv_num)

    x = layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = layers.Flatten()(x)
    for units in (256, 128):
        x = layers.Dense(units, activation="relu")(x)

    outputs = layers.Dense(num_classes, activation="softmax", name="output")(x)

    return tf.keras.models.Model(inputs=inputs, outputs=outputs)


# Build the 1D model for inputs of (DURATION * SAMPLING_RATE // 2, 1), i.e. the
# positive half of the FFT, with one output unit per class.
model = build_model((int(DURATION * SAMPLING_RATE) // 2, 1), len(class_names))

model.summary()

# Create an RMSprop optimizer with a small learning rate.
# NOTE(review): `opt` is NOT passed to model.compile below -- compile receives
# the string 'rmsprop', so `base_learning_rate` has no effect on this model.
# Pass `optimizer=opt` if the custom rate is intended here.

base_learning_rate = 0.00001  # alternative value left by the author: 0.0001
opt = tf.keras.optimizers.RMSprop(learning_rate=base_learning_rate)

model.compile(
    optimizer='rmsprop', loss="sparse_categorical_crossentropy", metrics=['accuracy']  # alternative: ["sparse_categorical_accuracy"]
)

# Add callbacks:
# 'EarlyStopping' stops training when the monitored value stops improving
#   (no `monitor` is passed, so the Keras default applies).
# 'ModelCheckpoint' keeps only the model with the best val_loss.
model_save_filename = "model.h5"

earlystopping_cb = tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
mdlcheckpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_loss", save_best_only=True
)


In [None]:
def get_1d_conv_model(input_shape, num_classes):
    """Build a plain (non-residual) 1D convolutional classifier.

    Four stages of paired valid-padded convolutions, each followed by pooling
    and dropout, feed two dense layers and a softmax output.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input example, without the batch dimension.
    num_classes : int
        Number of units in the softmax output layer.

    Returns
    -------
    tf.keras.models.Model
        The uncompiled model.
    """
    layers = tf.keras.layers

    # Activations are given as Keras string identifiers; the bare names
    # `relu` / `softmax` used previously are not defined in this notebook and
    # raised NameError as soon as the function was called.
    inp = layers.Input(shape=input_shape)
    x = layers.Conv1D(16, 9, activation="relu", padding="valid")(inp)
    x = layers.Conv1D(16, 9, activation="relu", padding="valid")(x)
    x = layers.MaxPool1D(16)(x)
    x = layers.Dropout(rate=0.1)(x)

    x = layers.Conv1D(32, 3, activation="relu", padding="valid")(x)
    x = layers.Conv1D(32, 3, activation="relu", padding="valid")(x)
    x = layers.MaxPool1D(4)(x)
    x = layers.Dropout(rate=0.1)(x)

    x = layers.Conv1D(32, 3, activation="relu", padding="valid")(x)
    x = layers.Conv1D(32, 3, activation="relu", padding="valid")(x)
    x = layers.MaxPool1D(4)(x)
    x = layers.Dropout(rate=0.1)(x)

    x = layers.Conv1D(256, 3, activation="relu", padding="valid")(x)
    x = layers.Conv1D(256, 3, activation="relu", padding="valid")(x)
    x = layers.GlobalMaxPool1D()(x)
    x = layers.Dropout(rate=0.2)(x)

    x = layers.Dense(64, activation="relu")(x)
    # NOTE(review): 1028 units may be a typo for 1024 -- kept as written.
    x = layers.Dense(1028, activation="relu")(x)
    out = layers.Dense(num_classes, activation="softmax")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)

    return model

# NOTE(review): this calls build_model (the residual network), not the
# get_1d_conv_model function defined just above -- confirm which one is intended.
model = build_model((int(DURATION * SAMPLING_RATE) // 2, 1), len(class_names))
model.compile(optimizer='rmsprop', loss="sparse_categorical_crossentropy", metrics=['sparse_categorical_accuracy'])

# Add callbacks:
# 'EarlyStopping' stops training when the monitored value stops improving
#   (no `monitor` is passed, so the Keras default applies).
# 'ModelCheckpoint' keeps only the model with the best val_loss.
model_save_filename = "model.h5"

earlystopping_cb = tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
mdlcheckpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_loss", save_best_only=True
)


In [None]:
# Train on the training set, validating after every epoch; the callbacks handle
# early stopping and checkpointing.
training_callbacks = [earlystopping_cb, mdlcheckpoint_cb]
history = model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=EPOCHS,
    callbacks=training_callbacks,
)


In [None]:
# Report the loss and metric values on the validation set.
valid_metrics = model.evaluate(valid_ds)
print(valid_metrics)

### 2D model


In [None]:
class LogMelSpectrogram(tf.keras.layers.Layer):
    """Compute log-magnitude mel-scaled spectrograms.

    Parameters
    ----------
    sample_rate : int
        Sample rate passed to the mel filterbank.
    fft_size : int
        STFT frame length and FFT length, in samples.
    hop_size : int
        STFT frame step, in samples.
    n_mels : int
        Number of mel bins.
    f_min : float
        Lower edge of the mel filterbank, in Hz.
    f_max : float or None
        Upper edge of the mel filterbank, in Hz; a falsy value selects
        `sample_rate / 2`.
    """

    def __init__(self, sample_rate, fft_size, hop_size, n_mels,
                 f_min=0.0, f_max=None, **kwargs):
        super(LogMelSpectrogram, self).__init__(**kwargs)
        self.sample_rate = sample_rate
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max if f_max else sample_rate / 2
        # Constant projection from linear-frequency bins to mel bins.
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.n_mels,
            num_spectrogram_bins=fft_size // 2 + 1,
            sample_rate=self.sample_rate,
            lower_edge_hertz=self.f_min,
            upper_edge_hertz=self.f_max)

    def build(self, input_shape):
        # The filterbank is a constant tensor created in __init__, so there is
        # nothing to register here. (The previous
        # `self.non_trainable_weights.append(...)` appended to the temporary
        # list returned by the property and therefore had no effect.)
        super(LogMelSpectrogram, self).build(input_shape)

    def call(self, waveforms):
        """Forward pass.
        Parameters
        ----------
        waveforms : tf.Tensor, shape = (None, n_samples)
            A Batch of mono waveforms.
        Returns
        -------
        log_mel_spectrograms : (tf.Tensor), shape = (None, time, freq, ch)
            The corresponding batch of log-mel-spectrograms
        """
        def _tf_log10(x):
            # log10(x) = ln(x) / ln(10)
            numerator = tf.math.log(x)
            denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
            return numerator / denominator

        def power_to_db(magnitude, amin=1e-16, top_db=80.0):
            """
            https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html
            """
            # The reference is the maximum over the whole tensor passed in,
            # i.e. over the entire batch, not per example.
            ref_value = tf.reduce_max(magnitude)
            log_spec = 10.0 * _tf_log10(tf.maximum(amin, magnitude))
            log_spec -= 10.0 * _tf_log10(tf.maximum(amin, ref_value))
            # Clip everything more than `top_db` below the peak.
            log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)

            return log_spec

        # fft_length is passed explicitly: by default tf.signal.stft rounds it
        # up to the next power of two, which would produce a different number
        # of bins than the `fft_size // 2 + 1` rows of the mel filterbank
        # whenever fft_size is not a power of two.
        spectrograms = tf.signal.stft(waveforms,
                                      frame_length=self.fft_size,
                                      frame_step=self.hop_size,
                                      fft_length=self.fft_size,
                                      pad_end=False)

        magnitude_spectrograms = tf.abs(spectrograms)

        # Power spectrogram projected onto the mel bins.
        mel_spectrograms = tf.matmul(tf.square(magnitude_spectrograms),
                                     self.mel_filterbank)

        log_mel_spectrograms = power_to_db(mel_spectrograms)

        # add channel dimension
        log_mel_spectrograms = tf.expand_dims(log_mel_spectrograms, 3)

        return log_mel_spectrograms

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = {
            'fft_size': self.fft_size,
            'hop_size': self.hop_size,
            'n_mels': self.n_mels,
            'sample_rate': self.sample_rate,
            'f_min': self.f_min,
            'f_max': self.f_max,
        }
        config.update(super(LogMelSpectrogram, self).get_config())

        return config

# Defaults for ConvModel's LogMelSpectrogram layer.
_FFT_SIZE = 4096  # STFT frame length, in samples
_HOP_SIZE = 86  # STFT frame step, in samples
_N_MEL_BINS = 256  # number of mel bins

def ConvModel(n_classes, sample_rate=SAMPLING_RATE, duration=DURATION,
              fft_size=_FFT_SIZE, hop_size=_HOP_SIZE, n_mels=_N_MEL_BINS):
    """Build a 2D convolutional classifier on top of a LogMelSpectrogram layer.

    The model takes vectors of `int(sample_rate * duration) // 2` float32 values
    and ends in a softmax over `n_classes`.
    """
    layers = tf.keras.layers
    n_samples = int(sample_rate * duration) // 2

    inputs = tf.keras.Input(shape=(n_samples,), name='input', dtype='float32')
    # Convert the input to log-mel spectrograms with the custom layer.
    features = LogMelSpectrogram(sample_rate, fft_size, hop_size, n_mels)(inputs)
    # Normalise along axis 2 (the mel-frequency axis).
    features = layers.BatchNormalization(axis=2)(features)

    # First block: the kernel spans the whole frequency axis.
    features = layers.Conv2D(32, (3, n_mels), activation='relu')(features)
    features = layers.BatchNormalization()(features)
    features = layers.MaxPool2D((1, features.shape[2]))(features)

    # Second block: convolve and pool along the time axis only.
    features = layers.Conv2D(32, (3, 1), activation='relu')(features)
    features = layers.BatchNormalization()(features)
    features = layers.MaxPool2D(pool_size=(2, 1))(features)

    # Classification head.
    features = layers.Flatten()(features)
    features = layers.Dense(64, activation='relu')(features)
    features = layers.Dropout(0.25)(features)
    predictions = layers.Dense(n_classes, activation='softmax')(features)

    return tf.keras.Model(inputs=inputs, outputs=predictions)


In [None]:
# Size the output layer from the class list instead of a hard-coded 11, so the
# model always matches the labels in the dataset.
model = ConvModel(len(class_names))
model.compile(optimizer=opt, 
              loss='sparse_categorical_crossentropy', 
              metrics=['sparse_categorical_accuracy'])
model.summary()


EPOCHS = 50

# Use fresh callbacks for this model. A ModelCheckpoint instance keeps the best
# monitored value it has seen, so reusing the one from the previous training run
# could prevent this model from ever being saved. EarlyStopping is recreated
# alongside it so this cell does not depend on callback state from earlier runs.
# NOTE(review): the checkpoint still targets `model_save_filename`, so it
# overwrites the file saved for the previous model.
earlystopping_cb = tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
mdlcheckpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_loss", save_best_only=True
)

history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
)
