In [7]:
import librosa
import sys
import os
import numpy as np
import librosa
import tensorflow as tf
import tensorflow_hub as hub
import glob
import pandas as pd

from sklearn.preprocessing import LabelBinarizer
from tqdm import tqdm
from tensorflow.keras import layers, Model

In [None]:
def create_dataset(path):
    samples, labels = [], []
    model = yamnet_frames_model(Params())
    model.load_weights(YAMNET_PATH)
    for cls in os.listdir(path):
        for sound in tqdm(os.listdir(os.path.join(path, cls))):
            wav = librosa.load(os.path.join(os.path.join(path, cls, sound)), sr=16000)[0].astype(np.float64)

            #Here you can add preprocessing, augmentations, silence removal, etc.

            for feature in model(wav)[1]:
                samples.append(feature)
                labels.append(cls)
    samples = np.asarray(samples)
    labels = np.asarray(labels)
    return samples, labels

In [5]:
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

In [8]:
class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
class_names =list(pd.read_csv(class_map_path)['display_name'])

In [9]:
gaspfiles = glob.glob("Sounds/gaspSounds/*")
coughfiles = glob.glob("Sounds/coughSounds/*")


In [23]:
samples, labels = [], []
for file in gaspfiles:
    try:
        wav_data = librosa.load(file,sr=16000)[0].astype(np.float32)
    except:
        continue
    for feature in yamnet_model(wav_data)[1]:
        samples.append(feature)
        labels.append("gasp")

In [24]:
for file in coughfiles:
    try:
        wav_data = librosa.load(file,sr=16000)[0].astype(np.float32)
    except:
        continue
    for feature in yamnet_model(wav_data)[1]:
        samples.append(feature)
        labels.append("cough")

In [26]:
samples = np.asarray(samples)
labels = np.asarray(labels)

In [27]:
def generate_model(num_classes,
                  num_hidden=64,
                  activation='softmax',
                  regularization=0.03,
                  ):

    input = layers.Input(shape=(1024,))
    net = layers.Dense(num_hidden, activation=None, kernel_regularizer=tf.keras.regularizers.l2(regularization))(input)
    net = layers.Dense(num_classes, activation=activation)(net)
    model = Model(inputs=input, outputs=net)
    return model

In [28]:
def train_model(X,
                y,
                fname,  # Path where to save the model
                activation='softmax',
                epochs=30,
                optimizer='adam',
                num_hidden=64,
                batch_size=64
                ):
    # Encode the labels
    encoder = LabelBinarizer()
    labels = encoder.fit_transform(y)

    # Save the names of the classes for future using.
    np.save(fname, encoder.classes_)
    num_classes = len(np.unique(y))

    # Generate the model
    general_model = generate_model(num_classes, num_hidden=num_hidden, activation=activation)
    general_model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                          metrics=['accuracy'])

    # Create some callbacks
    callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=fname, monitor='val_loss', save_best_only=True),
                 tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=15, verbose=1,
                                                      min_lr=0.000001)]

    general_model.fit(X, labels, epochs=epochs, validation_split=0.20, batch_size=batch_size,
                      callbacks=callbacks, verbose=1)

    # Load the best weights after the training.
    general_model.load_weights(fname)

    return general_model

In [46]:
model2 = train_model(samples, labels, "tmp.pb")

Epoch 1/30



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 2/30



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 3/30



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 4/30



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 5/30



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 6/30
Epoch 7/30



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 8/30
Epoch 9/30



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 10/30



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 11/30



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 12/30



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
 1/28 [>.............................] - ETA: 0s - loss: 0.2943 - accuracy: 0.8750



INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [32]:
model2

<keras.engine.functional.Functional at 0x28719c411f0>

In [36]:
wav_data = librosa.load(file,sr=16000)[0].astype(np.float32)

In [42]:
class ReduceMeanLayer(tf.keras.layers.Layer):
  def __init__(self, axis=0, **kwargs):
    super(ReduceMeanLayer, self).__init__(**kwargs)
    self.axis = axis

In [35]:
file = glob.glob("Sounds/burpSounds/*")[0]
file

'Sounds/burpSounds\\19tPF3TY3g0_split.wav'

In [49]:
saved_model_path = "tmp.pb"
input_segment = tf.keras.layers.Input(shape=(), dtype=tf.float32, name='audio')
embedding_extraction_layer = hub.KerasLayer('https://tfhub.dev/google/yamnet/1',
                                            trainable=False, name='yamnet')
_, embeddings_output, _ = embedding_extraction_layer(input_segment)
serving_outputs = model2(embeddings_output)
serving_outputs = ReduceMeanLayer(axis=0, name='classifier')(serving_outputs)
serving_model = tf.keras.Model(input_segment, serving_outputs)
serving_model.save(saved_model_path, include_optimizer=False)





INFO:tensorflow:Assets written to: tmp.pb\assets


INFO:tensorflow:Assets written to: tmp.pb\assets


In [50]:
reloaded_model = tf.saved_model.load(saved_model_path)

In [59]:
testing_wav_data = librosa.load(gaspfiles[0],sr=16000)[0].astype(np.float32)
reloaded_results = reloaded_model(testing_wav_data)
class_scores = tf.reduce_mean(scores, axis=0)
top_class = tf.math.argmax(class_scores)
inferred_class = class_names[top_class]
top_score = class_scores[top_class]
print(f'[YAMNet] The main sound is: {inferred_class} ({top_score})')

[YAMNet] The main sound is: Silence (0.30727311968803406)


In [62]:
general_model.class_names

NameError: name 'general_model' is not defined

In [63]:
np.load("tmp.pb.npy")

array(['cough', 'gasp'], dtype='<U5')