## **Installing Dependencies**

In [0]:
# TensorFlow is pinned to 1.15 because the cells below use the TF1 graph/session
# API (tf.Session, tf.reset_default_graph, tf.get_default_graph).
!pip install tensorflow==1.15
!pip install scipy
# NOTE(review): `tensorflow` is listed again here without a version pin, and `six`
# appears on both of the next two lines — presumably harmless once 1.15 is
# installed; confirm and de-duplicate.
!pip install resampy tensorflow six
!pip install tf_slim six soundfile

Cloning the TensorFlow models repository (it contains the VGGish source files)

In [0]:
!git clone https://github.com/tensorflow/models.git

In [0]:
# Check to see where are in the kernel's file system.
!pwd

In [0]:
# Grab the VGGish model required weights and paramters
!curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt
!curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz

In [0]:
# Verify the location of the AudioSet source files
!ls models/research/audioset/vggish/

In [0]:
# Copy the source files to the current directory.
!cp models/research/audioset/vggish/* .

In [0]:
# Run the test, which also loads all the necessary functions.
from vggish_smoke_test import *

In [0]:
# Run the smoke test as a separate Python process.

# from vggish_smoke_test import *
# Either line runs the same test, but they are not interchangeable for the rest of
# this notebook: only the `import *` form brings the test module's names into the
# kernel. NOTE(review): later cells use `vggish_postprocess` and `rel_error`
# without importing/defining them, so they presumably rely on that import — confirm.
!python vggish_smoke_test.py

In [0]:
# this is just for getting embeddings from each wav file
# ran it for test purpose.

!python vggish_inference_demo.py --wav_file "/content/drive/My Drive/test_dataset/not_sick/audioset_-21_SXelVNo_30_35.wav"\
                                    --tfrecord_file "/content/TfrecordFile" \
                                    --checkpoint "/content/vggish_model.ckpt" \
                                    --pca_params "/content/vggish_pca_params.npz"

**USING VGGish for extracting embeddings**

In [0]:
import vggish_slim
import vggish_params
import vggish_input
import tensorflow as tf
import numpy as np

def CreateVGGishNetwork(hop_size=0.96):   # Hop size is in seconds.
  """Build the VGGish graph, restore its checkpoint and collect tensor handles.

  Relies on a module-level TF session named `sess` existing before the call.

  Args:
    hop_size: Value written to `vggish_params.EXAMPLE_HOP_SECONDS`.

  Returns:
    Dict with three entries: 'features' (the model input tensor), 'embedding'
    (the model output tensor) and 'layers' (a dict mapping a short layer name
    to the ':0' output tensor of the corresponding graph op).
  """
  vggish_slim.define_vggish_slim()
  vggish_params.EXAMPLE_HOP_SECONDS = hop_size
  vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')

  model_input = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
  model_output = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

  # (short name, graph op name) pairs for every layer we want to inspect.
  op_names = (
      ('conv1', 'vggish/conv1/Relu'),
      ('pool1', 'vggish/pool1/MaxPool'),
      ('conv2', 'vggish/conv2/Relu'),
      ('pool2', 'vggish/pool2/MaxPool'),
      ('conv3', 'vggish/conv3/conv3_2/Relu'),
      ('pool3', 'vggish/pool3/MaxPool'),
      ('conv4', 'vggish/conv4/conv4_2/Relu'),
      ('pool4', 'vggish/pool4/MaxPool'),
      ('fc1', 'vggish/fc1/fc1_2/Relu'),
      ('fc2', 'vggish/fc2/Relu'),
      ('embedding', 'vggish/embedding'),
      ('features', 'vggish/input_features'),
  )
  graph = tf.get_default_graph()
  layer_tensors = {
      short_name: graph.get_tensor_by_name(op_name + ':0')
      for short_name, op_name in op_names
  }

  return {
      'features': model_input,
      'embedding': model_output,
      'layers': layer_tensors,
  }

In [0]:
def ProcessWithVGGish(vgg, x, sr):
  '''Run VGGish on a waveform and return the first post-processed embedding.

  Relies on a module-level TF session named `sess`.

  Args:
    vgg: Dict as returned by CreateVGGishNetwork; its 'features' and
      'embedding' tensors are used.
    x: Waveform samples; must be floats scaled to the range -1 to +1.
    sr: Sample rate of `x`.

  Returns:
    `postprocessed_batch[0]`, i.e. only the first row of the post-processed
    embedding batch (not the whole batch).
  '''
  # Imported locally so this function does not depend on an earlier
  # `from vggish_smoke_test import *` cell having leaked the module in.
  import vggish_postprocess

  # Produce a batch of log mel spectrogram examples.
  input_batch = vggish_input.waveform_to_examples(x, sr)

  [embedding_batch] = sess.run([vgg['embedding']],
                               feed_dict={vgg['features']: input_batch})

  # Postprocess the results to produce whitened quantized embeddings.
  pca_params_path = 'vggish_pca_params.npz'

  pproc = vggish_postprocess.Postprocessor(pca_params_path)
  postprocessed_batch = pproc.postprocess(embedding_batch)
  return postprocessed_batch[0]


In [0]:
# Test these new functions with the original test.

tf.reset_default_graph()
sess = tf.Session()

vgg = CreateVGGishNetwork(0.01)

# Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate
# to test resampling to 16 kHz during feature extraction).
num_secs = 3
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)  # Unit amplitude input signal

# NOTE: ProcessWithVGGish returns only the first row of the post-processed batch.
postprocessed_batch = ProcessWithVGGish(vgg, x, sr)

# Relative tolerance for the mean/std check. Defined here so the cell does not
# depend on `rel_error` leaking in from `from vggish_smoke_test import *`.
# NOTE(review): 0.1 is meant to mirror the upstream smoke test's tolerance — confirm.
rel_error = 0.1

expected_postprocessed_mean = 123.0
expected_postprocessed_std = 75.0
np.testing.assert_allclose(
    [np.mean(postprocessed_batch), np.std(postprocessed_batch)],
    [expected_postprocessed_mean, expected_postprocessed_std],
    rtol=rel_error)


In [0]:
def EmbeddingsFromVGGish(vgg, x, sr):
  '''Collect the activations of every tracked VGGish layer for one waveform.

  Relies on a module-level TF session named `sess`.

  Args:
    vgg: Dict as returned by CreateVGGishNetwork ('features' and 'layers' are used).
    x: Waveform samples handed to `vggish_input.waveform_to_examples`.
    sr: Sample rate of `x`.

  Returns:
    Dict mapping each key of `vgg['layers']` to the value the session computed
    for that layer's tensor.
  '''
  # Turn the waveform into the batch of examples the network is fed with.
  examples = vggish_input.waveform_to_examples(x, sr)

  names = list(vgg['layers'])
  fetches = [vgg['layers'][name] for name in names]

  # One session run evaluates all layers for the same input batch.
  activations = sess.run(fetches, feed_dict={vgg['features']: examples})

  return dict(zip(names, activations))

In [0]:
# Run the sine-wave input through the network and list each layer's output shape.
resdict = EmbeddingsFromVGGish(vgg, x, sr)

for layer_name, activation in resdict.items():
  print(layer_name, activation.shape)

In [0]:
# My small test: run one wav file from the dataset through VGGish.

import librosa, librosa.display
import matplotlib.pyplot as plt
import numpy as np


file = "/content/drive/My Drive/test_dataset/not_sick/audioset_-21_SXelVNo_30_35.wav"
sr = 44100         # sample rate librosa is asked to load the audio at
duration = 4
samples_per_track = sr * duration

# Pass `sr` by keyword: it is keyword-only in newer librosa releases, and the
# keyword form also works on older ones.
signal, sr = librosa.load(file, sr=sr)
print(signal.shape)

resdict = EmbeddingsFromVGGish(vgg, signal, sr)

for k in resdict:
  print( k, resdict[k].shape)

In [0]:
print("This is features shape: ",resdict["features"].shape)
print("This is embedding shape: ", resdict["embedding"].shape)

# To make these embeddings more compact (e.g. so they can be used on an edge device),
# the vggish_postprocess.py file can be used. According to its description it returns
# an np.ndarray of the same shape as the input but of type uint8, containing the
# PCA-transformed and quantized version of the input.

# The postprocessing step applies PCA as follows:
    # - Embeddings come in as [batch_size, embedding_size].
    # - Transpose to [embedding_size, batch_size].
    # - Subtract pca_means column vector from each column.
    # - Premultiply by PCA matrix of shape [output_dims, input_dims],
    #   where both are equal to embedding_size in our case.
    # - Transpose result back to [batch_size, embedding_size].

In [0]:
# Show the embedding matrix as an image: rows are frames, columns are dimensions.

import matplotlib.pyplot as plt
plt.imshow(resdict['embedding'], aspect='auto', cmap='binary')
plt.title('Embedded Representation from a wav file')
plt.ylabel('Time (frame number)')
plt.xlabel('Embedding Dimension')
plt.grid(False);

### **For extracting features from Audio dataset**

In [0]:
import os
import librosa, librosa.display
import matplotlib.pyplot as plt
import numpy as np
import json

dataset_path = "/content/drive/My Drive/test_dataset"
json_path = "embeddings2.json"
sr = 44100

duration = 4
samples_per_track = sr * duration


data_store = {
        "embeddings": [],
        "labels": []
    }

# Walk the dataset folder. Every directory other than the root contributes its
# files, labelled with the directory's position in the walk minus one.
for i, (dirpath, dirname, filenames) in enumerate(os.walk(dataset_path)):
  # Compare paths by value; `is not` tests object identity, which only happens
  # to work for equal strings as an implementation detail.
  if dirpath != dataset_path:
    for f in filenames:
      file_path = os.path.join(dirpath, f)
      # `sr` is passed by keyword (keyword-only in newer librosa). The returned
      # rate is dropped so the loop never rebinds the configured `sr`.
      audio = librosa.load(file_path, sr=sr)[0]

      # Only the first `samples_per_track` samples of each file are embedded.
      resdict = EmbeddingsFromVGGish(vgg, audio[:samples_per_track], sr)
      print(resdict["embedding"].shape)
      data_store["embeddings"].append(resdict["embedding"].tolist())
      data_store["labels"].append(i-1)

# final step: saving everything as a json file
with open(json_path, "w") as fp:
    json.dump(data_store, fp, indent = 4)  # fp is the open file handle; indent controls pretty-printing

In [0]:
# Optional sanity check of the collected data (not needed for training).

t = np.array(data_store["embeddings"])
print(t.shape)
y = np.array(data_store["labels"])
j = y[..., np.newaxis]
# Only peek at sample 1000 when the dataset is that large; indexing it
# unconditionally raises IndexError on smaller datasets.
if len(j) > 1000:
  print(j[1000])
print(j.shape)


#### **Building a binary class model**

In [0]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
import matplotlib.pyplot as plt

DATASET_PATH = "embeddings.json"

def load_data(dataset_path):
    """Read the dataset JSON file and return its contents as numpy arrays.

    Args:
        dataset_path: Path of a JSON file with "embeddings" and "labels" entries.

    Returns:
        Tuple (X, y): the "embeddings" and the "labels" entry, each as np.ndarray.
    """
    with open(dataset_path, "r") as json_file:
        payload = json.load(json_file)

    features = np.array(payload["embeddings"])
    targets = np.array(payload["labels"])
    return features, targets

def plot_history(history):
    """Plot accuracy and loss curves for the training and validation data.

    Args:
        history: Object returned by `model.fit`; its `.history` dict must hold
            "loss", "val_loss" and an accuracy metric with its "val_" twin.

    Raises:
        KeyError: If the expected accuracy or loss entries are missing.
    """
    # The accuracy metric is recorded as "acc" by some Keras / tf.keras versions
    # and as "accuracy" by others, so use whichever one this run produced.
    acc_key = "acc" if "acc" in history.history else "accuracy"

    fig, axis = plt.subplots(2)

    # create accuracy subplot
    axis[0].plot(history.history[acc_key], label = "train accuracy" )
    axis[0].plot(history.history["val_" + acc_key], label = "test_accuracy" )
    axis[0].set_ylabel("Accuracy")
    axis[0].set_xlabel("Epochs")
    axis[0].legend(loc = "lower right")
    axis[0].set_title("Acccuracy eval")

    # create error subplot
    axis[1].plot(history.history["loss"], label = "train error" )
    axis[1].plot(history.history["val_loss"], label = "test error" )
    axis[1].set_ylabel("Error")
    axis[1].set_xlabel("Epochs")
    axis[1].legend(loc = "upper right")
    axis[1].set_title("Error eval")

    plt.show()

def prepare_datasets(test_size, validation_size):
    """Load the dataset and split it into train, validation and test parts.

    Args:
        test_size: `test_size` passed to the first train/test split.
        validation_size: `test_size` passed to the second split, which carves
            the validation set out of the remaining training data.

    Returns:
        Tuple (X_train, X_validation, X_test, y_train, y_validation, y_test);
        every X array has one extra trailing axis of size 1.
    """
    features, targets = load_data(DATASET_PATH)

    # hold out the test set first, then split validation off the remainder
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=test_size, random_state=42)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=32)

    print(X_train.shape)

    # `[..., np.newaxis]` keeps the existing axes and appends a new last one
    X_train, X_validation, X_test = (
        split[..., np.newaxis] for split in (X_train, X_validation, X_test))

    for split in (X_train, X_validation, X_test):
        print(split.shape)

    return X_train, X_validation, X_test, y_train, y_validation, y_test
    

def build_model(input_shape):
    """Assemble the (uncompiled) CNN: three conv blocks, two dense layers, sigmoid output.

    Args:
        input_shape: Shape of one input sample, passed to the first Conv2D layer.

    Returns:
        A `keras.Sequential` model ending in a single sigmoid unit.
    """
    layers = keras.layers
    l2 = keras.regularizers.l2

    stack = [
        # conv block 1
        layers.Conv2D(32, (3, 3), padding='same', activation="relu", input_shape=input_shape),
        layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'),
        layers.BatchNormalization(),
        # conv block 2
        layers.Conv2D(32, (3, 3), padding='same', activation="relu"),
        layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'),
        layers.BatchNormalization(),
        # conv block 3
        layers.Conv2D(32, (2, 2), padding='same', activation="relu"),
        layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'),
        layers.BatchNormalization(),
        # classifier head: flatten, then two regularised dense layers with dropout
        layers.Flatten(),
        layers.Dense(128, activation="relu", kernel_regularizer=l2(0.001)),
        layers.Dropout(0.8),
        layers.Dense(64, activation="relu", kernel_regularizer=l2(0.001)),
        layers.Dropout(0.6),
        # output layer
        layers.Dense(1, activation="sigmoid"),
    ]

    model = keras.Sequential()
    for layer in stack:
        model.add(layer)

    return model


def predict(model, X, y):
    """Predict the class of one sample and print it next to the target.

    Args:
        model: Object with a Keras-style `predict(batch)` method.
        X: One input sample without the batch axis.
        y: Expected label of the sample (only used in the printout).

    Returns:
        np.ndarray holding the predicted class index for the one-sample batch.
    """
    # model.predict() works on batches, so add a leading batch axis of size 1
    X = X[np.newaxis, ...]
    print(X.shape)

    # perform prediction
    prediction = np.asarray(model.predict(X))

    if prediction.ndim == 2 and prediction.shape[1] == 1:
        # Single sigmoid unit: argmax over one column is always 0, so the
        # probability has to be thresholded instead.
        predicted_index = (prediction[:, 0] > 0.5).astype(int)
    else:
        # One score per class: pick the index of the largest one.
        predicted_index = np.argmax(prediction, axis=1)

    print("Target: {}, Predicted label: {}".format(y, predicted_index))
    return predicted_index



if __name__ == "__main__":

    # Split the data: test_size 0.25 for the test set, 0.20 for validation.
    (X_train, X_validation, X_test,
     y_train, y_validation, y_test) = prepare_datasets(0.25, 0.20)

    input_shape = (X_train.shape[1], X_train.shape[2], 1)

    # Build the CNN and compile it with SGD and binary cross-entropy.
    model = build_model(input_shape)

    optimizer = keras.optimizers.SGD(
        learning_rate=0.01, momentum=0.9, decay=0.01, nesterov=False)
    model.compile(
        optimizer=optimizer,
        loss=keras.losses.BinaryCrossentropy(),
        metrics=["accuracy"],
    )

    model.summary()

    # Fit on the training split while monitoring the validation split.
    history = model.fit(
        X_train, y_train,
        validation_data=(X_validation, y_validation),
        batch_size=32,
        epochs=70,
    )

    # Accuracy / loss curves for both splits.
    plot_history(history)

    # Final score on the held-out test set.
    test_error, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
    print("Accuracy on test set is {}".format(test_accuracy))

    # Try the model on one test sample (index 50).
    X_to_predict, y_to_predict = X_test[50], y_test[50]
    predict(model, X_to_predict, y_to_predict)
    

In [0]:
from sklearn.metrics import confusion_matrix

yhat_probs = model.predict(X_test, verbose=0)

if yhat_probs.shape[-1] == 1:
    # Single sigmoid output: argmax over one column is always 0, which would put
    # every sample in class 0. Threshold the probability instead.
    tt = (yhat_probs[:, 0] > 0.5).astype(int)
else:
    # One score per class: take the index of the largest one.
    tt = np.argmax(yhat_probs, axis = 1)

cm=confusion_matrix(y_test, tt)
print(cm)

#### **Computing different Features**

In [0]:
# Log mel spectrogram of a single wav file

# Imported before first use so the cell does not rely on an earlier cell's import.
import librosa, librosa.display
from mel_features import log_mel_spectrogram

path = "/content/drive/My Drive/small_dataset/not_sick/Copy of audioset_EQzYcBJ1Dec_100_105.wav"

sr = 44100
duration = 4
samples_per_track = sr * duration
hop_length_secs = 0.010

# `sr` by keyword: it is keyword-only in newer librosa releases.
signal, sr = librosa.load(path, sr=sr)
hh = log_mel_spectrogram(signal,
                        audio_sample_rate=sr,
                        log_offset=0.0,
                        window_length_secs=0.025,
                        hop_length_secs=hop_length_secs,)

print(hh.shape)

# NOTE(review): mel_features documents the result as [num_frames, num_mel_bins] —
# confirm. specshow draws rows on the y axis, so transpose to put time on the
# x axis, and give the hop in samples (not seconds) so the time ticks are right.
librosa.display.specshow(hh.T, sr = sr,
                         hop_length = int(round(hop_length_secs * sr)),
                         x_axis = "time")
plt.xlabel("Time")
plt.ylabel("Mel band")
plt.colorbar()
plt.show()

In [0]:
# Approach 1: build the model input directly from the wav file.
from vggish_input import wavfile_to_examples

path = "/content/drive/My Drive/small_dataset/not_sick/Copy of audioset_EQzYcBJ1Dec_100_105.wav"

jj = wavfile_to_examples(path)
print(jj.shape)


import librosa, librosa.display

# Display one example of the batch (index 300) as an image.
plt.imshow(jj[300])
plt.ylabel("Log mel spectogram")
plt.xlabel("no of sample points")
plt.colorbar()
plt.show()

In [0]:
# Approach 2: load the audio with librosa, then build the model input with
# waveform_to_examples.

from vggish_input import waveform_to_examples

path = "/content/drive/My Drive/small_dataset/not_sick/Copy of audioset_EQzYcBJ1Dec_100_105.wav"

# `sr` is reused from an earlier cell; pass it by keyword because it is
# keyword-only in newer librosa releases.
signal, sr = librosa.load(path, sr=sr)

kk = waveform_to_examples(signal, sr)
print(kk.shape)
print(signal.shape)

import librosa, librosa.display

# Display one example of the batch (index 300) as an image.
plt.imshow(kk[300])
plt.xlabel("no of sample points")
plt.ylabel("Log mel spectogram")
plt.colorbar()
plt.show()

# **Now Working With keras Implementation of VGGish**

In [0]:
!git clone https://github.com/furqan4545/VGGish-Keras.git
!pip install sound

In [0]:
# Copy the source files to the current directory.
!cp /content/VGGish-Keras/* .

#### **Model 1 with MFCC coefficient**

In [0]:
from preprocess_sound import preprocess_sound
import numpy as np
from scipy.io import wavfile

from keras import backend as K
from keras import optimizers
from keras.layers import Flatten, Input, Dense, GlobalMaxPooling2D
from keras.models import Model
from keras.layers import GlobalAveragePooling2D
from vggish import VGGish
from preprocess_sound import preprocess_sound

import json
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

DATASET_PATH = "/content/drive/My Drive/sick_sound.json"

def load_data(dataset_path):
    """Read the MFCC dataset JSON file and return its contents as numpy arrays.

    Args:
        dataset_path: Path of a JSON file with "mfcc" and "labels" entries.

    Returns:
        Tuple (X, y): the "mfcc" and the "labels" entry, each as np.ndarray.
    """
    with open(dataset_path, "r") as json_file:
        payload = json.load(json_file)

    features = np.array(payload["mfcc"])
    targets = np.array(payload["labels"])
    return features, targets

def prepare_datasets(test_size, validation_size):
    """Load the dataset and split it into train, validation and test parts.

    Args:
        test_size: `test_size` passed to the first train/test split.
        validation_size: `test_size` passed to the second split, which carves
            the validation set out of the remaining training data.

    Returns:
        Tuple (X_train, X_validation, X_test, y_train, y_validation, y_test);
        every X array has one extra trailing axis of size 1.
    """
    features, targets = load_data(DATASET_PATH)

    # hold out the test set first, then split validation off the remainder
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=test_size, random_state=42)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=32)

    print(X_train.shape)

    # `[..., np.newaxis]` keeps the existing axes and appends a new last one
    X_train, X_validation, X_test = (
        split[..., np.newaxis] for split in (X_train, X_validation, X_test))

    for split in (X_train, X_validation, X_test):
        print(split.shape)

    return X_train, X_validation, X_test, y_train, y_validation, y_test

def plot_history(history):
    """Plot accuracy and loss curves for the training and validation data.

    Args:
        history: Object returned by `model.fit`; its `.history` dict must hold
            "loss", "val_loss" and an accuracy metric with its "val_" twin.

    Raises:
        KeyError: If the expected accuracy or loss entries are missing.
    """
    # The accuracy metric is recorded as "accuracy" by some Keras versions and
    # as "acc" by others, so use whichever one this run produced.
    acc_key = "accuracy" if "accuracy" in history.history else "acc"

    fig, axis = plt.subplots(2)

    # create accuracy subplot
    axis[0].plot(history.history[acc_key], label = "train accuracy" )
    axis[0].plot(history.history["val_" + acc_key], label = "test_accuracy" )
    axis[0].set_ylabel("Accuracy")
    axis[0].set_xlabel("Epochs")
    axis[0].legend(loc = "lower right")
    axis[0].set_title("Acccuracy eval")

    # create error subplot
    axis[1].plot(history.history["loss"], label = "train error" )
    axis[1].plot(history.history["val_loss"], label = "test error" )
    axis[1].set_ylabel("Error")
    axis[1].set_xlabel("Epochs")
    axis[1].legend(loc = "upper right")
    axis[1].set_title("Error eval")

    plt.show()



def predict(model, X, y):
    """Print the predicted class index of one sample next to its target.

    Args:
        model: Object with a Keras-style `predict(batch)` method.
        X: One input sample without the batch axis.
        y: Expected label of the sample (only used in the printout).
    """
    # model.predict() works on batches, so wrap the sample in a batch of one
    batch = X[np.newaxis, ...]
    print(batch.shape)

    scores = model.predict(batch)

    # the predicted class is the position of the largest score
    best_class = np.argmax(scores, axis=1)

    print("Target: {}, Predicted label: {}".format(y, best_class))



# Create train, validation and test set
X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(0.25, 0.20)

input_shape = (X_train.shape[1], X_train.shape[2], 1)

# Take the input shape from the data instead of a hard-coded (108, 13, 1).
new_input = Input(shape=input_shape)
sound_model = VGGish(include_top=False, load_weights=True, input_tensor = new_input)
x = sound_model.get_layer(name="conv4/conv4_2").output
flat1 = GlobalAveragePooling2D()(x)
class1 = Dense(1024, activation = 'relu')(flat1)
outputss = Dense(2, activation = "softmax")(class1)

# define new model (Keras 2 spells these keyword arguments `inputs` / `outputs`)
model = Model(inputs = sound_model.input, outputs = outputss)
model.summary()

# Freeze the VGGish layers so only the new dense head is trained.
for layer in sound_model.layers:
  layer.trainable = False

# compile the network
# Adam takes no `momentum` / `nesterov` arguments (those belong to SGD) and raises
# on them. Use the `optimizers` module imported in this cell rather than a `keras`
# name left over from an earlier cell.
optimizer = optimizers.Adam(learning_rate = 0.01, decay=0.01)
model.compile(optimizer = optimizer,
              loss = "sparse_categorical_crossentropy",
              metrics = ["accuracy"]
              )

# train the CNN
history = model.fit(X_train,
          y_train,
          validation_data = (X_validation, y_validation),
          batch_size = 64,
          epochs = 50
          )

# plot accuracy/error for training and validation
plot_history(history)

# evaluate the CNN on the test set
test_error, test_accuracy = model.evaluate(X_test, y_test, verbose = 1)
print("Accuracy on test set is {}".format(test_accuracy))

# pick a sample to predict from the test set
X_to_predict = X_test[50]
y_to_predict = y_test[50]

# predict sample
predict(model, X_to_predict, y_to_predict)



#### **Model 2 with Embeddings**

In [0]:
from preprocess_sound import preprocess_sound
import numpy as np

from keras import backend as K
from keras import optimizers
from keras.layers import Flatten, Input, Dense, GlobalMaxPooling2D
from keras.models import Model
from keras.layers import GlobalAveragePooling2D
from vggish import VGGish
from preprocess_sound import preprocess_sound


import json
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

DATASET_PATH = "/content/embeddings2.json"

def load_data(dataset_path):
    """Read the embeddings JSON file and return features plus column-shaped labels.

    Args:
        dataset_path: Path of a JSON file with "embeddings" and "labels" entries.

    Returns:
        Tuple (X, y): the "embeddings" entry as np.ndarray, and the "labels"
        entry as np.ndarray with one extra trailing axis of size 1.
    """
    with open(dataset_path, "r") as json_file:
        payload = json.load(json_file)

    features = np.array(payload["embeddings"])
    # append a trailing axis to the labels
    targets = np.array(payload["labels"])[..., np.newaxis]
    return features, targets

def prepare_datasets(test_size, validation_size):
    """Load the dataset and split it into train, validation and test parts.

    Args:
        test_size: `test_size` passed to the first train/test split.
        validation_size: `test_size` passed to the second split, which carves
            the validation set out of the remaining training data.

    Returns:
        Tuple (X_train, X_validation, X_test, y_train, y_validation, y_test);
        every X array has one extra trailing axis of size 1.
    """
    features, targets = load_data(DATASET_PATH)

    # hold out the test set first, then split validation off the remainder
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=test_size, random_state=42)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=32)

    print(X_train.shape)

    # `[..., np.newaxis]` keeps the existing axes and appends a new last one
    X_train, X_validation, X_test = (
        split[..., np.newaxis] for split in (X_train, X_validation, X_test))

    for split in (X_train, X_validation, X_test):
        print(split.shape)

    return X_train, X_validation, X_test, y_train, y_validation, y_test

def plot_history(history):
    """Plot accuracy and loss curves for the training and validation data.

    Args:
        history: Object returned by `model.fit`; its `.history` dict must hold
            "loss", "val_loss" and an accuracy metric with its "val_" twin.

    Raises:
        KeyError: If the expected accuracy or loss entries are missing.
    """
    # The accuracy metric is recorded as "accuracy" by some Keras versions and
    # as "acc" by others, so use whichever one this run produced.
    acc_key = "accuracy" if "accuracy" in history.history else "acc"

    fig, axis = plt.subplots(2)

    # create accuracy subplot
    axis[0].plot(history.history[acc_key], label = "train accuracy" )
    axis[0].plot(history.history["val_" + acc_key], label = "test_accuracy" )
    axis[0].set_ylabel("Accuracy")
    axis[0].set_xlabel("Epochs")
    axis[0].legend(loc = "lower right")
    axis[0].set_title("Acccuracy eval")

    # create error subplot
    axis[1].plot(history.history["loss"], label = "train error" )
    axis[1].plot(history.history["val_loss"], label = "test error" )
    axis[1].set_ylabel("Error")
    axis[1].set_xlabel("Epochs")
    axis[1].legend(loc = "upper right")
    axis[1].set_title("Error eval")

    plt.show()



def predict(model, X, y):
    """Predict the class of one sample and print it next to the target.

    Args:
        model: Object with a Keras-style `predict(batch)` method.
        X: One input sample without the batch axis.
        y: Expected label of the sample (only used in the printout).

    Returns:
        np.ndarray holding the predicted class index for the one-sample batch.
    """
    # model.predict() works on batches, so add a leading batch axis of size 1
    X = X[np.newaxis, ...]
    print(X.shape)

    # perform prediction
    prediction = np.asarray(model.predict(X))

    if prediction.ndim == 2 and prediction.shape[1] == 1:
        # Single sigmoid unit: argmax over one column is always 0, so the
        # probability has to be thresholded instead.
        predicted_index = (prediction[:, 0] > 0.5).astype(int)
    else:
        # One score per class: pick the index of the largest one.
        predicted_index = np.argmax(prediction, axis=1)

    print("Target: {}, Predicted label: {}".format(y, predicted_index))
    return predicted_index



# Create train, validation and test set
X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(0.25, 0.20)

input_shape = (X_train.shape[1], X_train.shape[2], 1)

# Take the input shape from the data instead of a hard-coded (303, 128, 1).
new_input = Input(shape=input_shape)
sound_model = VGGish(include_top=False, load_weights=True, input_tensor = new_input)
x = sound_model.get_layer(name="conv4/conv4_2").output
flat1 = GlobalMaxPooling2D()(x)
class1 = Dense(1024, activation = 'relu')(flat1)
class2 = Dense(512, activation = 'relu')(class1)
class3 = Dense(256, activation = 'relu')(class2)
outputss = Dense(1, activation = "sigmoid")(class3)

# define new model (Keras 2 spells these keyword arguments `inputs` / `outputs`)
model = Model(inputs = sound_model.input, outputs = outputss)
model.summary()

# Freeze the VGGish layers so only the new dense head is trained.
for layer in sound_model.layers:
  layer.trainable = False

# compile the network
optimizer = optimizers.Adam(learning_rate=0.005)
# The loss is given by name so the cell does not depend on a `keras` name that
# this cell never imports.
model.compile(optimizer = optimizer,
              loss = "binary_crossentropy",
              metrics = ["accuracy"]
              )

# train the CNN
history = model.fit(X_train,
          y_train,
          validation_data = (X_validation, y_validation),
          batch_size = 32,
          epochs = 30
          )

# plot accuracy/error for training and validation
plot_history(history)

# evaluate the CNN on the test set
test_error, test_accuracy = model.evaluate(X_test, y_test, verbose = 1)
print("Accuracy on test set is {}".format(test_accuracy))

# pick a sample to predict from the test set
X_to_predict = X_test[50]
y_to_predict = y_test[50]

# predict sample
predict(model, X_to_predict, y_to_predict)



#### **Model 3 using Softmax and callbacks**

In [0]:
from preprocess_sound import preprocess_sound
import numpy as np
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

from keras import backend as K
from keras import optimizers
from keras.regularizers import l2
from keras.layers import Flatten, Input, Dense, GlobalMaxPooling2D
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, BatchNormalization, Dropout
from vggish import VGGish
from preprocess_sound import preprocess_sound


import json
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


DATASET_PATH = "/content/embeddings2.json"

def load_data(dataset_path):
    """Read the embeddings JSON file and return features plus column-shaped labels.

    Args:
        dataset_path: Path of a JSON file with "embeddings" and "labels" entries.

    Returns:
        Tuple (X, y): the "embeddings" entry as np.ndarray, and the "labels"
        entry as np.ndarray with one extra trailing axis of size 1.
    """
    with open(dataset_path, "r") as json_file:
        payload = json.load(json_file)

    features = np.array(payload["embeddings"])
    # append a trailing axis to the labels
    targets = np.array(payload["labels"])[..., np.newaxis]
    return features, targets

def prepare_datasets(test_size, validation_size):
    """Load the dataset and split it into train, validation and test parts.

    Args:
        test_size: `test_size` passed to the first train/test split.
        validation_size: `test_size` passed to the second split, which carves
            the validation set out of the remaining training data.

    Returns:
        Tuple (X_train, X_validation, X_test, y_train, y_validation, y_test);
        every X array has one extra trailing axis of size 1.
    """
    features, targets = load_data(DATASET_PATH)

    # hold out the test set first, then split validation off the remainder
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=test_size, random_state=42)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=32)

    print(X_train.shape)

    # `[..., np.newaxis]` keeps the existing axes and appends a new last one
    X_train, X_validation, X_test = (
        split[..., np.newaxis] for split in (X_train, X_validation, X_test))

    for split in (X_train, X_validation, X_test):
        print(split.shape)

    return X_train, X_validation, X_test, y_train, y_validation, y_test

def plot_history(history):
    """Plot accuracy and loss curves for the training and validation data.

    Args:
        history: Object returned by `model.fit`; its `.history` dict must hold
            "loss", "val_loss" and an accuracy metric with its "val_" twin.

    Raises:
        KeyError: If the expected accuracy or loss entries are missing.
    """
    # The accuracy metric is recorded as "accuracy" by some Keras versions and
    # as "acc" by others, so use whichever one this run produced.
    acc_key = "accuracy" if "accuracy" in history.history else "acc"

    fig, axis = plt.subplots(2)

    # create accuracy subplot
    axis[0].plot(history.history[acc_key], label = "train accuracy" )
    axis[0].plot(history.history["val_" + acc_key], label = "test_accuracy" )
    axis[0].set_ylabel("Accuracy")
    axis[0].set_xlabel("Epochs")
    axis[0].legend(loc = "lower right")
    axis[0].set_title("Acccuracy eval")

    # create error subplot
    axis[1].plot(history.history["loss"], label = "train error" )
    axis[1].plot(history.history["val_loss"], label = "test error" )
    axis[1].set_ylabel("Error")
    axis[1].set_xlabel("Epochs")
    axis[1].legend(loc = "upper right")
    axis[1].set_title("Error eval")

    plt.show()



def predict(model, X, y):
    """Print the predicted class index of one sample next to its target.

    Args:
        model: Object with a Keras-style `predict(batch)` method.
        X: One input sample without the batch axis.
        y: Expected label of the sample (only used in the printout).
    """
    # model.predict() works on batches, so wrap the sample in a batch of one
    batch = X[np.newaxis, ...]
    print(batch.shape)

    scores = model.predict(batch)

    # the predicted class is the position of the largest score
    best_class = np.argmax(scores, axis=1)

    print("Target: {}, Predicted label: {}".format(y, best_class))



# Create train, validation and test set
X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(0.25, 0.20)

input_shape = (X_train.shape[1], X_train.shape[2], 1)

# Take the input shape from the data instead of a hard-coded (303, 128, 1).
new_input = Input(shape=input_shape)
sound_model = VGGish(include_top=False, load_weights=True, input_tensor = new_input)

# Freeze the VGGish layers so only the new head is trained.
for layer in sound_model.layers:
  layer.trainable = False

# New classification head on top of the conv4_2 feature maps.
x = sound_model.get_layer(name="conv4/conv4_2").output
flat1 = GlobalAveragePooling2D()(x)
rt =  BatchNormalization()(flat1)
class1 = Dense(1024, activation = 'relu', kernel_regularizer=l2(0.01))(rt)
class1 = Dropout(0.5)(class1)
class2 = Dense(512, activation = 'relu', kernel_regularizer=l2(0.01))(class1)
class2 = Dropout(0.5)(class2)
class2 = Dense(512, activation = 'relu', kernel_regularizer=l2(0.01))(class2)
class2 = Dropout(0.5)(class2)
class3 = Dense(256, activation = 'relu', kernel_regularizer=l2(0.01))(class2)
outputss = Dense(2, activation = "softmax")(class3)

# define new model (Keras 2 spells these keyword arguments `inputs` / `outputs`)
model = Model(inputs = sound_model.input, outputs = outputss)
model.summary()



# `min_lr` must be below the optimizer's starting learning rate (0.0001 below);
# with the previous min_lr of 0.001 the callback could never lower the rate.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, mode = "auto",
                              patience=5, min_delta=0.0001, cooldown=0, min_lr=1e-6)

# Stop when val_loss has not improved for 6 epochs and restore the best weights.
early_stoping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=6, verbose=1,
              mode='auto', baseline=None, restore_best_weights=True)

# compile the network
optimizer = optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer = optimizer,
              loss = "sparse_categorical_crossentropy",
              metrics = ["accuracy"]
              )

# train the CNN
history = model.fit(X_train,
          y_train,
          validation_data = (X_validation, y_validation),
          batch_size = 32,
          epochs = 350,
          callbacks = [reduce_lr, early_stoping]
          )

# plot accuracy/error for training and validation
plot_history(history)

# evaluate the CNN on the test set
test_error, test_accuracy = model.evaluate(X_test, y_test, verbose = 1)
print("Accuracy on test set is {}".format(test_accuracy))

# pick a sample to predict from the test set
X_to_predict = X_test[50]
y_to_predict = y_test[50]

# predict sample
predict(model, X_to_predict, y_to_predict)
