### **Mel Spectrograms Method**

In [0]:
import librosa
import librosa.display
import numpy as np

from vggish_input import waveform_to_examples

# Single example clip used to sanity-check the mel-spectrogram pipeline.
path = "/content/drive/My Drive/small_dataset/not_sick/Copy of audioset_EQzYcBJ1Dec_100_105.wav"

# Resample to 22050 Hz, the same rate the dataset-extraction cell uses
# (SAMPLE_RATE). The previous value, 22100, looked like a typo for it.
sr = 22050
# `sr` is passed by keyword: recent librosa releases reject it positionally.
signal, sr = librosa.load(path, sr=sr)

# Power mel spectrogram with 128 mel bands capped at 8 kHz, then converted to
# decibels relative to the loudest bin.
S = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=128, fmax=8000)
S_dB = librosa.power_to_db(S, ref=np.max)

print(S_dB.shape)
print(S.shape)

In [0]:
import matplotlib.pyplot as plt

# Draw the dB-scaled mel spectrogram computed in the previous cell.
# An explicit figure/axes pair is used so the colorbar is bound to the image
# returned by specshow rather than to whatever pyplot considers "current".
fig, ax = plt.subplots(figsize=(10, 4))
img = librosa.display.specshow(S_dB, x_axis='time',
                               y_axis='mel', sr=sr,
                               fmax=8000, ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set_title('Mel-frequency spectrogram')
fig.tight_layout()
plt.show()

In [0]:
# Extract log-mel spectrogram segments from the dataset and save them as JSON

import os 
import librosa
import math
import json
from vggish_input import waveform_to_examples

# Root folder that save_mfcc walks; each sub-folder name is used as a class label.
DATASET_PATH = "/content/drive/My Drive/test_dataset"
# Output file for the extracted features (relative to the current working directory).
JSON_PATH = "data5.json"
# Sampling rate every audio file is resampled to when loaded.
SAMPLE_RATE = 22050
DURATION = 4  # assumed length of each track, measured in seconds.
# Number of samples of each track that get split into segments.
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION

def save_mfcc(dataset_path, json_path, n_mfcc = 13, n_fft = 2048, hop_length= 512, num_segments = 5):
    """Extract log-mel spectrogram segments from a labelled audio folder and save them as JSON.

    Every sub-folder of ``dataset_path`` is treated as one class. Each audio
    file is loaded at the module-level ``SAMPLE_RATE``, the first
    ``SAMPLES_PER_TRACK`` samples are cut into ``num_segments`` equal pieces,
    and a dB-scaled mel spectrogram (128 mel bands, fmax 8000 Hz) is computed
    per piece and stored frames-first (transposed).

    Args:
        dataset_path: Root folder containing one sub-folder per class.
        json_path: Destination of the JSON file.
        n_mfcc: Unused; kept only so existing callers keep working.
        n_fft: FFT window size passed to the mel spectrogram.
        hop_length: Hop between successive frames, in samples.
        num_segments: Number of equal-length segments per track.

    Raises:
        ValueError: If ``num_segments`` is smaller than 1.

    The written JSON object has three keys: ``"mapping"`` (class names, the
    index is the numeric label), ``"spectograms"`` (one nested list per
    segment) and ``"labels"`` (one integer per segment).
    """
    if num_segments < 1:
        raise ValueError("num_segments must be at least 1")

    data = {
        "mapping": [],
        "spectograms": [],
        "labels": []
    }

    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)

    root = os.fspath(dataset_path)

    # 1) Loop through all the class folders
    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes os.walk visit sub-folders in a stable order,
        # so label indices do not depend on the file system.
        dirnames.sort()

        # Skip the root level: it holds the class folders, not audio files.
        # Compare by value -- identity (`is`) on strings is not reliable.
        if dirpath == root:
            continue

        # The class name is the last path component; basename handles both
        # "/" and "\\" separators depending on the platform.
        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        label = len(data["mapping"]) - 1

        print("\n Processing {}".format(semantic_label))

        # 2) Process every file of this class
        for f in sorted(filenames):
            # os.walk only yields bare file names; build the full path to load it
            file_path = os.path.join(dirpath, f)
            signal, sr = librosa.load(file_path, sr = SAMPLE_RATE)

            # 3) Split the signal into segments and extract one spectrogram each
            for s in range(num_segments):
                start_sample = num_samples_per_segment * s
                finish_sample = start_sample + num_samples_per_segment
                segment = signal[start_sample: finish_sample]

                # Only full-length segments are stored. A shorter one would
                # have fewer frames and make the dataset ragged; every later
                # segment of the same file would be shorter still.
                if len(segment) < num_samples_per_segment:
                    print("{}, segment : {} skipped (too short)".format(file_path, s))
                    break

                S = librosa.feature.melspectrogram(y=segment, sr=sr, n_fft=n_fft,
                                                   hop_length=hop_length,
                                                   n_mels=128, fmax=8000)
                log_mel = librosa.power_to_db(S, ref=np.max).T

                # numpy arrays are not JSON serialisable, hence tolist()
                data["spectograms"].append(log_mel.tolist())
                data["labels"].append(label)

                print("{}, segment : {}".format(file_path, s))

    # final step: saving everything as a json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent = 4)
        
        
if __name__ == "__main__":
    # Build the feature file, cutting every track into four equal segments.
    save_mfcc(
        DATASET_PATH,
        JSON_PATH,
        num_segments=4,
    )
                


#### **Model trained on Mel Spectrograms**

In [0]:
from preprocess_sound import preprocess_sound
import numpy as np
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

from keras import backend as K
from keras import optimizers
from keras.regularizers import l2
from keras.layers import Flatten, Input, Dense, GlobalMaxPooling2D
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, BatchNormalization, Dropout
from vggish import VGGish
from preprocess_sound import preprocess_sound


import json
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# JSON feature file read by load_data() via prepare_datasets().
# NOTE(review): this rebinds DATASET_PATH, which the extraction cell uses for
# the audio folder. That cell writes the relative path "data5.json", so this
# absolute path only matches when the working directory is /content -- confirm.
DATASET_PATH = "/content/data5.json"

def load_data(dataset_path):
    """Read the feature JSON file and return its contents as numpy arrays.

    Args:
        dataset_path: Path of a JSON file holding the keys ``"spectograms"``
            and ``"labels"``.

    Returns:
        A tuple ``(X, y)``: the array built from ``"spectograms"`` and the
        array built from ``"labels"``.
    """
    with open(dataset_path, "r") as json_file:
        payload = json.load(json_file)

    # Turn the plain nested lists back into numpy arrays.
    features, targets = (np.array(payload[key]) for key in ("spectograms", "labels"))
    return features, targets

def prepare_datasets(test_size, validation_size):
    """Load the features from DATASET_PATH and split them into train/validation/test.

    Args:
        test_size: Share of all samples held out as the test set.
        validation_size: Share of the remaining samples held out for validation.

    Returns:
        ``(X_train, X_validation, X_test, y_train, y_validation, y_test)``,
        where every ``X_*`` array has an extra trailing axis of length 1.
    """
    features, targets = load_data(DATASET_PATH)

    # First carve out the test set, then split what is left into train/validation.
    X_remaining, X_test, y_remaining, y_test = train_test_split(
        features, targets, test_size=test_size, random_state=42
    )
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_remaining, y_remaining, test_size=validation_size, random_state=32
    )

    print(X_train.shape)

    # Append a trailing axis of length 1 to every feature array; the network
    # input is declared with a final dimension of 1.
    X_train, X_validation, X_test = (
        np.expand_dims(subset, axis=-1) for subset in (X_train, X_validation, X_test)
    )

    for subset in (X_train, X_validation, X_test):
        print(subset.shape)

    return X_train, X_validation, X_test, y_train, y_validation, y_test

def plot_history(history):
    """Plot training vs. validation accuracy and loss over the epochs.

    Args:
        history: Object returned by ``model.fit``; its ``history`` attribute
            is a dict of per-epoch metric lists.

    The ``val_*`` series come from the validation data passed to ``fit``, so
    they are labelled "validation" (not "test").
    """
    metrics = history.history

    # Some older Keras releases are believed to log accuracy as "acc"/"val_acc"
    # -- fall back to that key when "accuracy" is absent.
    acc_key = "accuracy" if "accuracy" in metrics else "acc"

    fig, axis = plt.subplots(2)

    # create accuracy subplot
    axis[0].plot(metrics[acc_key], label = "train accuracy")
    axis[0].plot(metrics["val_" + acc_key], label = "validation accuracy")
    axis[0].set_ylabel("Accuracy")
    axis[0].set_xlabel("Epochs")
    axis[0].legend(loc = "lower right")
    axis[0].set_title("Accuracy eval")

    # create error subplot
    axis[1].plot(metrics["loss"], label = "train error")
    axis[1].plot(metrics["val_loss"], label = "validation error")
    axis[1].set_ylabel("Error")
    axis[1].set_xlabel("Epochs")
    axis[1].legend(loc = "upper right")
    axis[1].set_title("Error eval")

    # Keep the upper x-label from colliding with the lower title.
    fig.tight_layout()
    plt.show()



def predict(model, X, y):
    """Run the model on a single sample and print the target next to the prediction.

    Args:
        model: Object exposing ``predict(batch)`` that returns per-class scores.
        X: One input sample, without the leading batch axis.
        y: Expected label of the sample (only printed).
    """
    # model.predict() works on batches, so wrap the sample in a batch of one.
    batch = np.expand_dims(X, axis=0)
    print(batch.shape)

    class_scores = model.predict(batch)

    # The predicted class is the position of the highest score in each row.
    predicted_index = np.argmax(class_scores, axis=1)

    message = "Target: {}, Predicted label: {}".format(y, predicted_index)
    print(message)



# Create train, validation and test set
X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(0.25, 0.20)

# Derive the network input from the data itself instead of hard-coding it, so
# the model keeps matching the features when the segment length changes.
input_shape = (X_train.shape[1], X_train.shape[2], 1)

new_input = Input(shape=input_shape)
sound_model = VGGish(include_top=False, load_weights=True, input_tensor = new_input)

# Freeze the pretrained VGGish layers; only the new classifier head is trained.
for layer in sound_model.layers:
    layer.trainable = False

# Classifier head on top of the conv4/conv4_2 feature maps.
x = sound_model.get_layer(name="conv4/conv4_2").output
x = GlobalAveragePooling2D()(x)
class1 = Dense(1024, activation = 'relu', kernel_regularizer=l2(0.01))(x)
class1 = Dropout(0.5)(class1)
class2 = Dense(512, activation = 'relu', kernel_regularizer=l2(0.01))(class1)
class2 = Dropout(0.5)(class2)
class2 = Dense(512, activation = 'relu', kernel_regularizer=l2(0.01))(class2)
class2 = Dropout(0.5)(class2)
class3 = Dense(256, activation = 'relu', kernel_regularizer=l2(0.01))(class2)
outputss = Dense(2, activation = "softmax")(class3)

# define new model (`inputs`/`outputs` are the current keyword names; the
# singular `input`/`output` spellings are legacy)
model = Model(inputs = sound_model.input, outputs = outputss)
model.summary()

# min_lr must sit BELOW the initial learning rate (0.0002). The previous value
# of 0.001 was above it, so the callback could never lower the rate.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, mode = "auto",
                              patience=5, min_delta=0.0001, cooldown=0, min_lr=1e-6)

early_stoping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=9, verbose=1,
              mode='auto', baseline=None, restore_best_weights=True)

# compile the network
optimizer = optimizers.Adamax(learning_rate=0.0002, beta_1=0.9, beta_2=0.999)
model.compile(optimizer = optimizer,
              loss = "sparse_categorical_crossentropy",
              metrics = ["accuracy"]
              )

# train the CNN
history = model.fit(X_train,
          y_train,
          validation_data = (X_validation, y_validation),
          batch_size = 32,
          epochs = 300,
          callbacks = [reduce_lr, early_stoping]
          )

# plot accuracy/error for training and validation
plot_history(history)

# evaluate the CNN on the test set
test_error, test_accuracy = model.evaluate(X_test, y_test, verbose = 1)
print("Accuracy on test set is {}".format(test_accuracy))

# pick a sample to predict from the test set; clamp the index so a small test
# set does not raise IndexError
sample_index = min(50, len(X_test) - 1)
X_to_predict = X_test[sample_index]
y_to_predict = y_test[sample_index]

# predict sample
predict(model, X_to_predict, y_to_predict)

