# A Hybrid model using Pre-trained VGGish and a Feed Forward Network

### Importing libraries and loading the pre-trained VGGish model

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
import re
import cv2
import os
from tqdm import tqdm


# Fetch the pre-trained VGGish audio-embedding model from TensorFlow Hub.
# The handle is kept in a named constant so it is easy to spot and change.
VGGISH_HUB_URL = 'https://tfhub.dev/google/vggish/1'
tf_hub_module = hub.load(VGGISH_HUB_URL)

In [None]:
# Download and unpack the ESC-50 dataset.
# -nc : do not download again if master.zip already exists (no master.zip.1 on re-run)
# -n  : never overwrite files that were already extracted (no interactive prompt on re-run)
# -q  : keep the cell output short instead of listing every extracted file
!wget -nc -q https://github.com/karoldvl/ESC-50/archive/master.zip
!unzip -n -q master.zip

### Loading the audio files from the downloaded dataset

In [None]:
audio_files = []
PATH = '/content/ESC-50-master/audio/'

# VGGish expects 16 kHz waveforms; librosa's default (22050 Hz) would feed the
# model audio at the wrong rate, so resample explicitly while loading.
VGGISH_SAMPLE_RATE = 16000

# The class number is the last number in the file name, right before ".wav".
label_pattern = re.compile(r'-(\d{1,2})\.wav$')

# Sort the listing so the dataset order (and therefore the seeded train/test
# split later on) is the same on every run and every machine.
for file_name in tqdm(sorted(os.listdir(PATH))):
    label_match = label_pattern.search(file_name)
    if label_match is None:
        print(f"Skipping {file_name}: no class number found in the file name")
        continue

    try:
        audio, sampling_rate = librosa.load(os.path.join(PATH, file_name), sr=VGGISH_SAMPLE_RATE)
    except Exception as e:
        # Keep going on unreadable files, but say so instead of dropping them silently.
        print(f"Skipping {file_name}: {e}")
        continue

    # Each entry is a [waveform, class number] pair.
    audio_files.append([audio, int(label_match.group(1))])


100%|██████████| 2000/2000 [00:20<00:00, 99.12it/s] 


In [None]:
# Sanity check: number of clips that were loaded.
print(len(audio_files))

# classes.csv has one column per sound category; each column lists the class
# names that belong to that category.
df = pd.read_csv('/content/classes.csv', delimiter=',')

categories = df.columns.tolist()

# Flatten the first five columns, column by column, into one list of class names.
classes = [name for column in range(5) for name in df[categories[column]].values]
df

2000


Unnamed: 0,Animals,Natural soundscapes & water sounds,Human/ non-speech sounds,Interior/domestic sounds,Exterior/urban noises
0,Dog,Rain,Crying baby,Door knock,Helicopter
1,Rooster,Sea waves,Sneezing,Mouse click,Chain saw
2,Pig,Crackling fire,Clapping,Keyboard typing,Siren
3,Cow,Crickets,Breathing,"Door,wood creaks",Car horn
4,Frog,Chirping birds,Coughing,Can opening,Engine
5,Cat,Water drops,Footsteps,Washing machine,Train
6,Hen,Wind,Laughing,Vacuum cleaner,Church bells
7,Insects (flying),Pouring water,Brushing teeth,Clock alarm,Airplane
8,Sheep,Toilet flush,Snoring,Clock tick,Crackers
9,Crow,Thunderstorm,Drinking/sipping,Glass breaking,Hand saw


In [None]:
# Sanity check: number of loaded clips, then number of class names.
print(len(audio_files), len(classes), sep='\n')

2000
50


In [None]:
# Split the [waveform, label] pairs into parallel containers.
# Building them directly avoids np.array() on ragged [ndarray, int] pairs, which
# emits a deprecation warning on older NumPy and raises ValueError on newer
# releases. It also leaves `audio_files` untouched, so this cell can be re-run.
X = [waveform for waveform, _ in audio_files]
Y = np.array([label for _, label in audio_files], dtype=np.int64)

  audio_files= np.array(audio_files)


## Data Augmentation
White noise is added to every clip to create a larger, more robust dataset and reduce overfitting.

In [None]:
# Augmentation takes a few seconds.
NOISE_STD = 0.005   # standard deviation of the added white noise
NOISE_SEED = 0      # fixed seed so the augmented dataset is reproducible

noise_rng = np.random.default_rng(NOISE_SEED)
number_of_audio_files = len(Y)

# Build the noisy copies first and extend X once afterwards, so the loop never
# iterates over a list that is growing underneath it.
noisy_audio_files = []
for clip in X[:number_of_audio_files]:
    # Size the noise per clip (clips need not share one length) and keep the
    # clip's own dtype so noisy copies are not silently promoted to float64.
    noise = noise_rng.standard_normal(len(clip)).astype(clip.dtype, copy=False)
    noisy_audio_files.append(clip + NOISE_STD * noise)
X.extend(noisy_audio_files)

# Every noisy copy keeps the label of the clip it was derived from.
Y = np.r_[Y, Y]

### Feature Extraction using pre-trained VGGish for each audio file
VGGish produces one 128-dimensional embedding for each roughly 1-second (0.96 s) frame of the audio signal, so the frame embeddings are averaged over time to obtain a single feature vector per audio file.

In [None]:
import numpy
def extract_features(audio_files, labels, stats=None, return_stats=False):
    """Turn raw waveforms into standardised, clip-level VGGish feature vectors.

    For every waveform the VGGish embeddings are averaged over axis 0 and a
    constant ``num_bands`` value is appended. All vectors are then standardised
    with a single scalar mean and standard deviation.

    Parameters
    ----------
    audio_files : sequence
        Waveforms, each passed as-is to ``tf_hub_module``.
    labels : sequence
        Class number for each waveform, aligned with ``audio_files``.
    stats : tuple (mean, std), optional
        Normalisation statistics to apply. When omitted they are computed from
        the extracted features themselves (the original behaviour). Pass the
        statistics of the training data to scale new audio the same way.
    return_stats : bool, default False
        When True, additionally return the ``(mean, std)`` that was applied.

    Returns
    -------
    features : numpy.ndarray
        One standardised feature vector per successfully processed waveform.
    valid_labels : numpy.ndarray of int32
        Labels of the successfully processed waveforms, in the same order.
    (mean, std) : tuple of float
        Only when ``return_stats`` is True.
    """
    features = []
    valid_labels = []
    for i, audio in enumerate(audio_files):
        try:
            # Look the label up first so a failure never leaves features and
            # labels misaligned.
            label = labels[i]

            # Extract VGGish features from the audio using the pre-trained model
            vggish_features = tf_hub_module(audio)

            # Collapse the per-frame embeddings into one vector per clip
            vggish_features_mean = tf.reduce_mean(vggish_features, axis=0)

            # Constant placeholder feature; kept so the vector length matches
            # what the rest of the notebook builds for single clips.
            num_bands = 1  # replace with the actual number of bands
            feature_with_bands = np.concatenate([vggish_features_mean.numpy(), [num_bands]])
        except Exception as err:
            # Best effort: skip the clip, but report which one and why instead
            # of dumping the whole waveform to the output.
            print(f"Failed to extract features for clip {i}: {err}")
            continue

        features.append(feature_with_bands)
        valid_labels.append(label)

    features = np.array(features)

    # Normalize the features with one scalar mean / std
    if stats is not None:
        mean, std = stats
    elif features.size:
        mean, std = features.mean(), features.std()
    else:
        # Nothing was extracted: use the identity transform instead of NaN.
        mean, std = 0.0, 1.0
    if std == 0:
        std = 1.0  # avoid dividing by zero when all values are identical
    if features.size:
        features = (features - mean) / std

    # Convert the labels to integers
    valid_labels = np.array(valid_labels, dtype=np.int32)

    if return_stats:
        return features, valid_labels, (float(mean), float(std))
    return features, valid_labels


In [None]:
# Run the VGGish feature extraction over the original and augmented clips (X),
# keeping the labels (Y) of the clips that were processed successfully.
features, labels = extract_features(audio_files=X, labels=Y)

# Sanity check: (number of clips, features per clip).
print(features.shape)


(4000, 129)


### Splitting the data into training and test set

In [None]:
# split the data into training and testing sets
from sklearn.model_selection import train_test_split
# Stratify on the labels so every class is represented in the same proportion
# in the training and the test set.
# NOTE(review): the noisy copies were created before this split, so a test clip
# and its noisy twin can end up on opposite sides of it. The reported test
# accuracy is therefore likely optimistic; splitting before augmenting would
# avoid this.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=5,
    stratify=labels,
)

print(x_train.shape)

(3200, 129)


### Feeding the extracted features into a simple Feedforward network

Once the features are extracted, they are fed into a simple feed-forward network of Dense layers for training.

In [None]:
import tensorflow as tf
# Import everything from tensorflow.keras: mixing the standalone `keras` package
# with `tensorflow.keras` can pair objects from two different Keras versions.
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten

NUM_CLASSES = 50  # size of the softmax output layer / number of class names

# x_train is already a 2-D (samples, features) matrix, which is what a Dense
# network expects, so no reshape is needed. The raw-audio list `X` and the
# integer `labels` are deliberately left untouched so earlier cells stay re-runnable.
num_samples, num_features = x_train.shape

print(x_train.shape)

# Convert labels to one-hot encoded format. num_classes is fixed explicitly so
# the width always matches the output layer, whichever labels are in the split.
y_train_onehot = to_categorical(y_train, num_classes=NUM_CLASSES)

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=num_features))
model.add(Dense(NUM_CLASSES, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


# Train model; the last 20% of the training rows are held out for validation.
history = model.fit(x_train, y_train_onehot, batch_size=32, epochs=20, validation_split=0.2)


(3200, 129)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fdb308e3970>

### Testing the accuracy of the model on the test set

In [None]:
# Take the one-hot width from the model itself: to_categorical() would otherwise
# infer it from the largest label present in y_test, which breaks evaluation
# whenever the highest class number is missing from the test split.
num_output_classes = model.output_shape[-1]
y_test_onehot = to_categorical(y_test, num_classes=num_output_classes)

# x_test is already (samples, features); no reshape is required for a Dense model.
_, accuracy = model.evaluate(x_test, y_test_onehot, verbose=0)

print(accuracy*100)

77.24999785423279


### Testing some random audio files picked from the test set to check the predicted classes



In [None]:
def test_model(test_features, test_labels):
    
    index = np.random.randint(0,len(test_features))
    feature = test_features[index]
    feature = feature.reshape(1, feature.shape[0], 1)

    # Predict the class of the feature using the trained model
    prediction = model.predict(feature)
    predicted_class = np.argmax(prediction, axis=1)

    print("Predicted category:", categories[predicted_class[0]//10])
    print("Actual category:", categories[test_labels[index]//10])

    print('\nPredicted class:', classes[predicted_class[0]])
    print('Actual class:', classes[test_labels[index]])




In [None]:
# Spot-check two randomly drawn samples from the test set.
for attempt in range(2):
    test_model(test_features=x_test, test_labels=y_test)

Predicted category: Natural soundscapes & water sounds
Actual category: Natural soundscapes & water sounds

Predicted class: Sea waves
Actual class: Sea waves
Predicted category: Interior/domestic sounds
Actual category: Interior/domestic sounds

Predicted class: Keyboard typing
Actual class: Keyboard typing


### Testing our case study audio 

In [None]:
from IPython.display import Audio

# Load the case-study clip at the same sampling rate as the dataset clips:
# `sampling_rate` still holds the rate returned by the dataset loading loop, so
# the embeddings here are computed from audio at the rate the network was trained on.
audio, sampling_rate = librosa.load('/content/113203-5-0-0.wav', sr=sampling_rate)

# Same feature recipe as for the training clips: VGGish embeddings averaged
# over axis 0, plus the constant num_bands value.
vggish_features = tf_hub_module(audio)
vggish_features_mean = tf.reduce_mean(vggish_features, axis=0)
num_bands = 1
feature_with_bands = np.concatenate([vggish_features_mean.numpy(), [num_bands]])
print(feature_with_bands.size)

# NOTE(review): the training features were standardised inside extract_features
# with a dataset-wide mean/std that is not kept anywhere, so this vector reaches
# the network unscaled. That mismatch may explain a wrong prediction here; the
# same statistics should be applied before trusting this result.
test_features = feature_with_bands.reshape(1, -1)  # one sample, (1, n_features)

prediction = model.predict(test_features)
predicted_class = np.argmax(prediction, axis=1)

print("Predicted category:", categories[predicted_class[0]//10])
print("Actual category:","Exterior/urban noises")

print('\nPredicted class:', classes[predicted_class[0]])
print('Actual class:',"Engine")

# Play the clip so the prediction can be compared by ear.
Audio(audio, rate=sampling_rate)


129
Predicted category: Natural soundscapes & water sounds
Actual category: Exterior/urban noises

Predicted class: Wind
Actual class: Engine


In [None]:
# Print the layer-by-layer architecture and parameter counts of the network.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 64)                8320      
                                                                 
 dense_7 (Dense)             (None, 50)                3250      
                                                                 
Total params: 11,570
Trainable params: 11,570
Non-trainable params: 0
_________________________________________________________________
