# Using a Pre-trained model for Audio Classification

###Loading all the libraries and the Pre-trained VGGish model from TFHub

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re
import cv2
import os
from tqdm import tqdm


# Load the pre-trained VGGish model from TFHub
tf_hub_module = hub.load('https://tfhub.dev/google/vggish/1')

### Downloading and unziping the Dataset

In [None]:
!wget https://github.com/karoldvl/ESC-50/archive/master.zip
!unzip master.zip

### Read the classes

In [5]:
df = pd.read_csv('/content/classes.csv',delimiter = ',') 

categories = df.columns.tolist()

classes =      list(df[categories[0]].values)
classes.extend(list(df[categories[1]].values))
classes.extend(list(df[categories[2]].values))
classes.extend(list(df[categories[3]].values))
classes.extend(list(df[categories[4]].values))
df

Unnamed: 0,Animals,Natural soundscapes & water sounds,Human/ non-speech sounds,Interior/domestic sounds,Exterior/urban noises
0,Dog,Rain,Crying baby,Door knock,Helicopter
1,Rooster,Sea waves,Sneezing,Mouse click,Chain saw
2,Pig,Crackling fire,Clapping,Keyboard typing,Siren
3,Cow,Crickets,Breathing,"Door,wood creaks",Car horn
4,Frog,Chirping birds,Coughing,Can opening,Engine
5,Cat,Water drops,Footsteps,Washing machine,Train
6,Hen,Wind,Laughing,Vacuum cleaner,Church bells
7,Insects (flying),Pouring water,Brushing teeth,Clock alarm,Airplane
8,Sheep,Toilet flush,Snoring,Clock tick,Crackers
9,Crow,Thunderstorm,Drinking/sipping,Glass breaking,Hand saw


### Load the audio files from the Dataset

In [6]:
audio_files = []
PATH = '/content/ESC-50-master/audio/'
for file_name in tqdm(os.listdir(PATH)):
    try:
        audio, sampling_rate = librosa.load(os.path.join(PATH,file_name))
        
        exp = re.findall('\d{1,2}.wav',file_name)
        
        audio_files.append([audio,int(float(exp[0][0:2]))])
    except Exception as e:
        pass



100%|██████████| 2000/2000 [00:20<00:00, 95.73it/s] 


In [7]:
print(len(audio_files))

2000


In [8]:
audio_files_load= np.array(audio_files)
X = list(audio_files_load[:,0])
Y = audio_files_load[:,1]


  audio_files_load= np.array(audio_files)


### Data Augmentation

In [9]:

size_of_audio_files = len(X[0])
number_of_audio_files = len(Y)

augmented_audio_files = []

for i in range(number_of_audio_files): 
    
    # Adding white noise
    X.append(X[i] + 0.005*np.random.randn(size_of_audio_files))
    
Y = np.r_[Y,Y]


### Extracting features of Audiofiles using a pretrained model(VGGish)

In [10]:
import numpy
def extract_features(audio_files, labels):


    # Extract VGGish features from the audio samples
    features = []
    valid_labels = []
    for i, audio in enumerate(audio_files):
        try:

            # Extract VGGish features from the audio using the pre-trained model
            vggish_features = tf_hub_module(audio)           

            # Take the average of the VGGish features for each second of audio
            vggish_features_mean = tf.reduce_mean(vggish_features, axis=0)
            

            # Append the features to the list
            features.append(vggish_features_mean.numpy())  

            valid_labels.append(labels[i])
        except:
            print(f"Failed to load {audio}")

    # Normalize the features
    features = numpy.array(features)
    features = (features - features.mean()) / features.std()

    # Convert the labels to integers
    valid_labels = np.array(valid_labels, dtype=np.int32)

    return features, valid_labels



In [11]:
features, labels = extract_features(X, Y)


### Splitting the data and definfing a simple Logistic Regression Classifier

In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

# Train the logistic regression model
clf = LogisticRegression(max_iter=500, random_state=7)
clf.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {acc*100}')



Test accuracy: 84.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Testing the classifier for our case study sample

In [34]:
audio, sampling_rate = librosa.load('/content/113203-5-0-0.wav')
audio_file= np.array(audio)

feature = []
vggish_features = tf_hub_module(audio_file)           

vggish_features_mean = tf.reduce_mean(vggish_features, axis=0)
feature.append(vggish_features_mean.numpy())  

label_pred = clf.predict(feature)
# Print the predicted label

print(f'Predicted label: {classes[label_pred[0]]}')




Predicted label: Helicopter
