In [1]:
import librosa
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from scipy import ndimage
from keras.models import load_model

2024-05-06 19:40:48.193675: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def resize_spectrogram(spectrogram, target_size):
    """Rescale a 1-D or 2-D array with linear interpolation.

    Parameters
    ----------
    spectrogram : numpy.ndarray
        Array with one or two dimensions. A 1-D array of length ``n`` is
        first promoted to shape ``(1, n)``.
    target_size : sequence of int
        Desired ``(rows, cols)`` for 2-D input. For 1-D input only
        ``target_size[0]`` is read and used as the new length; the leading
        axis keeps its size of 1.

    Returns
    -------
    numpy.ndarray
        Result of ``scipy.ndimage.zoom`` called with ``order=1``.

    Raises
    ------
    ValueError
        If ``spectrogram`` has neither one nor two dimensions.
    """
    n_dims = spectrogram.ndim
    if n_dims not in (1, 2):
        raise ValueError("Input spectrogram must have either one or two dimensions")

    if n_dims == 1:
        # Treat the vector as a single-row matrix; only its length is scaled.
        spectrogram = np.atleast_2d(spectrogram)
        scale = (1, target_size[0] / spectrogram.shape[1])
    else:
        n_rows, n_cols = spectrogram.shape
        scale = (target_size[0] / n_rows, target_size[1] / n_cols)

    # order=1 selects linear (bilinear in 2-D) spline interpolation.
    return ndimage.zoom(spectrogram, scale, order=1)

In [3]:
# Restore the saved Keras model from disk (path is relative to the working directory)
model = load_model('model_classification.keras')

# Human-readable labels; predict_class indexes this list with the argmax of
# the model output, so the order here must match the model's output order.
classes = [
    "air conditioner",
    "car horn",
    "children playing",
    "dog bark",
    "drilling",
    "engine idling",
    "gun shot",
    "jackhammer",
    "siren",
    "street music",
]

# Define function to predict and print classified sound
def predict_class(sound_file):
    """Classify one audio file and print the predicted label.

    The file is loaded with librosa, converted to a mel spectrogram in dB,
    resized to 224x224, replicated to three channels and passed to the
    module-level ``model`` as a batch of one. The index of the largest model
    output is looked up in the module-level ``classes`` list.

    Parameters
    ----------
    sound_file : path accepted by ``librosa.load``
        Audio file to classify.

    Returns
    -------
    None
        The label is printed as ``Sound is: <label>``; nothing is returned.
    """
    # sr=None keeps the file's native sampling rate instead of resampling.
    waveform, sample_rate = librosa.load(sound_file, sr=None)

    # Mel power spectrogram, expressed in dB relative to its own maximum.
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=waveform, sr=sample_rate),
        ref=np.max,
    )

    # Resize, then append a channel axis and copy it three times.
    resized = resize_spectrogram(mel_db, target_size=(224, 224))
    three_channel = np.repeat(np.expand_dims(resized, axis=-1), 3, axis=-1)

    # A leading axis turns the single sample into a batch of one.
    batch = np.expand_dims(three_channel, axis=0)
    prediction = model.predict(batch)

    # argmax over the whole output array picks the top-scoring entry.
    best_index = np.argmax(prediction)
    classified_sound = classes[best_index]
    print(f"Sound is: {classified_sound}")

Sound is: drilling


In [4]:
# Example usage: classify a single clip; predict_class prints the label.
# NOTE(review): if this file follows the UrbanSound8K naming scheme
# (fsID-classID-occurrenceID-sliceID), the second field (3) would correspond
# to "dog bark" in `classes`, while the recorded output says "drilling" --
# confirm whether this is a misclassification or a preprocessing mismatch.
test_sound_file = 'urbansounds/fold10/7913-3-0-0.wav'
predict_class(test_sound_file)

Sound is: drilling
