In [1]:
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Flatten, Dense

from tensorflow.keras.callbacks import EarlyStopping
import os
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet import preprocess_input

from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from keras.preprocessing import image
import librosa


In [3]:
# Sound categories of the data set; a class's position in this list is later
# used as its integer label.
classes = ["children_playing", "drilling", "street_music", "siren", "gun_shot", "car_horn", "air_conditioner", "engine_idling",  "dog_bark", "jackhammer"]

# Root folder of the data set. Set the URBANSOUND_DIR environment variable to
# run the notebook on another machine; the original location stays the default.
directory = os.environ.get('URBANSOUND_DIR', 'C:\\Users\\giorg\\OneDrive\\Υπολογιστής\\DL Project\\UrbanSound')
# All relative paths used below (Train/..., Images/...) resolve against this folder.
os.chdir(directory)


In [4]:
def create_spectrogram(audio_file, image_file):
    """Render the log-scaled mel spectrogram of an audio file and save it as an image.

    Parameters
    ----------
    audio_file : str
        Path of the audio file, read with ``librosa.load``.
    image_file : str
        Path of the image written with ``Figure.savefig``.

    Returns
    -------
    None
        The only result is the image file on disk.
    """
    # Import the submodule explicitly: older librosa releases do not expose
    # ``librosa.display`` after a bare ``import librosa``.
    import librosa.display

    # Decode and transform the audio before opening a figure, so a failed
    # load cannot leave a figure behind.
    y, sr = librosa.load(audio_file)
    ms = librosa.feature.melspectrogram(y=y, sr=sr)
    log_ms = librosa.power_to_db(ms, ref=np.max)

    fig = plt.figure(figsize=(4, 4))  # 4x4 inches figure to help with aspect ratio
    try:
        ax = fig.add_subplot(1, 1, 1)
        # Let the axes fill the whole figure so the image has no margins.
        fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
        librosa.display.specshow(log_ms, sr=sr, ax=ax)
        fig.savefig(image_file, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even if plotting or saving failed;
        # this function is called once per audio file.
        plt.close(fig)


def datapreprocessing(x, y):
    """Scale the samples by 1/255 and one-hot encode the labels.

    Parameters
    ----------
    x : array-like
        Samples; converted to a ``float32`` NumPy array and divided by 255.
    y : array-like
        Labels; mapped to integer ids with a freshly fitted ``LabelEncoder``
        and then expanded with ``to_categorical``.

    Returns
    -------
    tuple
        ``(scaled_samples, one_hot_labels)``.
    """
    scaled_samples = np.array(x).astype('float32') / 255.0

    label_ids = LabelEncoder().fit_transform(y)
    one_hot_labels = to_categorical(label_ids)

    return scaled_samples, one_hot_labels


def create_pngs_from_wavs(input_path, output_path):
    """Create one spectrogram PNG for every WAV file found directly in a folder.

    Parameters
    ----------
    input_path : str
        Folder that is scanned (non-recursively) for ``.wav`` files.
    output_path : str
        Folder that receives the PNG files; created, including missing
        parents, when it does not exist yet.

    Notes
    -----
    Each image is named ``<wav name without extension>-<input folder name>.png``.
    The extension check ignores case, so ``.WAV`` files are converted as well.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_path, exist_ok=True)

    # normpath drops a trailing separator so basename yields the folder name.
    folder_name = os.path.basename(os.path.normpath(input_path))

    for file in os.listdir(input_path):
        if not file.lower().endswith('.wav'):
            continue
        base_name = os.path.splitext(file)[0]
        output_file = os.path.join(output_path, f"{base_name}-{folder_name}.png")
        create_spectrogram(os.path.join(input_path, file), output_file)

def load_images_from_path(path, label):
    """Load every PNG file in a folder as an image array with a common label.

    Parameters
    ----------
    path : str
        Folder that is scanned (non-recursively) for ``.png`` files.
    label
        Value appended to the label list once per loaded image.

    Returns
    -------
    tuple of (list, list)
        ``(images, labels)`` with one entry per PNG file. Images are loaded
        at 224x224 and converted with ``image.img_to_array``.
    """
    images = []
    labels = []

    # Sort the listing: os.listdir order is arbitrary, and the sample order
    # feeds into the seeded train/validation split downstream.
    for file in sorted(os.listdir(path)):
        if file.endswith('.png'):
            # target_size is (height, width); channels are not part of it.
            picture = image.load_img(os.path.join(path, file), target_size=(224, 224))
            images.append(image.img_to_array(picture))
            labels.append(label)

    return images, labels


def show_images(images):
    """Show up to the first eight images side by side in a single row.

    Parameters
    ----------
    images : sequence of arrays
        Images to display. Each one is divided by 255 before plotting, so
        raw 0-255 pixel values are the intended input.

    Notes
    -----
    When fewer than eight images are given, the remaining panels stay empty
    instead of raising ``IndexError``.
    """
    fig, axes = plt.subplots(1, 8, figsize=(20, 20), subplot_kw={'xticks': [], 'yticks': []})

    # zip stops at the shorter of the two, so short inputs are handled.
    for ax, img in zip(axes.flat, images):
        ax.imshow(img / 255)




The next cell creates the spectrograms for the audio files of each class found in the specified directory.

In [7]:


# Sound categories to convert; each one has its own sub-folder under Train/.
classes = [
    "children_playing",
    "drilling",
    "street_music",
    "siren",
    "gun_shot",
    "car_horn",
    "air_conditioner",
    "engine_idling",
    "dog_bark",
    "jackhammer",
]

# Convert the WAV files of every class into PNG spectrograms, mirroring the
# class folder layout under Images/Spectrograms/.
for class_name in classes:
    input_path, output_path = f'Train/{class_name}', f'Images/Spectrograms/{class_name}'
    create_pngs_from_wavs(input_path, output_path)


In [7]:
# Collect every spectrogram image together with the index of its class.
x = []
y = []

for class_index, class_name in enumerate(classes):
    # Read from Images/Spectrograms/, the folder tree the generation cell
    # writes to, so the notebook works when run top to bottom.
    images, labels = load_images_from_path(f'Images/Spectrograms/{class_name}', class_index)
    # All images of the class are used.
    x.extend(images)
    y.extend(labels)

In [5]:
# Scale the images and one-hot encode the labels.
# NOTE: x and y are overwritten, so running this cell a second time would
# scale the already-scaled data again; re-run the loading cell first.
x, y = datapreprocessing(x, y)

# Hold out 30% of the samples for validation, stratified by label, with a
# fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
# Baseline CNN: four convolution / max-pooling stages followed by a dense
# classifier with one softmax output per class.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(1024, activation='relu'),
    Dense(10, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the baseline CNN and keep the per-epoch history in `hist`.
hist = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=10,
    epochs=10,
)

In [None]:
# ImageNet-pretrained MobileNetV2 without its classification head, used as a
# fixed feature extractor.
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# x_train / x_val were already divided by 255 in datapreprocessing, whereas
# preprocess_input expects raw 0-255 pixel values and maps them to [-1, 1].
# Undo the earlier scaling first; otherwise every input collapses to about -1.
# The multiplication also creates new arrays, so x_train / x_val stay intact.
x_train_norm = preprocess_input(np.asarray(x_train) * 255.0)
x_test_norm = preprocess_input(np.asarray(x_val) * 255.0)

# Run the frozen network once to obtain the features for the classifier head.
train_features = base_model.predict(x_train_norm)
test_features = base_model.predict(x_test_norm)

In [66]:
# Classifier head trained on the extracted MobileNetV2 features: flatten the
# feature maps, one hidden dense layer, then one softmax output per class.
model = Sequential([
    Flatten(input_shape=train_features.shape[1:]),
    Dense(1024, activation='relu'),
    Dense(10, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [67]:
# Train the classifier head on the extracted features. The labels are the
# one-hot arrays returned by train_test_split (y_train / y_val), which are
# row-aligned with train_features / test_features.
hist = model.fit(train_features, y_train, validation_data=(test_features, y_val), batch_size=10, epochs=10)

Epoch 1/10
[1m381/381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 415ms/step - accuracy: 0.6336 - loss: 10.6309 - val_accuracy: 0.7756 - val_loss: 0.7465
Epoch 2/10
[1m381/381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 417ms/step - accuracy: 0.9101 - loss: 0.2990 - val_accuracy: 0.8522 - val_loss: 0.5923
Epoch 3/10
[1m381/381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 418ms/step - accuracy: 0.9418 - loss: 0.1870 - val_accuracy: 0.8596 - val_loss: 0.6832
Epoch 4/10
[1m381/381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 426ms/step - accuracy: 0.9593 - loss: 0.1254 - val_accuracy: 0.8737 - val_loss: 0.6489
Epoch 5/10
[1m381/381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 428ms/step - accuracy: 0.9741 - loss: 0.0876 - val_accuracy: 0.9160 - val_loss: 0.4923
Epoch 6/10
[1m381/381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 426ms/step - accuracy: 0.9658 - loss: 0.1052 - val_accuracy: 0.8841 - val_loss: 0.6919
Epo