In [1]:
import librosa
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import img_to_array, load_img
from keras.applications import ResNet50
from keras.models import Model, load_model
from keras.layers import GlobalAveragePooling2D, Dense
from keras.optimizers import Adam
from keras.utils import to_categorical
from scipy import ndimage
from keras import layers

2024-05-06 16:19:28.685121: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def resize_spectrogram(spectrogram, target_size):
    if len(spectrogram.shape) == 1:  # If spectrogram has only one dimension
        # Reshape to (1, n) where n is the length of the spectrogram
        spectrogram = np.expand_dims(spectrogram, axis=0)
        # Compute zoom factors for one-dimensional spectrograms
        zoom_factors = (1, target_size[0] / spectrogram.shape[1])
    elif len(spectrogram.shape) == 2:  # If spectrogram has two dimensions
        # Compute zoom factors for two-dimensional spectrograms
        zoom_factors = (target_size[0] / spectrogram.shape[0], target_size[1] / spectrogram.shape[1])
    else:
        raise ValueError("Input spectrogram must have either one or two dimensions")
    # Resize using ndimage.zoom
    return ndimage.zoom(spectrogram, zoom_factors, order=1)

In [3]:
# Get data from GTZAN

# Get the paths for all the spectrograms
directory = 'dataset/images_original/'
spectro_imgs = []
genres = []

# Iterate through all folders, images to get all genres and all images into a list
for genre in os.listdir (directory):
    if genre != '.DS_Store':
        genre_dir = os.path.join(directory, genre)
        # Get files within this genre folder
        for file in os.listdir (genre_dir):
            if file.endswith('.png'):
                # Get image path
                img_path = os.path.join(genre_dir, file)
                # Get image of target_size
                img = load_img(img_path, target_size=(224, 224))
                # Add image to array with all data
                spectro_imgs.append(img)
                genres.append(genre)

# Conversion to numpy
X_imgs = np.array(spectro_imgs)
y_genres = np.array(genres)

# Do one hot encoding for genres
label_to_index = {genre: index for index, genre in enumerate(set(y_genres))}
y_indices = [label_to_index[genre] for genre in y_genres]

# One-hot encode the integer indices
num_classes = len(set(y_indices))
y_one_hot = to_categorical(y_indices, num_classes=num_classes)

# Split for train and test
X_train, X_test, y_train, y_test = train_test_split(X_imgs, y_one_hot, test_size=0.2, random_state=42)

In [4]:
audio_csv = 'urbansounds/UrbanSound8K.csv'
metadata = pd.read_csv (audio_csv)

labels = metadata['class'].tolist()

audio_folder = 'urbansounds/'
spectrograms = []

for fold in os.listdir (audio_folder):
    print (f'Current fold is {fold}')
    fold_path = os.path.join (audio_folder, fold)
    if os.path.isdir (fold_path):
        for audio_file in os.listdir (fold_path):
            if audio_file.endswith('.wav'):
                full_path = os.path.join (fold_path, audio_file)
                y, sr = librosa.load(full_path, sr=None)
                img = librosa.feature.melspectrogram(y=y, sr=sr)
                img = librosa.power_to_db(img, ref=np.max)
                img = resize_spectrogram(img, target_size=(224, 224))
                spectrograms.append(img)

spectro_imgs = np.array (spectrograms)
spectro_imgs = np.expand_dims (spectro_imgs, axis=-1)
spectro_imgs = np.repeat(spectro_imgs, 3, axis=-1)
labels = np.array (labels)

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
one_hot_labels = to_categorical(encoded_labels)

X_train, X_test, y_train, y_test = train_test_split(spectro_imgs, one_hot_labels, test_size=0.2, random_state=42)
print ("data split done")

Current fold is fold2
Current fold is fold5
Current fold is fold4
Current fold is fold3
Current fold is UrbanSound8K.csv
Current fold is .DS_Store
Current fold is fold8


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


Current fold is fold6
Current fold is fold1
Current fold is fold7
Current fold is fold9
Current fold is fold10
data split done


In [12]:
# Using 10 genres from gtzan
num_classes = 10

X_train_smaller, X_test_smaller, y_train_smaller, y_test_smaller = train_test_split (spectro_imgs, one_hot_labels, test_size=0.5, random_state=42)
# Get ResNet50. First load model, removing top layer to adapt the model to classify the spectrograms. 
resnet_model = ResNet50 (weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the weights of the pre-trained layers
for layer in resnet_model.layers:
    layer.trainable = False

# Pool pretrained layers
part_model = layers.GlobalAveragePooling2D ()(resnet_model.output)
part_model = layers.Dropout (0.2) (part_model)
# Add dense layer to the end of the model

# Use softmax activation since this is multiclass
predicted = Dense (num_classes, activation='softmax')(part_model)

model = Model (inputs=resnet_model.input, outputs=predicted)

model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_smaller, y_train_smaller, epochs=5, batch_size=100, validation_split=0.2, verbose=1)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [6]:
# Print summary of model:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['input_1[0][0]']             
                                                                                                  
 conv1_conv (Conv2D)         (None, 112, 112, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 conv1_bn (BatchNormalizati  (None, 112, 112, 64)         256       ['conv1_conv[0][0]']          
 on)                                                                                          

In [11]:
# # Evaluate the model on test data
# test_loss, test_accuracy = model.evaluate(X_test, y_test)
# print(f'Loss: {test_loss}')
# print(f'Accuracy: {test_accuracy}')

model.save('model_classification_2.keras')
