# **Imports**

In [None]:
!pip install scikeras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikeras
  Downloading scikeras-0.10.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.10.0


In [None]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install eli5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.2/216.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107747 sha256=a38643477a9a00eee1b9881984908e226c5a9d300d044ebe7772daeaff850d6f
  Stored in directory: /root/.cache/pip/wheels/7b/26/a5/8460416695a992a2966b41caa5338e5e7fcea98c9d032d055c
Successfully built eli5
Installing collected packages: eli5
Successfully installed eli5-0.13.0


In [None]:
# Data Manipulation and Analysis
import pandas as pd
import numpy as np

# Utilities
import warnings
from tqdm import tqdm
from google.colab import files, drive
import os
import librosa
import joblib
import typing

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, normalize
from sklearn.svm import SVC
from keras.layers import Input, Dense, GaussianNoise, Conv2D, MaxPooling2D, Dense, Flatten, LocallyConnected2D, BatchNormalization, Dropout
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasRegressor
from sklearn.multiclass import OneVsRestClassifier
from skimage.transform import resize
from keras.utils import to_categorical
import xgboost as xgb
from xgboost import XGBClassifier, XGBRFClassifier

# **Data Creation**

In [None]:
def load_gtzan_data(path: str) -> pd.DataFrame:
    """
    Load the GTZAN dataset from the given path and extract audio features for each music slice.

    Every '.wav' file under '<path>/genres_original/<genre>/' is loaded (first 29 seconds),
    cut into 10 equal slices, and summarised by the mean/variance of several librosa features.

    Note: the values of each row are emitted in exactly the order of the column names
    (mean and variance interleaved per feature). An earlier version of this function
    emitted all means before all variances, which mislabeled most columns; CSV files
    produced by that version do not match their headers.

    Parameters:
        path (str): The path to the directory containing the GTZAN dataset.

    Returns:
        pd.DataFrame: A Pandas dataframe containing the extracted features and labels for each slice,
                      sorted by filename and slice index.

    """
    # Different music genres in GTZAN
    genres = ['blues', 'classical', 'country', 'disco', 'hiphop',
              'jazz', 'metal', 'pop', 'reggae', 'rock']

    feature_cols = ['filename', 'slice', 'zcr_mean', 'zcr_var', 'rmse_mean',
                    'rmse_var', 'sc_mean', 'sc_var', 'sbw_mean', 'sbw_var',
                    'sro_mean', 'sro_var', 'tempo', 'harmony_mean',
                    'perc_mean', 'harmony_var', 'perc_var']

    for i in range(1, 21):
        feature_cols += [f'mfcc{i}_mean', f'mfcc{i}_var', f'dmfcc{i}_mean', f'dmfcc{i}_var']

    for i in range(1, 13):
        feature_cols += [f'cstft{i}_mean', f'cstft{i}_var']

    feature_cols += ['label']

    data = []

    sr = 22050
    total_samples = 29 * sr
    num_slices = 10
    samples_in_slice = int(total_samples / num_slices)

    # The frequency axis only depends on sr and n_fft, so compute it once
    freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)

    for genre in tqdm(genres):

        genre_dir = os.path.join(path, 'genres_original', genre)

        for filename in tqdm(os.listdir(genre_dir)):

            if not filename.endswith('.wav'):
                continue

            # Load audio file at the sample rate the slice length was computed for
            filepath = os.path.join(genre_dir, filename)
            audio, _ = librosa.load(filepath, sr=sr, duration=29)

            for s in range(num_slices):

                start_sample = samples_in_slice * s
                end_sample = start_sample + samples_in_slice
                slice_audio = audio[start_sample:end_sample]

                # Frame-level features
                zcr = librosa.feature.zero_crossing_rate(y=slice_audio)
                rmse = librosa.feature.rms(y=slice_audio)
                mag = np.abs(librosa.stft(slice_audio))

                sc = librosa.feature.spectral_centroid(S=mag, freq=freqs)
                sbw = librosa.feature.spectral_bandwidth(S=mag, freq=freqs, p=2)
                sro = librosa.feature.spectral_rolloff(S=mag, freq=freqs)

                mfcc = librosa.feature.mfcc(y=slice_audio, sr=sr, n_mfcc=20)
                dmfcc = librosa.feature.delta(mfcc)
                cstft = librosa.feature.chroma_stft(y=slice_audio, sr=sr, n_chroma=12)

                # Extract tempo; some librosa versions return a 1-element array, so reduce to a scalar
                tempo, _ = librosa.beat.beat_track(y=slice_audio, sr=sr)
                tempo = float(np.atleast_1d(tempo)[0])

                # Harmonic / percussive components of the mel spectrogram
                S = librosa.feature.melspectrogram(y=slice_audio, sr=sr)
                S_harmonic, S_percussive = librosa.effects.hpss(S)

                # Build the row in the same order as feature_cols
                row = [filename, s]

                # zcr_mean, zcr_var, rmse_mean, rmse_var, ...
                for feature in (zcr, rmse, sc, sbw, sro):
                    row += [np.mean(feature), np.var(feature)]

                # tempo, harmony_mean, perc_mean, harmony_var, perc_var
                row += [tempo, np.mean(S_harmonic), np.mean(S_percussive),
                        np.var(S_harmonic), np.var(S_percussive)]

                # mfcc{i}_mean, mfcc{i}_var, dmfcc{i}_mean, dmfcc{i}_var
                for coeff, delta_coeff in zip(mfcc, dmfcc):
                    row += [np.mean(coeff), np.var(coeff), np.mean(delta_coeff), np.var(delta_coeff)]

                # cstft{i}_mean, cstft{i}_var
                for chroma_band in cstft:
                    row += [np.mean(chroma_band), np.var(chroma_band)]

                row.append(genre)

                data.append(row)

    # Create dataframe
    df = pd.DataFrame(data, columns=feature_cols)
    df = df.sort_values(['filename', 'slice'], ascending=[True, True]).reset_index(drop=True)

    return df

In [None]:
# Mount Google Drive at /content/drive (Colab only) so files stored on Drive can be read by path.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Dataset root on the mounted Drive; load_gtzan_data reads '<data_path>/genres_original/<genre>/*.wav'.
data_path = '/content/drive/MyDrive/MIE424 Project/Data'

# Extract the per-slice features for every track (slow: every audio file is decoded and analysed).
gtzan_df = load_gtzan_data(data_path)

# Save the extracted feature dataframe as a CSV file and download it from the Colab VM
gtzan_df.to_csv('GTZAN.csv', index=False)
files.download('GTZAN.csv')

  0%|          | 0/10 [00:00<?, ?it/s]

  1%|          | 1/100 [00:16<27:04, 16.41s/it][A
  2%|▏         | 2/100 [00:20<14:29,  8.88s/it][A
  3%|▎         | 3/100 [00:23<10:13,  6.33s/it][A
  4%|▍         | 4/100 [00:26<07:50,  4.90s/it][A
  5%|▌         | 5/100 [00:28<06:19,  4.00s/it][A
  6%|▌         | 6/100 [00:30<05:25,  3.47s/it][A
  7%|▋         | 7/100 [00:33<04:51,  3.13s/it][A
  8%|▊         | 8/100 [00:35<04:34,  2.98s/it][A
  9%|▉         | 9/100 [00:38<04:28,  2.95s/it][A
 10%|█         | 10/100 [00:41<04:10,  2.79s/it][A
 11%|█         | 11/100 [00:43<03:58,  2.68s/it][A
 12%|█▏        | 12/100 [00:46<03:48,  2.59s/it][A
 13%|█▎        | 13/100 [00:48<03:40,  2.54s/it][A
 14%|█▍        | 14/100 [00:51<03:48,  2.65s/it][A
 15%|█▌        | 15/100 [00:54<03:47,  2.67s/it][A
 16%|█▌        | 16/100 [00:56<03:37,  2.59s/it][A
 17%|█▋        | 17/100 [00:58<03:30,  2.54s/it][A
 18%|█▊        | 18/100 [01:01<03:24,  2.50s/it][A
 19%|█▉        | 19/100 [01:03<03

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load the precomputed GTZAN feature table from a hosted CSV (avoids re-running the feature extraction)
gtzan_df = pd.read_csv('https://docs.google.com/uc?export=download&id=10KjnUR4jAwsBYOi_I9OJkzUUOfv_RpUY')

In [None]:
# Preview the first rows to sanity-check the loaded columns
gtzan_df.head()

Unnamed: 0,filename,slice,zcr_mean,zcr_var,rmse_mean,rmse_var,sc_mean,sc_var,sbw_mean,sbw_var,...,cstft8_var,cstft9_mean,cstft9_var,cstft10_mean,cstft10_var,cstft11_mean,cstft11_var,cstft12_mean,cstft12_var,label
0,blues.00000.wav,0,0.078156,0.132945,1709.638977,1933.232918,3576.148242,0.000212,0.003499,56966.914869,...,0.043161,0.090037,0.070312,0.064615,0.070386,0.054127,0.094938,0.108041,0.082374,blues
1,blues.00000.wav,1,0.089531,0.11306,1862.789479,2045.413635,3966.588281,0.001306,0.001535,191087.170258,...,0.04227,0.072976,0.098411,0.100363,0.106239,0.067768,0.083601,0.080848,0.082238,blues
2,blues.00000.wav,2,0.074934,0.124599,1821.64144,2088.265226,4028.603906,0.000526,0.004297,111485.826339,...,0.043978,0.025749,0.031514,0.043059,0.114197,0.052934,0.070832,0.129591,0.068633,blues
3,blues.00000.wav,3,0.069262,0.127767,1690.881466,2001.37306,3705.433594,0.000298,0.00262,109431.313114,...,0.053166,0.09899,0.092692,0.066647,0.112924,0.074798,0.071905,0.09662,0.096092,blues
4,blues.00000.wav,4,0.069543,0.146586,1648.480571,1990.784309,3534.890625,0.000246,0.001891,74168.14244,...,0.05083,0.084718,0.096228,0.055098,0.144812,0.0697,0.076376,0.119357,0.078988,blues


# **Data Processing**

In [None]:
def process_data(df: pd.DataFrame) -> tuple:
    """
    Turn the GTZAN feature table into scaled train/test arrays with integer-encoded labels.

    Steps: drop the non-feature identifier columns, separate the last column as the target,
    encode the genre names as integers with LabelEncoder (not one-hot), split 80/20 with a
    fixed seed, and scale the features with scale_data (fitted on the training split only).

    Parameters:
        df (pd.DataFrame): Feature table whose last column is the genre label. The identifier
                           columns 'filename' and 'slice' (and a serialized index column
                           'Unnamed: 0', if the CSV carried one) are dropped when present.

    Returns:
        tuple: (X_train, X_test, y_train, y_test) where the X values are scaled numpy arrays
               and the y values are integer-encoded labels.
    """

    # Drop identifier columns. 'Unnamed: 0' appears when a CSV was written with its index;
    # since rows are ordered by filename it would correlate with the genre, so it must not
    # be used as a feature.
    non_feature_cols = [col for col in ('Unnamed: 0', 'filename', 'slice') if col in df.columns]
    df = df.drop(columns=non_feature_cols)

    # Separate the features from the target (last column)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Encode the genre names as integer class ids
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)

    # Split the data into training and testing sets
    # NOTE(review): slices of the same track can land in both splits; consider a grouped split.
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=1)

    # Scale the data (scaler is fitted on the training split only)
    X_train, X_test = scale_data(X_train, X_test)

    # Convert the DataFrames to numpy arrays
    X_train = X_train.to_numpy()
    X_test = X_test.to_numpy()

    return X_train, X_test, y_train, y_test

In [None]:
def scale_data(X_train: pd.DataFrame, X_test: pd.DataFrame) -> tuple:
    """
    Min-max scale the training and test features without modifying the inputs.

    A MinMaxScaler is fitted on the training frame only and then applied to both frames,
    so the test data never influences the scaling parameters. The results are written
    into copies of the inputs, which keeps their index and column labels.

    Parameters:
        X_train (pd.DataFrame): Training features used to fit the scaler.
        X_test (pd.DataFrame): Test features transformed with the training-fitted scaler.

    Returns:
        tuple: (X_train_scaled, X_test_scaled) as new DataFrames.
    """
    # Learn the per-column min/max from the training data only
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    scaled_frames = []
    for frame in (X_train, X_test):
        # Work on a copy so the caller's frame is left untouched
        scaled = frame.copy()
        scaled.loc[:] = scaler.transform(frame)
        scaled_frames.append(scaled)

    return scaled_frames[0], scaled_frames[1]

In [None]:
# Load the precomputed GTZAN feature table from the hosted CSV
gtzan_df = pd.read_csv('https://docs.google.com/uc?export=download&id=10KjnUR4jAwsBYOi_I9OJkzUUOfv_RpUY')

# Split into scaled train/test feature arrays and integer-encoded labels
X_train, X_test, y_train, y_test = process_data(gtzan_df)

In [None]:
# Display the scaled training array as a quick sanity check
X_train

array([[0.23936391, 0.10087654, 0.20650598, ..., 0.26586969, 0.06016398,
        0.58088135],
       [0.40780428, 0.34524603, 0.43181324, ..., 0.35797828, 0.39141417,
        0.43205243],
       [0.05952294, 0.32897971, 0.09161733, ..., 0.22929483, 0.23754764,
        0.12515692],
       ...,
       [0.16446483, 0.31402682, 0.25828213, ..., 0.16709446, 0.27540786,
        0.26091413],
       [0.16201835, 0.1198582 , 0.20848446, ..., 0.45900623, 0.25590396,
        0.2683524 ],
       [0.12696024, 0.20498714, 0.16516185, ..., 0.41442376, 0.29318487,
        0.25316619]])

# **Music Genre Classification Using Extreme Gradient Boosting (XGBoost)**

In [None]:
def evaluate_model(model: object, X_train: pd.DataFrame, y_train: pd.Series) -> tuple:
    """
    Tune a model with an exhaustive grid search and report the best configuration.

    The hyperparameter grid is looked up from the model's type via get_grid_parameters.
    Candidates are scored by accuracy using 5-fold stratified cross-validation
    (shuffled, fixed seed) and evaluated in parallel on all available cores.

    Args:
        model (object): An instance of the model to be tuned.
        X_train (pd.DataFrame): Feature data for the training set.
        y_train (pd.Series): Target labels for the training set.

    Returns:
        tuple: (best estimator refitted by the search, best hyperparameters,
                mean cross-validated accuracy of the best hyperparameters).

    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    search = GridSearchCV(
        model,
        cv=folds,
        param_grid=get_grid_parameters(model),
        scoring='accuracy',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_, search.best_params_, search.best_score_

In [None]:
def get_grid_parameters(model: object) -> dict:
    """
    Look up the hyperparameter grid associated with a model's class.

    The lookup key is the lower-cased class name of the given instance, so only the
    type of the model matters, not its current settings.

    Parameters:
        model (object): An instance of the model for which to retrieve hyperparameters.

    Returns:
        dict: Mapping of parameter name to the list of values to try in a grid search.

    Raises:
        ValueError: If no grid is defined for the model's class.

    """
    model_name = type(model).__name__.lower()

    # Grids are built per call so callers can safely mutate the returned dict
    grids = {
        'xgbclassifier': {
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5, 7],
            'n_estimators': [50, 100, 200],
            'subsample': [0.5, 0.8],
            'colsample_bytree': [0.5, 0.8],
        },
        # Parameters are prefixed with 'estimator__' to reach the wrapped estimator
        'onevsrestclassifier': {
            'estimator__C': [0.1, 1, 10, 100],
            'estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'estimator__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10],
        },
    }

    if model_name not in grids:
        raise ValueError(f'Unsupported model type: {model_name}')

    return grids[model_name]

# **Music Genre Classification Using Stacked Auto-Encoders (SAE)**

In [None]:
def create_denoising_autoencoder(input_shape: typing.Union[int, tuple], layer_size: int, noise_std_dev: float = 0.1) -> tuple:
    """
    Create a single-hidden-layer denoising autoencoder and its encoder half.

    Gaussian noise is added to the input, the noisy input is encoded by one ReLU Dense
    layer and decoded back to the input width by one sigmoid Dense layer. Both returned
    models share the same input and encoding layers.

    Parameters:
        input_shape (int | tuple): Number of input features, given either as an int
            (how the callers in this notebook pass it) or as a 1-element shape tuple.
        layer_size (int): The number of neurons in the encoding layer.
        noise_std_dev (float): The standard deviation of the noise to add to the input data.

    Returns:
        tuple: (autoencoder, encoder). The encoder maps the clean input layer to the encoding.

    Raises:
        ValueError: If input_shape is a sequence that does not have exactly one element.

    """
    # Accept both a feature count and a 1-D shape tuple such as (n_features,)
    if isinstance(input_shape, (tuple, list)):
        if len(input_shape) != 1:
            raise ValueError(f'input_shape must describe 1-D feature vectors, got {input_shape}')
        n_features = int(input_shape[0])
    else:
        n_features = int(input_shape)

    input_layer = Input(shape=(n_features,))
    noise = GaussianNoise(noise_std_dev)(input_layer)

    encoded = Dense(layer_size, activation='relu')(noise)
    # NOTE(review): the sigmoid output is bounded to [0, 1]; inputs outside that range
    # (e.g. ReLU activations from a previous layer) cannot be reconstructed exactly.
    decoded = Dense(n_features, activation='sigmoid')(encoded)

    autoencoder = Model(input_layer, decoded)
    encoder = Model(input_layer, encoded)

    return autoencoder, encoder

In [None]:
def train_stacked_denoising_autoencoder(X_train: np.ndarray, X_test: np.ndarray, num_layers: int, layer_sizes: list,
                                        learning_rate: float = 0.0001, epochs: int = 50, batch_size: int = 16) -> list:
    """
    Greedily train a stack of denoising autoencoders, one layer at a time.

    Each layer's autoencoder is trained to reconstruct its own input (MSE loss, Adam);
    the trained encoder then transforms both the training and the validation data, and
    the encoded data becomes the input of the next layer.

    Parameters:
        X_train (np.ndarray): The training data to use.
        X_test (np.ndarray): Data used only as validation data while fitting each autoencoder.
        num_layers (int): The number of layers in the stacked autoencoder.
        layer_sizes (list): Number of neurons per layer; must contain at least num_layers entries.
        learning_rate (float): The learning rate to use for training.
        epochs (int): The number of epochs to train each autoencoder.
        batch_size (int): The batch size to use for training.

    Returns:
        list: A list of encoder models learned at each layer of the stacked autoencoder.

    Raises:
        ValueError: If layer_sizes has fewer than num_layers entries.

    """
    # Fail before any training starts instead of after earlier layers have already been fitted
    if num_layers > len(layer_sizes):
        raise ValueError(
            f'num_layers ({num_layers}) exceeds the number of layer sizes provided ({len(layer_sizes)})'
        )

    encoders = []

    input_data = X_train
    input_shape = X_train.shape[1]

    validation_data = X_test

    # Train a denoising autoencoder for each layer of the stacked autoencoder
    for i in range(num_layers):

        # Create the denoising autoencoder
        autoencoder, encoder = create_denoising_autoencoder(input_shape, layer_sizes[i])

        # Compile the autoencoder model
        autoencoder.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')

        # Train the autoencoder to reconstruct its own input
        autoencoder.fit(input_data, input_data, epochs=epochs, batch_size=batch_size,
                        validation_data=(validation_data, validation_data))

        # Only the encoder half is needed downstream
        encoders.append(encoder)

        # The encoded data is the input (and validation data) of the next layer
        input_data = encoder.predict(input_data)
        validation_data = encoder.predict(validation_data)
        input_shape = layer_sizes[i]

    # Return the list of encoder models learned at each layer
    return encoders

In [None]:
def create_svc_classifier(kernel: str, gamma: float, C: float) -> OneVsRestClassifier:
    """
    Build an unfitted one-vs-rest classifier around a support vector machine.

    Parameters:
        kernel (str): The kernel to use for the support vector machine.
        gamma (float): The kernel coefficient for 'rbf', 'poly', and 'sigmoid'.
        C (float): The regularization parameter.

    Returns:
        OneVsRestClassifier: The configured classifier; it still has to be fitted by the caller.

    """
    # Wrap the SVC so one binary classifier is trained per class
    return OneVsRestClassifier(SVC(kernel=kernel, gamma=gamma, C=C))

In [None]:
def create_sdae_svc(X_train, X_test, y_train, y_test, num_layers, layer_sizes, kernel, gamma, C, learning_rate=0.001, epochs=50, batch_size=16):
    """
    Train a stacked denoising autoencoder and pair it with an unfitted SVC classifier.

    Parameters:
        X_train: Training features used to train the autoencoder stack.
        X_test: Features used as validation data while training the stack.
        y_train: Not used by this function.
        y_test: Not used by this function.
        num_layers: Number of autoencoder layers to train.
        layer_sizes: Encoding size of each layer.
        kernel, gamma, C: Settings forwarded to create_svc_classifier.
        learning_rate, epochs, batch_size: Training settings forwarded to
            train_stacked_denoising_autoencoder.

    Returns:
        tuple: (stacked_encoder, classifier) where stacked_encoder is a Sequential model
               chaining the trained encoders and classifier is not yet fitted.
    """
    # Train the layers one by one, then chain the resulting encoders into a single model
    trained_encoders = train_stacked_denoising_autoencoder(
        X_train, X_test, num_layers, layer_sizes, learning_rate, epochs, batch_size
    )

    return Sequential(trained_encoders), create_svc_classifier(kernel, gamma, C)

# **Music Genre Classification Using Convolutional Neural Networks (CNN)**

In [None]:
def create_spectrogram(audio: np.ndarray, sr: int) -> np.ndarray:
    """
    Create a normalized mel spectrogram (in decibels) of an audio signal.

    Parameters:
        audio (np.ndarray): The audio data as a 1D NumPy array.
        sr (int): The sample rate of the audio file.

    Returns:
        np.ndarray: The normalized dB-scaled mel spectrogram as a 2D NumPy array.

    """
    # Mel power spectrogram converted to decibels relative to a power of 1.0
    spectrogram_db = librosa.power_to_db(
        S=librosa.feature.melspectrogram(y=audio, sr=sr),
        ref=1.0,
    )

    # NOTE(review): `normalize` is sklearn.preprocessing.normalize, which by default scales
    # each row to unit L2 norm; that is not a min-max scaling to [0, 1] — confirm intent.
    return normalize(spectrogram_db)

In [None]:
def augment_samples(audio: np.ndarray, sr: int, n_slices: int) -> list:
    """
    Split an audio signal into slices and produce augmented variants of every slice.

    For each slice the output contains, in order: the original slice, two pitch-shifted
    versions (+3 and +5 steps), two time-stretched versions (rates 0.5 and 1.5) and one
    version with additive white noise (scaled by 0.005) — six samples per slice.

    Parameters:
        audio (np.ndarray): The audio data as a 1D NumPy array.
        sr (int): The sample rate of the audio file.
        n_slices (int): The number of slices to split the audio data into.

    Returns:
        list: A list of augmented audio samples, each represented as a 1D NumPy array.
              Time-stretched samples differ in length from the original slice.

    """
    augmented = []

    for chunk in np.array_split(audio, n_slices):
        # Start with the untouched slice
        variants = [chunk]

        # Pitch-shifted copies
        variants.extend(
            librosa.effects.pitch_shift(y=chunk, sr=sr, n_steps=steps) for steps in (3, 5)
        )

        # Time-stretched copies (slower and faster)
        variants.extend(
            librosa.effects.time_stretch(y=chunk, rate=stretch) for stretch in (0.5, 1.5)
        )

        # Copy with low-amplitude white noise added
        noise = np.random.randn(len(chunk))
        variants.append(chunk + 0.005 * noise)

        augmented += variants

    return augmented

In [None]:
def get_cnn_data(path: str) -> typing.Tuple[typing.List[np.ndarray], typing.List[int]]:
    """
    Load and preprocess the data for a CNN model.

    Every '.wav' file under '<path>/genres_original/<genre>/' is loaded (first 29 seconds),
    split into 3 slices and augmented with augment_samples. Each augmented sample is cut or
    zero-padded to the length of one slice and converted to a spectrogram with a trailing
    channel axis.

    Parameters:
        path (str): The path to the GTZAN dataset.

    Returns:
        Tuple[List[np.ndarray], List[int]]: A tuple containing a list of spectrograms and a list of
        corresponding labels (the index of the genre in the genre list).

    """
    # Define the music genres in GTZAN
    genres = ['blues', 'classical', 'country', 'disco', 'hiphop',
              'jazz', 'metal', 'pop', 'reggae', 'rock']

    # Define the number of audio slices
    n_slices = 3

    spectrograms = []
    labels = []

    for genre_index, genre in enumerate(tqdm(genres)):

        genre_dir = os.path.join(path, 'genres_original', genre)

        for filename in tqdm(os.listdir(genre_dir)):

            if not filename.endswith('.wav'):
                continue

            filepath = os.path.join(genre_dir, filename)
            audio, sr = librosa.load(filepath, duration=29)

            # Target length of every sample, so all spectrograms of a track share one shape
            input_length = len(audio) // n_slices

            for aug_audio in augment_samples(audio, sr, n_slices):

                # Time-stretched samples are longer/shorter than a slice: cut or zero-pad
                if len(aug_audio) > input_length:
                    aug_audio = aug_audio[:input_length]
                else:
                    aug_audio = np.pad(aug_audio, (0, max(0, input_length - len(aug_audio))))

                spectrogram = create_spectrogram(aug_audio, sr)
                # Add the channel axis expected by Conv2D
                spectrogram = np.expand_dims(spectrogram, axis=-1)
                spectrograms.append(spectrogram)
                labels.append(genre_index)

    return spectrograms, labels

In [None]:
def process_cnn_data(spectograms: typing.List[np.ndarray], labels: typing.List[int]) -> typing.Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Process spectrograms and their corresponding labels for use in a CNN.

    The spectrograms are stacked into a single array (they must therefore share one shape),
    the labels are re-encoded to consecutive integers and one-hot encoded, and the data is
    split 80/20 into training and testing sets with a fixed seed.

    Parameters:
        spectograms (List[np.ndarray]): A list of equally shaped spectrogram arrays.
        labels (List[int]): A list of labels corresponding to each spectrogram.

    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: (X_train, X_test, y_train, y_test)
        where the y arrays are one-hot encoded.

    """
    X = np.array(spectograms)
    y = np.array(labels)

    # Map labels to 0..n_classes-1, then one-hot encode them for categorical cross-entropy
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)
    y = to_categorical(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    return X_train, X_test, y_train, y_test

In [None]:
def create_cnn(input_shape, num_classes):
    """
    Build and compile the baseline CNN genre classifier.

    Architecture: three blocks of Conv2D (64 filters, 5x5, ReLU) followed by 3x3 max
    pooling with stride 2, then a LocallyConnected2D layer (32 filters, 3x3, ReLU),
    a Flatten layer and a softmax Dense output with num_classes units.

    Parameters:
        input_shape: Shape of one input sample, passed to the first Conv2D layer.
        num_classes: Number of output classes.

    Returns:
        The compiled Sequential model (categorical cross-entropy, Adam with lr 0.001,
        accuracy metric).
    """
    cnn = Sequential()

    stack = [
        # Block 1 (only the first layer declares the input shape)
        Conv2D(64, kernel_size=(5,5), activation='relu', input_shape=input_shape),
        MaxPooling2D(pool_size=(3,3), strides=(2,2)),
        # Block 2
        Conv2D(64, kernel_size=(5,5), activation='relu', padding='same'),
        MaxPooling2D(pool_size=(3,3), strides=(2,2)),
        # Block 3
        Conv2D(64, kernel_size=(5,5), activation='relu', padding='same'),
        MaxPooling2D(pool_size=(3,3), strides=(2,2)),
        # Unshared-weight convolution before the classifier head
        LocallyConnected2D(32, kernel_size=(3,3), activation='relu'),
        Flatten(),
        Dense(num_classes, activation='softmax'),
    ]
    for layer in stack:
        cnn.add(layer)

    cnn.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )

    return cnn

In [None]:
def create_cnn_with_bn_dropout(input_shape, num_classes):
    """
    Build and compile the CNN genre classifier with batch normalization and dropout.

    Architecture: three blocks of Conv2D (64 filters, 5x5, ReLU) -> BatchNormalization ->
    3x3 max pooling with stride 2 -> Dropout(0.3), then LocallyConnected2D (32 filters,
    3x3, ReLU) -> BatchNormalization, a Flatten layer and a softmax Dense output with
    num_classes units.

    Parameters:
        input_shape: Shape of one input sample, passed to the first Conv2D layer.
        num_classes: Number of output classes.

    Returns:
        The compiled Sequential model (categorical cross-entropy, Adam with lr 0.001,
        accuracy metric).
    """
    cnn = Sequential()

    stack = [
        # Block 1 (only the first layer declares the input shape)
        Conv2D(64, kernel_size=(5,5), activation='relu', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling2D(pool_size=(3,3), strides=(2,2)),
        Dropout(0.3),
        # Block 2
        Conv2D(64, kernel_size=(5,5), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(3,3), strides=(2,2)),
        Dropout(0.3),
        # Block 3
        Conv2D(64, kernel_size=(5,5), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(3,3), strides=(2,2)),
        Dropout(0.3),
        # Unshared-weight convolution before the classifier head
        LocallyConnected2D(32, kernel_size=(3,3), activation='relu'),
        BatchNormalization(),
        Flatten(),
        Dense(num_classes, activation='softmax'),
    ]
    for layer in stack:
        cnn.add(layer)

    cnn.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

    return cnn

# **Experiments**

In [None]:
# Genre names, in the order used for axis labels in the experiment plots
GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop',
          'jazz', 'metal', 'pop', 'reggae', 'rock']

# Precomputed GTZAN feature table shared by the experiments below
GTZAN = pd.read_csv('https://docs.google.com/uc?export=download&id=10KjnUR4jAwsBYOi_I9OJkzUUOfv_RpUY')

# Light and dark colormaps built from the same base colour
CMAP_LIGHT = sns.light_palette("#98D2AB", as_cmap=True)
# NOTE(review): the chained assignment also binds a global named `cmap` to the dark colormap — confirm it is needed
CMAP_DARK = cmap = sns.dark_palette("#98D2AB", as_cmap=True)

In [None]:
from keras import backend as K
import tensorflow as tf


def get_recall(y_true, y_pred):
    """
    Recall computed with Keras backend ops: true positives / actual positives.

    NOTE(review): the clip-to-[0, 1] and round arithmetic only yields counts when both
    inputs are 0/1 indicators (or probabilities) — confirm callers pass such values.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # epsilon guards against division by zero when there are no positives
    return hits / (actual_positives + K.epsilon())

def get_precision(y_true, y_pred):
    """
    Precision computed with Keras backend ops: true positives / predicted positives.

    The predictions are cast to float64 before being counted.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(tf.cast(y_pred, tf.float64), 0, 1)))
    # epsilon guards against division by zero when nothing is predicted positive
    return hits / (flagged + K.epsilon())

def get_f1(y_true, y_pred):
    """F1 score: harmonic mean of get_precision and get_recall (epsilon-stabilised)."""
    p = get_precision(y_true, y_pred)
    r = get_recall(y_true, y_pred)
    return 2*((p*r)/(p+r+K.epsilon()))

## *Extreme Gradient Boosting*

In [None]:
# Acquire training, validation, and testing sets
# process_data is not defined in this section; it supplies the train / held-out split.
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = process_data(GTZAN)
# Halve the held-out part (fixed seed) into validation and test sets.
# Note: X_test_xgb / y_test_xgb are rebound to the smaller test half here.
X_val_xgb, X_test_xgb, y_val_xgb, y_test_xgb = train_test_split(X_test_xgb, y_test_xgb, test_size=0.5, random_state=1)

In [None]:
# Define model
model = XGBClassifier(objective='multi:softmax')
# evaluate_model is not defined in this section; presumably it runs the
# hyper-parameter search and returns the best estimator, its parameters and
# its accuracy -- TODO confirm against its definition.
best_xgb, best_xgb_parameters, best_xgb_accuracy = evaluate_model(model, X_train_xgb, y_train_xgb)

# Persist the results and download the file through the Colab `files` helper
# so a later session can reload them instead of repeating the search.
joblib.dump([best_xgb, best_xgb_parameters, best_xgb_accuracy], "best_xgb.joblib")
files.download("best_xgb.joblib")

In [None]:
# Reload the saved results; expects best_xgb.joblib to be present under /content.
[best_xgb, best_xgb_parameters, best_xgb_accuracy] = joblib.load('/content/best_xgb.joblib')

In [None]:
# Build a new classifier from the selected hyper-parameters; this rebinds
# best_xgb, replacing the object obtained from the search / joblib file.
best_xgb = XGBClassifier(objective='multi:softmax', **best_xgb_parameters)

In [None]:
# Train model with full training data
# Both the training and validation sets are passed as eval_set so that
# per-iteration 'mlogloss' and 'merror' are recorded for each of them.
# NOTE(review): newer XGBoost releases deprecate passing eval_metric to
# fit() in favour of the constructor / set_params -- confirm against the
# installed version.
best_xgb_history = best_xgb.fit(X=X_train_xgb, 
                                y=y_train_xgb, 
                                eval_set=[(X_train_xgb, y_train_xgb), (X_val_xgb, y_val_xgb)],
                                eval_metric=['mlogloss', 'merror'], 
                                verbose=True)
# Predictions on the held-out test half
y_pred_xgb = best_xgb.predict(X_test_xgb)

In [None]:
# Evaluate classification performance
# sklearn is already a dependency of this notebook; these three scorers are
# imported here because the top import cell does not bring them in.
from sklearn.metrics import f1_score, precision_score, recall_score

# y_test_xgb / y_pred_xgb are passed straight to confusion_matrix below, i.e.
# they hold class ids rather than 0/1 indicator rows. The Keras-style
# get_recall / get_precision / get_f1 helpers clip and round the element-wise
# product of their inputs, which is only meaningful for indicator arrays, so
# multiclass scorers are used here. Macro averaging weights every genre equally.
xbg_accuracy = accuracy_score(y_test_xgb, y_pred_xgb)
xbg_recall = recall_score(y_test_xgb, y_pred_xgb, average='macro')
xbg_precision = precision_score(y_test_xgb, y_pred_xgb, average='macro')
xbg_f1 = f1_score(y_test_xgb, y_pred_xgb, average='macro')

In [None]:
# Report the XGBoost test metrics as percentages, one per line
print(
    f'XGBoost Accuracy: {xbg_accuracy * 100:.2f}%',
    f'XGBoost Recall: {xbg_recall * 100:.2f}%',
    f'XGBoost Precision: {xbg_precision * 100:.2f}%',
    f'XGBoost F1 Score: {xbg_f1 * 100:.2f}%',
    sep='\n',
)

In [None]:
# Plot confusion matrix
confusion_matrix_xgb = confusion_matrix(y_test_xgb, y_pred_xgb)
plt.figure(figsize = (16, 9))
# Tick labels come from the GENRES constant defined in this notebook (the
# lowercase name `genres` is not defined in the Experiments section).
sns.heatmap(confusion_matrix_xgb, cmap=CMAP_LIGHT, annot=True, xticklabels=GENRES, yticklabels=GENRES)
# sklearn's confusion_matrix puts true labels on rows, predictions on columns
plt.xlabel('Predicted genre')
plt.ylabel('True genre')
plt.show()

In [None]:
# Plot feature importance
importances = best_xgb.feature_importances_
# Feature columns: everything except the first two and the last column of GTZAN
# (assumes this matches the columns process_data feeds the model -- TODO confirm)
feature_names = list(GTZAN.columns[2:-1])

# Indices ordered from most to least important; only the top 20 are shown
sorted_idx = np.argsort(importances)[::-1]
sorted_importances = importances[sorted_idx[:20]]
sorted_feature_names = np.array(feature_names)[sorted_idx[:20]]

# One colour per bar, spread evenly across the dark colormap
n_colors = sorted_feature_names.shape[0]
colors = CMAP_DARK(np.linspace(0, 1, n_colors))

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(
    x=sorted_importances,
    y=sorted_feature_names,
    palette=colors,
    ax=ax,
)

ax.set_title("Feature Importance", fontsize=18)
ax.set_ylabel("Features", fontsize=16)
ax.set_xlabel("Importance", fontsize=16)
ax.tick_params(labelsize=14)
plt.show()

In [None]:
# Get the evaluation results
# 'validation_0' / 'validation_1' follow the order of eval_set given to fit():
# training set first, validation set second.
eval_results = best_xgb.evals_result()

# Log loss
train_logloss = eval_results['validation_0']['mlogloss']
val_logloss = eval_results['validation_1']['mlogloss']

# Accuracy (1 - merror)
train_accuracy = list(map(lambda error: 1 - error, eval_results['validation_0']['merror']))
val_accuracy = list(map(lambda error: 1 - error, eval_results['validation_1']['merror']))

# One x position per recorded iteration, starting at 1
epochs = range(1, len(train_logloss) + 1)

# Plot log loss
plt.figure(figsize=(10, 8))
plt.plot(epochs, train_logloss, label='Training Log Loss')
plt.plot(epochs, val_logloss, label='Validation Log Loss')
plt.title('Log Loss', fontsize=18)
plt.xlabel('Epochs', fontsize=16)
plt.ylabel('Log Loss', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.legend(fontsize=14)
plt.grid(True)
plt.show()

## *Convolutional Neural Network*

In [None]:
# Data folder on Google Drive (path under the Colab /content/drive mount)
data_path = '/content/drive/MyDrive/MIE424 Project/Data'

# get_cnn_data is not defined in this section; it returns the spectrograms
# and their labels for the given folder.
spectrograms, labels = get_cnn_data(data_path)

# Cache the extracted data and download it through the Colab `files` helper,
# so the extraction does not have to be repeated in a later session.
joblib.dump([spectrograms, labels], "cnn_raw.joblib")
files.download("cnn_raw.joblib")

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:02<04:38,  2.82s/it][A
  2%|▏         | 2/100 [00:06<05:18,  3.25s/it][A
  3%|▎         | 3/100 [00:09<05:05,  3.15s/it][A
  4%|▍         | 4/100 [00:11<04:34,  2.86s/it][A
  5%|▌         | 5/100 [00:14<04:16,  2.70s/it][A
  6%|▌         | 6/100 [00:16<04:04,  2.60s/it][A
  7%|▋         | 7/100 [00:19<04:04,  2.63s/it][A
  8%|▊         | 8/100 [00:21<03:58,  2.59s/it][A
  9%|▉         | 9/100 [00:24<03:50,  2.53s/it][A
 10%|█         | 10/100 [00:26<03:44,  2.49s/it][A
 11%|█         | 11/100 [00:29<03:39,  2.47s/it][A
 12%|█▏        | 12/100 [00:31<03:41,  2.52s/it][A
 13%|█▎        | 13/100 [00:34<03:39,  2.52s/it][A
 14%|█▍        | 14/100 [00:36<03:34,  2.50s/it][A
 15%|█▌        | 15/100 [00:39<03:30,  2.48s/it][A
 16%|█▌        | 16/100 [00:41<03:26,  2.45s/it][A
 17%|█▋        | 17/100 [00:44<03:26,  2.48s/it][A
 18%|█▊        | 18/100 [00:46<03:32,  2.59

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the cached spectrograms and labels; expects cnn_raw.joblib under /content.
[spectrograms, labels] = joblib.load('/content/cnn_raw.joblib')

In [None]:
# process_cnn_data is not defined in this section; it supplies the train / held-out split.
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = process_cnn_data(spectrograms, labels)
# Halve the held-out part (fixed seed) into validation and test sets.
# Note: X_test_cnn / y_test_cnn are rebound to the smaller test half here.
X_val_cnn, X_test_cnn, y_val_cnn, y_test_cnn = train_test_split(X_test_cnn, y_test_cnn, test_size=0.5, random_state=1)

In [None]:
# Per-sample input shape, taken from the CNN training set (axis 0 is the
# sample axis). Uses X_train_cnn: the bare name X_train is not produced by
# the CNN split in this section.
input_shape = (X_train_cnn.shape[1], X_train_cnn.shape[2], X_train_cnn.shape[3])

cnn = create_cnn(input_shape=input_shape, num_classes=len(GENRES))

In [None]:
# Print the layer-by-layer summary of the baseline CNN
cnn.summary()

In [None]:
# Train the baseline CNN on the CNN training split (X_train_cnn / y_train_cnn,
# matching the *_cnn validation and test sets used below) and keep the
# returned history for the loss curves.
cnn_history = cnn.fit(x=X_train_cnn,
                      y=y_train_cnn,
                      validation_data=(X_val_cnn, y_val_cnn),
                      epochs=100,
                      verbose=True)

# Per-class scores for the held-out test half
y_pred_cnn = cnn.predict(X_test_cnn)

In [None]:
# Evaluate performance
# sklearn is already a dependency of this notebook; these three scorers are
# imported here because the top import cell does not bring them in.
from sklearn.metrics import f1_score, precision_score, recall_score

# y_test_cnn is one-hot and y_pred_cnn holds per-class scores (both are
# argmax-ed along axis 1 for the confusion matrix in this notebook).
# accuracy_score rejects that mix of target types, so reduce each row to a
# class id first and score the hard predictions. Macro averaging weights
# every genre equally.
y_true_cnn_ids = np.argmax(y_test_cnn, axis=1)
y_pred_cnn_ids = np.argmax(y_pred_cnn, axis=1)

cnn_accuracy_1 = accuracy_score(y_true_cnn_ids, y_pred_cnn_ids)
cnn_recall_1 = recall_score(y_true_cnn_ids, y_pred_cnn_ids, average='macro')
cnn_precision_1 = precision_score(y_true_cnn_ids, y_pred_cnn_ids, average='macro')
cnn_f1_1 = f1_score(y_true_cnn_ids, y_pred_cnn_ids, average='macro')

In [None]:
# Report the baseline CNN test metrics as percentages, one per line
print(
    f'CNN Accuracy: {cnn_accuracy_1 * 100:.2f}%',
    f'CNN Recall: {cnn_recall_1 * 100:.2f}%',
    f'CNN Precision: {cnn_precision_1 * 100:.2f}%',
    f'CNN F1 Score: {cnn_f1_1 * 100:.2f}%',
    sep='\n',
)

In [None]:
# Index of the highest-scoring class for each test sample (displayed as the cell output)
np.argmax(y_pred_cnn, axis=1)

In [None]:
# Confusion matrix on class ids: both the targets and the predicted scores
# are reduced with argmax along axis 1 before being compared.
confusion_matrix_cnn = confusion_matrix(
    np.argmax(y_test_cnn, axis=1),
    np.argmax(y_pred_cnn, axis=1),
)
plt.figure(figsize=(16, 9))
sns.heatmap(
    confusion_matrix_cnn,
    annot=True,
    cmap=CMAP_LIGHT,
    xticklabels=GENRES,
    yticklabels=GENRES,
)

In [None]:
# Training vs. validation loss per epoch for the baseline CNN
plt.figure(figsize=(10, 8))
plt.plot(cnn_history.epoch, cnn_history.history['loss'], label='Training Loss')
plt.plot(cnn_history.epoch, cnn_history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs', fontsize=16)
# The curves are losses, so the y-axis is labelled 'Loss' (was 'Accuracy')
plt.ylabel('Loss', fontsize=16)
plt.title('Training and Validation Loss', fontsize=18)
plt.legend(fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.grid(True)
plt.show()

Overfitting

In [None]:
# Per-sample input shape, taken from the CNN training set (axis 0 is the sample axis)
input_shape = (X_train_cnn.shape[1], X_train_cnn.shape[2], X_train_cnn.shape[3])

cnn_with_bn_dropout = create_cnn_with_bn_dropout(input_shape=input_shape, num_classes=len(GENRES))
cnn_with_bn_dropout.summary()

# Train and evaluate the regularised model itself. Fitting `cnn` here would
# keep training the baseline network and leave cnn_with_bn_dropout untrained.
cnn_history_2 = cnn_with_bn_dropout.fit(x=X_train_cnn,
                                        y=y_train_cnn,
                                        validation_data=(X_val_cnn, y_val_cnn),
                                        epochs=100,
                                        verbose=True)

# Per-class scores for the held-out test half
y_pred_cnn_2 = cnn_with_bn_dropout.predict(X_test_cnn)

Performance Metrics

In [None]:
# sklearn is already a dependency of this notebook; these three scorers are
# imported here because the top import cell does not bring them in.
from sklearn.metrics import f1_score, precision_score, recall_score

# y_test_cnn is one-hot and y_pred_cnn_2 holds per-class scores;
# accuracy_score rejects that mix of target types, so reduce each row to a
# class id first and score the hard predictions. Macro averaging weights
# every genre equally.
y_true_cnn_2_ids = np.argmax(y_test_cnn, axis=1)
y_pred_cnn_2_ids = np.argmax(y_pred_cnn_2, axis=1)

cnn_accuracy_2 = accuracy_score(y_true_cnn_2_ids, y_pred_cnn_2_ids)
cnn_recall_2 = recall_score(y_true_cnn_2_ids, y_pred_cnn_2_ids, average='macro')
cnn_precision_2 = precision_score(y_true_cnn_2_ids, y_pred_cnn_2_ids, average='macro')
cnn_f1_2 = f1_score(y_true_cnn_2_ids, y_pred_cnn_2_ids, average='macro')

In [None]:
# Report the test metrics of the second CNN run as percentages, one per line
print(
    f'CNN Accuracy: {cnn_accuracy_2 * 100:.2f}%',
    f'CNN Recall: {cnn_recall_2 * 100:.2f}%',
    f'CNN Precision: {cnn_precision_2 * 100:.2f}%',
    f'CNN F1 Score: {cnn_f1_2 * 100:.2f}%',
    sep='\n',
)

## *Stacked Denoising Autoencoder (SDA) + SVC*

In [None]:
# Acquire training, validation, and testing sets
# process_data is not defined in this section; it supplies the train / held-out split.
X_train_sda, X_test_sda, y_train_sda, y_test_sda = process_data(GTZAN)
# Split the held-out part (fixed seed): 80% becomes validation, 20% test.
# NOTE(review): the XGBoost and CNN sections use test_size=0.5 here -- confirm
# the different ratio is intentional.
X_val_sda, X_test_sda, y_val_sda, y_test_sda = train_test_split(X_test_sda, y_test_sda, test_size=0.2, random_state=1)

In [None]:
# Build the single-layer (64 units) stacked encoder together with its RBF SVC.
# create_sdae_svc is not defined in this section.
stacked_encoder_1, svc_1 = create_sdae_svc(
    X_train_sda,
    X_val_sda,
    y_train_sda,
    y_val_sda,
    num_layers=1,
    layer_sizes=[64],
    kernel='rbf',
    gamma=1,
    C=120,
    learning_rate=0.01,
    epochs=50,
    batch_size=64,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Run every split through the stacked encoder; the encoded training features
# are what the SVC is fitted on below.
X_train_features_1 = stacked_encoder_1.predict(X_train_sda)
X_test_features_1 = stacked_encoder_1.predict(X_test_sda)
X_val_features_1 = stacked_encoder_1.predict(X_val_sda)



In [None]:
# Fit the SVC on the encoded training features with one-hot targets.
# NOTE(review): sda_history_1 holds whatever svc_1.fit returns; for
# scikit-learn estimators that is the fitted estimator rather than a
# training history -- confirm.
sda_history_1 = svc_1.fit(X=X_train_features_1, y=to_categorical(y_train_sda))

# Predictions for the encoded test features
y_pred_1 = svc_1.predict(X_test_features_1)

In [None]:
# Inspect the raw SVC predictions (displayed as the cell output)
y_pred_1

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# One-hot encode the test labels once so they match the indicator-style
# rows in y_pred_1.
_y_test_sda_onehot = to_categorical(y_test_sda)

sda_accuracy_1 = accuracy_score(_y_test_sda_onehot, y_pred_1)

# The targets are cast to float32 for recall and to float64 for precision / F1,
# exactly as in the original calls to the metric helpers.
sda_recall_1 = get_recall(
    tf.cast(_y_test_sda_onehot, tf.float32),
    y_pred_1,
)
sda_precision_1 = get_precision(
    tf.cast(_y_test_sda_onehot, tf.float64),
    y_pred_1,
)
sda_f1_1 = get_f1(
    tf.cast(_y_test_sda_onehot, tf.float64),
    y_pred_1,
)

In [None]:
# Report the SVC test metrics as percentages, one per line
print(
    f'SVC Accuracy: {sda_accuracy_1 * 100:.2f}%',
    f'SVC Recall: {sda_recall_1 * 100:.2f}%',
    f'SVC Precision: {sda_precision_1 * 100:.2f}%',
    f'SVC F1 Score: {sda_f1_1 * 100:.2f}%',
    sep='\n',
)