# Imports

In [1]:
import pandas as pd
import numpy as np

import librosa
import librosa.display

import matplotlib.pyplot as plt

from tqdm import tqdm

# Load Feature File

In [2]:
df = pd.read_pickle('features.csv')

In [3]:
df.head()

Unnamed: 0,file,label,mel_spec,chroma,mfcc
0,data/train/train1.aiff,0,"[[-23.034714, -21.979626, -21.229372, -24.6853...","[[0.31054255, 0.492938, 0.4222786, 0.2000359, ...","[[-583.5913, -585.6181, -586.71844, -586.3785,..."
1,data/train/train2.aiff,0,"[[-39.132935, -43.144306, -44.267673, -40.4978...","[[0.77240217, 0.4858926, 0.57792246, 0.6777284...","[[-551.0518, -595.86365, -686.11633, -690.5628..."
2,data/train/train3.aiff,0,"[[-29.182926, -34.866005, -49.983154, -50.7379...","[[0.34971952, 0.612205, 0.78210807, 1.0, 0.630...","[[-580.9305, -609.74207, -674.57465, -674.7717..."
3,data/train/train4.aiff,0,"[[-38.25903, -41.157684, -50.174606, -53.21215...","[[0.9187672, 1.0, 0.65901333, 1.0, 1.0, 0.5160...","[[-655.11835, -657.95215, -661.39966, -655.009..."
4,data/train/train5.aiff,0,"[[-29.123068, -35.563293, -50.79712, -52.36865...","[[0.9564307, 0.88239145, 1.0, 1.0, 0.52427757,...","[[-556.8616, -578.51294, -618.99274, -616.5934..."


# Define Features and Target

In [5]:
X = df[['mel_spec','chroma','mfcc']]
y = df['label']

# Split Data

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [10]:
ms_train = x_train['mel_spec']
ch_train = x_train['chroma']
mf_train = x_train['mfcc']

ms_test = x_test['mel_spec']
ch_test = x_test['chroma']
mf_test = x_test['mfcc']

# Convert Data

In [11]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [12]:
le = LabelEncoder()
y_test = to_categorical(le.fit_transform(y_test)) 
y_train = to_categorical(le.fit_transform(y_train)) 

In [13]:
ms_train = np.array(ms_train.tolist())
ch_train = np.array(ch_train.tolist())
mf_train = np.array(mf_train.tolist())

ms_test = np.array(ms_test.tolist())
ch_test = np.array(ch_test.tolist())
mf_test = np.array(mf_test.tolist())

y_train = np.array(y_train.tolist())
y_test = np.array(y_test.tolist())

In [14]:
num_labels = 2
filter_size = 2

# Model

### Model Imports

In [15]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, concatenate, Input
from tensorflow.keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from sklearn import metrics 

In [22]:
ms_train.shape

(24000, 87, 128, 1)

In [23]:
ch_train.shape

(24000, 87, 12, 1)

In [17]:
mf_train.shape

(24000, 20, 87)

In [19]:
num_rows = 87

num_columns_ms = 128
num_columns_ch = 12
num_columns_mf = 20

num_channels = 1

In [20]:
ms_train = ms_train.reshape(ms_train.shape[0], num_rows, num_columns_ms, num_channels)
ms_test = ms_test.reshape(ms_test.shape[0], num_rows, num_columns_ms, num_channels)

ch_train = ch_train.reshape(ch_train.shape[0], num_rows, num_columns_ch, num_channels)
ch_test = ch_test.reshape(ch_test.shape[0], num_rows, num_columns_ch, num_channels)

mf_train = mf_train.reshape(mf_train.shape[0], num_rows, num_columns_mf, num_channels)
mf_test = mf_test.reshape(mf_test.shape[0], num_rows, num_columns_mf, num_channels)

In [26]:
# Construct model with Mel Spectrograms 
ms = Sequential()
ms.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns_ms, num_channels), activation='relu'))
ms.add(MaxPooling2D(pool_size=2))
ms.add(Dropout(0.2))

ms.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
ms.add(MaxPooling2D(pool_size=2))
ms.add(Dropout(0.2))

ms.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
ms.add(MaxPooling2D(pool_size=2))
ms.add(Dropout(0.2))

ms.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
ms.add(MaxPooling2D(pool_size=2))
ms.add(Dropout(0.2))
ms.add(GlobalAveragePooling2D())

# Construct model with Chroma Feature 
ch = Sequential()
ch.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns_ch, num_channels), activation='relu'))
ch.add(MaxPooling2D(pool_size=1))
ch.add(Dropout(0.2))

ch.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
ch.add(MaxPooling2D(pool_size=1))
ch.add(Dropout(0.2))

ch.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
ch.add(MaxPooling2D(pool_size=1))
ch.add(Dropout(0.2))

ch.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
ch.add(MaxPooling2D(pool_size=1))
ch.add(Dropout(0.2))
ch.add(GlobalAveragePooling2D())

#Merge Outputs

model_concat = concatenate([ms.output, ch.output])

model_concat = Dense(num_labels, activation='softmax')(model_concat)

model = Model(inputs=[ms.input, ch.input], outputs=model_concat)

In [27]:
# Compile the model
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# Display model architecture summary 
model.summary()

# Calculate pre-training accuracy 
score = model.evaluate([ms_test, ch_test], y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy) 

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
conv2d_8_input (InputLayer)     [(None, 87, 128, 1)] 0                                            
__________________________________________________________________________________________________
conv2d_12_input (InputLayer)    [(None, 87, 12, 1)]  0                                            
__________________________________________________________________________________________________
conv2d_8 (Conv2D)               (None, 86, 127, 16)  80          conv2d_8_input[0][0]             
__________________________________________________________________________________________________
conv2d_12 (Conv2D)              (None, 86, 11, 16)   80          conv2d_12_input[0][0]            
______________________________________________________________________________________________

In [23]:
from tensorflow.keras.callbacks import ModelCheckpoint 
from datetime import datetime

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.basic_cnn.hdf5', 
                               verbose=1, save_best_only=True)

# Define early_stopping_monitor
from tensorflow.keras.callbacks import EarlyStopping
early_stopping_monitor = EarlyStopping(patience=10)

my_callbacks = [early_stopping_monitor, checkpointer]

In [30]:
from datetime import datetime 

num_epochs = 1000
num_batch_size = 10

start = datetime.now()

model.fit([ms_train, ch_train], y_train, 
          batch_size=num_batch_size, 
          epochs=num_epochs, 
          validation_data=([ms_test, ch_test], y_test), 
          callbacks=my_callbacks, 
          verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/1000

Epoch 00001: val_loss improved from 0.45080 to 0.36629, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 2/1000

Epoch 00002: val_loss improved from 0.36629 to 0.31897, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 3/1000

Epoch 00003: val_loss did not improve from 0.31897
Epoch 4/1000

Epoch 00004: val_loss did not improve from 0.31897
Epoch 5/1000

Epoch 00005: val_loss did not improve from 0.31897
Epoch 6/1000

Epoch 00006: val_loss improved from 0.31897 to 0.29913, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 7/1000

Epoch 00007: val_loss improved from 0.29913 to 0.28051, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 8/1000

Epoch 00008: val_loss improved from 0.28051 to 0.27558, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 9/1000

Epoch 00009: val_loss did not improve from 0.27558
Epoch 10/1000

Epoch 00010: val_loss improved from 0.27558 to 0.26966, saving model to saved_models/weigh

In [32]:
# Evaluating the model on the training and testing set
score = model.evaluate([ms_train, ch_train], y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate([ms_test,ch_test], y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.8846250176429749
Testing Accuracy:  0.8858333230018616


In [33]:
y_pred = model.predict([ms_test,ch_test])

In [43]:
y_pred = np.argmax(y_pred, axis=1)

In [34]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [49]:
y_test[:,1]

array([0., 0., 0., ..., 1., 1., 0.])

In [50]:
classification_report(y_test[:,1], y_pred)

'              precision    recall  f1-score   support\n\n         0.0       0.88      0.98      0.93      4639\n         1.0       0.90      0.56      0.69      1361\n\n    accuracy                           0.89      6000\n   macro avg       0.89      0.77      0.81      6000\nweighted avg       0.89      0.89      0.88      6000\n'

In [51]:
confusion_matrix(y_test[:,1], y_pred)

array([[4555,   84],
       [ 601,  760]])

In [53]:
model.save('multicnn')

INFO:tensorflow:Assets written to: multicnn/assets


# CNN with 3 Features

In [22]:
# Construct model with Mel Spectrograms 
ms2 = Sequential()
ms2.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns_ms, num_channels), activation='relu'))
ms2.add(MaxPooling2D(pool_size=2))
ms2.add(Dropout(0.2))

ms2.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
ms2.add(MaxPooling2D(pool_size=2))
ms2.add(Dropout(0.2))

ms2.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
ms2.add(MaxPooling2D(pool_size=2))
ms2.add(Dropout(0.2))

ms2.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
ms2.add(MaxPooling2D(pool_size=2))
ms2.add(Dropout(0.2))
ms2.add(GlobalAveragePooling2D())

# Construct model with Chroma Feature 
ch2 = Sequential()
ch2.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns_ch, num_channels), activation='relu'))
ch2.add(MaxPooling2D(pool_size=1))
ch2.add(Dropout(0.2))

ch2.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
ch2.add(MaxPooling2D(pool_size=1))
ch2.add(Dropout(0.2))

ch2.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
ch2.add(MaxPooling2D(pool_size=1))
ch2.add(Dropout(0.2))

ch2.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
ch2.add(MaxPooling2D(pool_size=1))
ch2.add(Dropout(0.2))
ch2.add(GlobalAveragePooling2D())

#Consturct model with MFCC
mf2 = Sequential()
mf2.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns_mf, num_channels), activation='relu'))
mf2.add(MaxPooling2D(pool_size=1))
mf2.add(Dropout(0.2))

mf2.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
mf2.add(MaxPooling2D(pool_size=1))
mf2.add(Dropout(0.2))

mf2.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
mf2.add(MaxPooling2D(pool_size=1))
mf2.add(Dropout(0.2))

mf2.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
mf2.add(MaxPooling2D(pool_size=1))
mf2.add(Dropout(0.2))
mf2.add(GlobalAveragePooling2D())

#Merge Outputs

model_concat2 = concatenate([ms2.output, ch2.output, mf2.output])

model_concat2 = Dense(num_labels, activation='softmax')(model_concat2)

model2 = Model(inputs=[ms2.input, ch2.input, mf2.input], outputs=model_concat2)

In [25]:
# Compile the model
model2.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# Display model architecture summary 
model2.summary()

# Calculate pre-training accuracy 
score = model2.evaluate([ms_test, ch_test, mf_test], y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy) 

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
conv2d_8_input (InputLayer)     [(None, 87, 128, 1)] 0                                            
__________________________________________________________________________________________________
conv2d_12_input (InputLayer)    [(None, 87, 12, 1)]  0                                            
__________________________________________________________________________________________________
conv2d_16_input (InputLayer)    [(None, 87, 20, 1)]  0                                            
__________________________________________________________________________________________________
conv2d_8 (Conv2D)               (None, 86, 127, 16)  80          conv2d_8_input[0][0]             
______________________________________________________________________________________________

In [26]:
from datetime import datetime 

num_epochs = 1000
num_batch_size = 10

start = datetime.now()

model2.fit([ms_train, ch_train, mf_train], y_train, 
          batch_size=num_batch_size, 
          epochs=num_epochs, 
          validation_data=([ms_test, ch_test, mf_test], y_test), 
          callbacks=my_callbacks, 
          verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/1000

Epoch 00001: val_loss improved from inf to 0.36656, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 2/1000

Epoch 00002: val_loss improved from 0.36656 to 0.30455, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 3/1000

Epoch 00003: val_loss improved from 0.30455 to 0.28822, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 4/1000

Epoch 00004: val_loss did not improve from 0.28822
Epoch 5/1000

Epoch 00005: val_loss did not improve from 0.28822
Epoch 6/1000

Epoch 00006: val_loss improved from 0.28822 to 0.26133, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 7/1000

Epoch 00007: val_loss improved from 0.26133 to 0.25124, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 8/1000

Epoch 00008: val_loss did not improve from 0.25124
Epoch 9/1000

Epoch 00009: val_loss improved from 0.25124 to 0.24449, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 10/1000

Epoch 00010: val_loss did not im

In [28]:
# Evaluating the model on the training and testing set
score = model2.evaluate([ms_train, ch_train, mf_train], y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model2.evaluate([ms_test,ch_test, mf_test], y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.9125833511352539
Testing Accuracy:  0.9011666774749756


In [30]:
model2.save('multicnn3')

INFO:tensorflow:Assets written to: multicnn3/assets
