# Deep Convolutional Neural Network for Makam Recognition
10-fold cross validation

### Library importing for file reading and preprocessing

In [2]:
import glob
import os
import sys
import json
import numpy as np
from mutagen.mp3 import MP3
import librosa
from scipy.interpolate import interp2d 
from sklearn.preprocessing import normalize
import warnings
from math import floor
import dill
import matplotlib.pyplot as plt
%matplotlib inline

### Pickle session

In [None]:
dill.dump_session('cnn10fold.db')

In [4]:
dill.load_session('cnn1fold.db')

Using TensorFlow backend.


### File reading using provided folds and preprocessing

In [27]:
#--------PARAMETERS--------#

#Choose truncation option
# 1 : 15 seconds starting from the start
# 2 : 15 seconds around the middle
# 3 : 15 seconds from the end
trunc_option = 2

#Choose bins and bins per octave
n_bins = 371
bins_per_octave = 53

#--------------------------#

#Ignore mp3 read warning
warnings.filterwarnings("ignore", message="PySoundFile failed. Trying audioread instead.")

makams = ["Acemasiran", "Acemkurdi", "Bestenigar", "Beyati", "Hicaz", "Hicazkar", "Huseyni", "Huzzam", "Karcigar", "Kurdilihicazkar", "Mahur", "Muhayyer", "Neva", "Nihavent", "Rast", "Saba", "Segah", "Sultaniyegah", "Suzinak", "Ussak"]

#Arrays containing all constant-Q transforms of the soundfiles per fold
X_train = [[],[],[],[],[],[],[],[],[],[]]
X_test = [[],[],[],[],[],[],[],[],[],[]]

#Array containing all makam labels per fold
y_train = [[],[],[],[],[],[],[],[],[],[]]
y_test = [[],[],[],[],[],[],[],[],[],[]]

#Import folds
f = open("./dlfm_makam_recognition_data/folds_updated.json")
folds = json.load(f)


#Compute cqts and import labels
for i in range(10): #in number of folds
    
    print("\nLoading fold " + str(i) + ".")
    
    #compute all training cqts and import train labels
    for source in range(len(folds[i][1]["training"]["sources"])):
        
        file_name = ("./soundfiles/" + folds[i][1]["training"]["sources"][source] + ".mp3")

        #get file duration for loading
        try:
            audio = MP3(file_name)
        except:
            print("Problem with:", file_name)
            continue
        audio = MP3(file_name)
        duration = audio.info.length
        
        #load 15 seconds from the mp3 according to truncation option
        if (trunc_option == 1): #15 seconds form the start
            #load soundfile
            y, sr = librosa.core.load(file_name, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
            
        if (trunc_option == 2): #15 seconds around the center
            offset = (duration-15)/2
            #load soundfile
            y, sr = librosa.core.load(file_name, offset=offset, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
            
        if (trunc_option == 3): #15 seconds from the end
            offset = duration-15
            #load soundfile
            y, sr = librosa.core.load(file_name, offset=offset, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
        
        #Normalization
        cqt = (cqt - np.mean(cqt)) / (np.std(cqt) + 1e-8)

        X_train[i].append(cqt)
        
        #import makam training label
        y_train[i].append(makams.index(folds[i][1]["training"]["modes"][source]))
        
        sys.stdout.write("\rLoading training %i/900" % (source+1))
        sys.stdout.flush()
        
    print()
    
    #compute all testing cqts and import test labels
    for ref in range(len(folds[i][1]["testing"])):
        
        file_name = ("./soundfiles/" + folds[i][1]["testing"][ref]["source"] + ".mp3")

        #get file duration for loading
        try:
            audio = MP3(file_name)
        except:
            print("Problem with:", file_name)
            continue
        duration = audio.info.length
        
        #load 15 seconds from the mp3 according to truncation option
        if (trunc_option == 1): #15 seconds form the start
            #load soundfile
            y, sr = librosa.core.load(file_name, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
            
        if (trunc_option == 2): #15 seconds around the center
            offset = (duration-15)/2
            #load soundfile
            y, sr = librosa.core.load(file_name, offset=offset, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
            
        if (trunc_option == 3): #15 seconds from the end
            offset = duration-15
            #load soundfile
            y, sr = librosa.core.load(file_name, offset=offset, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
        
        #Normalization
        cqt = (cqt - np.mean(cqt)) / (np.std(cqt) + 1e-8)
        
        X_test[i].append(cqt)
        
        #import makam testing label
        y_test[i].append(makams.index(folds[i][1]["testing"][ref]["mode"]))
        
        sys.stdout.write("\rLoading testing %i/100" % (ref+1))
        sys.stdout.flush()
        
print("\nFold loading completed.")

print("X_train shape:", len(X_train), len(X_train[0]), X_train[0][0].shape)
print("X_test shape:", len(X_test), len(X_test[0]), X_test[0][0].shape)
print("y_train shape:", len(y_train))
print("y_test shape:", len(y_test))

f.close()


Loading fold 0.
Loading training 900/900
Loading testing 100/100
Fold loading completed.
X_train shape: 10 900 (371, 646)
X_test shape: 10 100 (371, 646)
y_train shape: 10
y_test shape: 10


### Library importing for deep learning

In [25]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, BatchNormalization, Dropout
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.optimizers import Adam

### Data preparation

In [34]:
#one-hot encode target vector
Y_train = []
Y_test = []
for i in range(10):
    Y_train.append(to_categorical(np.asarray(y_train[i])))
    Y_test.append(to_categorical(np.asarray(y_test[i])))
Y_train = np.asarray(Y_train)
Y_test = np.asarray(Y_test)
#channel dimension
for i in range(10):
    X_train[i] = (np.asarray(X_train[i])).reshape(len(X_train[i]), X_train[i][0].shape[0], X_train[i][0].shape[1], 1)
    X_test[i] = (np.asarray(X_test[i])).reshape(len(X_test[i]), X_test[i][0].shape[0], X_test[i][0].shape[1], 1)
# X_train = np.asarray(X_train)
# X_test = np.asarray(X_test)
print("X_train shape:", X_train[0].shape)
print("X_test shape:", X_test[0].shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

X_train shape: (900, 371, 646, 1)
X_test shape: (100, 371, 646, 1)
Y_train shape: (1, 900, 20)
Y_test shape: (1, 100, 20)


### Constructing model

In [47]:
models = []
for i in range(10):
    #Network topology
    model = Sequential()

    #3 convolutional layers
    model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', input_shape=(X_train[0].shape[1], X_train[0].shape[2], 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

    model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', input_shape=(X_train[0].shape[1], X_train[0].shape[2], 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

    model.add(Conv2D(filters=64, kernel_size=(2,2), activation='relu', input_shape=(X_train[0].shape[1], X_train[0].shape[2], 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))

    #Output layer
    model.add(Dense(20, activation='softmax'))
    
    #Optimizer
    optimizer = Adam(learning_rate=0.0001)
    
    #Compile model
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.summary()
    
    models.append(model)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_28 (Conv2D)           (None, 369, 644, 64)      640       
_________________________________________________________________
max_pooling2d_28 (MaxPooling (None, 184, 322, 64)      0         
_________________________________________________________________
batch_normalization_28 (Batc (None, 184, 322, 64)      256       
_________________________________________________________________
conv2d_29 (Conv2D)           (None, 182, 320, 64)      36928     
_________________________________________________________________
max_pooling2d_29 (MaxPooling (None, 91, 160, 64)       0         
_________________________________________________________________
batch_normalization_29 (Batc (None, 91, 160, 64)       256       
_________________________________________________________________
conv2d_30 (Conv2D)           (None, 90, 159, 64)     

### Model training

In [48]:
histories = []
for i in range(1):
    history = models[i].fit(X_train[i], Y_train[i], validation_split=0.25, epochs=50, batch_size=32)
    histories.append(history)

Train on 675 samples, validate on 225 samples
Epoch 1/5


  return ops.EagerTensor(value, ctx.device_name, dtype)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Model evaluation

In [52]:
scores = [[],[],[],[],[],[],[],[],[],[]]
y_pred = [[],[],[],[],[],[],[],[],[],[]]
for i in range(1):
    scores[i].append(models[i].evaluate(X_test[i], Y_test[i], verbose=1))
    y_pred[i].append(np.argmax(models[i].predict(X_test[i], verbose=1)))
scores = np.asarray(scores)
y_pred = np.asarray(y_pred)



In [61]:
from sklearn.metrics import confusion_matrix
y_test=np.asarray(y_test)
cms = np.zeros((20,20))
for i in range(10):
    cms += confusion_matrix(y_test[i], y_pred[i])

[list([1995]) list([]) list([]) list([]) list([]) list([]) list([])
 list([]) list([]) list([])]


In [None]:
plt.matshow(cms)
plt.show()