# Deep Convolutional Neural Network for Makam Recognition
10-fold cross validation

### Library importing for file reading and preprocessing

In [9]:
import glob
import os
import json
import numpy as np
from mutagen.mp3 import MP3
import librosa
from scipy.interpolate import interp2d 
from sklearn.preprocessing import normalize
import warnings
from math import floor
import dill

### Pickle session

In [None]:
dill.dump_session('cnn10fold.db')

In [None]:
dill.load_session('cnn10fold.db')

### File reading using provided folds and preprocessing

In [None]:
#--------PARAMETERS--------#

#Choose truncation option
# 1 : 15 seconds starting from the start
# 2 : 15 seconds around the middle
# 3 : 15 seconds from the end
trunc_option = 2

#Choose bins and bins per octave
n_bins = 371
bins_per_octave = 53

#--------------------------#

#Ignore mp3 read warning
warnings.filterwarnings("ignore", message="PySoundFile failed. Trying audioread instead.")

makams = ["Acemasiran", "Acemkurdi", "Bestenigar", "Beyati", "Hicaz", "Hicazkar", "Huseyni", "Huzzam", "Karcigar", "Kurdilihicazkar", "Mahur", "Muhayyer", "Neva", "Nihavent", "Rast", "Saba", "Segah", "Sultaniyegah", "Suzinak", "Ussak"]

#Arrays containing all constant-Q transforms of the soundfiles per fold
X_train = [[],[],[],[],[],[],[],[],[],[]]
X_test = [[],[],[],[],[],[],[],[],[],[]]

#Array containing all makam labels per fold
y_train = [[],[],[],[],[],[],[],[],[],[]]
y_test = [[],[],[],[],[],[],[],[],[],[]]

#Import folds
f = open("./dlfm_makam_recognition_data/folds_updated.json")
folds = json.load(f)


#Compute cqts and import labels
for i in range(2): #in number of folds
    
    
    #compute all training cqts and import train labels
    for source in range(len(folds[i][1]["training"]["sources"])):
        
        file_name = ("./soundfiles/" + folds[i][1]["training"]["sources"][source] + ".mp3")

        #get file duration for loading
        try:
            audio = MP3(file_name)
        except:
            print("Problem with:", file_name)
            continue
        audio = MP3(file_name)
        duration = audio.info.length
        
        #load 15 seconds from the mp3 according to truncation option
        if (trunc_option == 1): #15 seconds form the start
            #load soundfile
            y, sr = librosa.core.load(file_name, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
            
        if (trunc_option == 2): #15 seconds around the center
            offset = (duration-15)/2
            #load soundfile
            y, sr = librosa.core.load(file_name, offset=offset, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
            
        if (trunc_option == 3): #15 seconds from the end
            offset = duration-15
            #load soundfile
            y, sr = librosa.core.load(file_name, offset=offset, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
        
        #Normalization
        cqt = (cqt - np.mean(cqt)) / (np.std(cqt) + 1e-8)
        
        X_train[i].append(cqt)
        
        #import makam training label
        y_train[i].append(makams.index(folds[i][1]["training"]["modes"][source]))
        
        
    #compute all testing cqts and import test labels
    for ref in range(len(folds[i][1]["testing"])):
        
        file_name = ("./soundfiles/" + folds[i][1]["testing"][ref]["source"] + ".mp3")

        #get file duration for loading
        try:
            audio = MP3(file_name)
        except:
            print("Problem with:", file_name)
            continue
        duration = audio.info.length
        
        #load 15 seconds from the mp3 according to truncation option
        if (trunc_option == 1): #15 seconds form the start
            #load soundfile
            y, sr = librosa.core.load(file_name, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
            
        if (trunc_option == 2): #15 seconds around the center
            offset = (duration-15)/2
            #load soundfile
            y, sr = librosa.core.load(file_name, offset=offset, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
            
        if (trunc_option == 3): #15 seconds from the end
            offset = duration-15
            #load soundfile
            y, sr = librosa.core.load(file_name, offset=offset, duration=15)
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=n_bins, bins_per_octave=bins_per_octave) #up to C7 in 53TET
        
        #Normalization
        cqt = (cqt - np.mean(cqt)) / (np.std(cqt) + 1e-8)
        
        X_test[i].append(cqt)
        
        #import makam testing label
        y_test[i].append(makams.index(folds[i][1]["testing"][ref]["mode"]))
        
    print("Fold", i, "loaded.")

print("X_train shape:", len(X_train), len(X_train[0]), X_train[0][0].shape)
print("X_test shape:", len(X_test), len(X_test[0]), X_test[0][0].shape)
print("y_train shape:", len(y_train))
print("y_test shape:", len(y_test))

### Library importing for deep learning

In [15]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, BatchNormalization, Dropout
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

Using TensorFlow backend.


### Data preparation

In [21]:
#one-hot encode target vector
Y_train = []
Y_test = []
for i in range(len(y_train)):
    Y_train.append(to_categorical(np.asarray(y_train[i])))
    Y_test.append(to_categorical(np.asarray(y_test[i])))
Y_train = to_categorical(np.asarray(y_train))
Y_test = to_categorical(np.asarray(y_test))
#channel dimension
for i in range(len(X_train)):
    X_train[i] = (np.asarray(X_train[i])).reshape(len(X_train[i]), X_train[i][0].shape[0], X_train[i][0].shape[1], 1)
    X_test[i] = (np.asarray(X_test[i])).reshape(len(X_test[i]), X_test[i][0].shape[0], X_test[i][0].shape[1], 1)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", len(y_train))
print("y_test shape:", len(y_test))

ValueError: zero-size array to reduction operation maximum which has no identity

### Constructing model

In [None]:
models = []
for i in range(len(folds)):
    #Network topology
    model = Sequential()

    #3 convolutional layers
    model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', input_shape=(X_train[i][0].shape[0], X_train[i][0].shape[1], 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

    model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', input_shape=(X_train[i][0].shape[0], X_train[i][0].shape[1], 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

    model.add(Conv2D(filters=64, kernel_size=(2,2), activation='relu', input_shape=(X_train[i][0].shape[0], X_train[i][0].shape[1], 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))

    #Output layer
    model.add(Dense(20, activation='softmax'))

    #Compile model
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.summary()
    
    models.append(model)

### Model training

In [None]:
histories = []
for i in range(len(folds)):
    history = model[i].fit(X_train[i], Y_train[i], validation_split=0.25, epochs=10, batch_size=30)
    histories.append(history)

### Model evaluation

In [None]:
scores = []
y_pred = []
for i in range(len(folds)):
    score = model[i].evaluate(X_test[i], Y_test[i], verbose=1)
    scores.append(score)