# Deep Convolutional Neural Network for Makam Recognition

### Library importing for file reading and preprocessing

In [2]:
import glob
import os
import numpy as np
import librosa
from scipy.interpolate import interp2d 
from sklearn.preprocessing import normalize

### File reading and preprocessing

Read all .mp3 files and retrieve their makam based on the folder that their equivalent .pitch files exists.

In [5]:
#Array containing all constant-Q tranforms of the soundfiles
cqts = []

#Array containing all makam labels
Y = []

#Makam list for more efficient file searching during label retrieval
makams = ["Acemasiran", "Acemkurdi", "Bestenigar", "Beyati", "Hicaz", "Hicazkar", "Huseyni", "Huzzam", "Karcigar", "Kurdilihicazkar", "Mahur", "Muhayyer", "Neva", "Nihavent", "Rast", "Saba", "Segah", "Sultaniyegah", "Suzinak", "Ussak"]

#Traverse directory
for root, dirs, files in os.walk('./soundfiles'):
        for name in files:
            
            #----------------------Labels------------------------#
            
            #find under which folder the file is (for makam retrieval) and append label set
            matched = 0
            for makam in makams:
                if (os.path.isfile("./otmm_makam_recognition_dataset/data/" + makam + "/" + name[:-4] + ".pitch") == True):
                    Y.append(makam)
                    matched = 1
                    break
                    
            #if soundfile not in pitch data, ignore
            if (matched == 0):
                continue
            
            #----------------Constant-Q Transform----------------#
            
            #construct soundfile directory
            filedir = os.path.join(root, name)
            
            #load soundfile
            y, sr = librosa.core.load(filedir)
            
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=371, bins_per_octave=53)
            
            #append constat-Q transform to set
            cqts.append(cqt)
            
            print(cqt.shape)
            
print(len(cqts))
print(len(Y))
print(cqts)
print(Y)
            



(84, 7396)
1
1
[array([[ 2.83952995e-02-3.89740239e-05j,  1.10968372e-03-2.82432084e-02j,
        -2.81479643e-02-1.54263462e-03j, ...,
         0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
         0.00000000e+00+0.00000000e+00j],
       [ 8.23263045e-03-3.99725673e-04j, -3.47274064e-03-9.69378004e-03j,
        -1.25403658e-02+4.46751686e-03j, ...,
         0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
         0.00000000e+00+0.00000000e+00j],
       [-1.87201765e-02-1.32284282e-04j, -1.31531823e-02+1.36977796e-02j,
         2.40908465e-04+1.90910697e-02j, ...,
         0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
         0.00000000e+00+0.00000000e+00j],
       ...,
       [ 2.30075738e-06-2.02411164e-07j,  2.20067607e-04+5.59977871e-04j,
        -8.98479333e-05+5.08891169e-04j, ...,
         0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
         0.00000000e+00+0.00000000e+00j],
       [-5.64719955e-06-4.34

### Constant-Q transform resampling
CQTs are going to vary in length based on the length of the audio file. One wait to deal with this is resampling the CQTs to the common size (371, 3710). 

** Calculate average song length to justify that 3710 is a reasonable middle ground because some will need upscalling while most downscaling **

** I don't think resampled arrays contain complex numbers anymore. **

In [12]:
#Compute all interpolation functions
interpol_f = []
for cqt in cqts:
    Xindex = np.linspace(0, 1, num=371)
    Yindex = np.linspace(0, 1, num=cqt.shape[1])
    f = interp2d(Xindex, Yindex, cqt.flatten(), kind='linear')
    interpol_f.append(f)
    
#Resample cqts
X = []
for i in range(len(cqts)):
    Xindex_rs = np.linspace(0, 1, num=3710)
    Yindex_rs = np.linspace(0, 1, num=371)
    X.append(np.reshape(interpol_f[i](Xindex_rs, Yindex_rs), (371, 3710)))

### Constant-Q transform truncation
An alternative to resampling the cqts would be to truncate all soundfiles to the length of the shortest soundfiles.

** This seems like a less productive approach, but probably depends on duration deviation among soundfiles. **

### Library importing for deep learning

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation
from sklearn.model_selection import train_test_split

### Train - Test split


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Building the CNN

In [120]:
#Network topology
model = Sequential()

#3 convolutional layers
model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', input_shape=(371, 3710)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(keras.layers.BatchNormalization())

model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', input_shape=(371, 3710)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(keras.layers.BatchNormalization())

model.add(Conv2D(filters=64, kernel_size=(2,2), activation='relu', input_shape=(371, 3710)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(keras.layers.BatchNormalization())

model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(keras.layers.Dropout(0.3))

#Output layer
model.add(Dense(20, activation='softmax'))

#Compile model
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

### Model training

In [122]:
history = model.fit(X_train, y_train, validation_split=0.25, epochs=20, batch_size=30)

Train on 18750 samples, validate on 6250 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
