# Deep Convolutional Neural Network for Makam Recognition

### Library importing for file reading and preprocessing

In [1]:
import glob
import os
import numpy as np
import librosa
from scipy.interpolate import interp2d 
from sklearn.preprocessing import normalize
import warnings
from math import floor
import dill

In [3]:
dill.dump_session('cnn.db')

In [5]:
dill.load_session('cnn_trained.db')

Using TensorFlow backend.


### File reading and preprocessing

Read all .mp3 files and retrieve their makam based on the folder that their equivalent .pitch files exists.

In [25]:
#Array containing all constant-Q tranforms of the soundfiles
cqts = []

#Array containing all makam labels
Y = []

#Makam list for more efficient file searching during label retrieval
makams = ["Acemasiran", "Acemkurdi", "Bestenigar", "Beyati", "Hicaz", "Hicazkar", "Huseyni", "Huzzam", "Karcigar", "Kurdilihicazkar", "Mahur", "Muhayyer", "Neva", "Nihavent", "Rast", "Saba", "Segah", "Sultaniyegah", "Suzinak", "Ussak"]

#Ignore mp3 read warning
warnings.filterwarnings("ignore", message="PySoundFile failed. Trying audioread instead.")

#keep shortest cqt length
minlen = 100000

index = 0
#Traverse directory
for root, dirs, files in os.walk('./soundfiles'):
        for name in files:
            
            #----------------------Labels------------------------#
            
            #find under which folder the file is (for makam retrieval) and append label set
            matched = 0
            for makam in makams:
                if (os.path.isfile("./otmm_makam_recognition_dataset/data/" + makam + "/" + name[:-4] + ".pitch") == True):
                    Y.append(makams.index(makam))
                    matched = 1
                    break
                    
            #if soundfile not in pitch data, ignore
            if (matched == 0):
                continue
            
            #----------------Constant-Q Transform----------------#
            
            #construct soundfile directory
            filedir = os.path.join(root, name)
            
#             #load MP3 length metadata
#             audio = MP3(filedir)
#             length = audio.info.length
#             if (length < minlen):
#                 minlen = length
#                 print(minlen)
            
#             if (index == 424 or index == 425 or index == 426 or index == 427):
#                 y, sr = librosa.core.load(filedir)
#                 cqt = librosa.core.cqt(y, sr, n_bins=371, bins_per_octave=53)
#                 print(cqt.shape)
                
            #load soundfile
            y, sr = librosa.core.load(filedir)
            
            #compute the constant-Q transform from audio signal
            cqt = librosa.core.cqt(y, sr, n_bins=371, bins_per_octave=53) #up to C7 in 53TET
            
            #keeping 15 seconds around center
            offset = floor((cqt.shape[1] - 645)/2)
            t_cqt = np.zeros((371, 645), dtype=complex)
            for i in range(371):
                for j in range(645):
                    t_cqt[i][j] = cqt[i][j+offset]
            cqts.append(t_cqt)
            
            
#             #Resampling
#             interpol_f = []
#             for cqt in cqts:
#                 Xindex = np.linspace(0, 1, num=371)
#                 Yindex = np.linspace(0, 1, num=cqt.shape[1])
#                 f = interp2d(Xindex, Yindex, cqt.flatten(), kind='linear')
    
#             #Resample cqts
#             X = np.zeros((371, 1855), dtype=complex)
#             for i in range(len(cqts)):
#                 Xindex_rs = np.linspace(0, 1, num=371)
#                 Yindex_rs = np.linspace(0, 1, num=3710)
#                 X.append(np.reshape(interpol_f[i](Xindex_rs, Yindex_rs), (371, 3710)))
            
#             #append constat-Q transform to set
#             cqts.append(cqt)
            
#             #update minlen
#             if (cqt.shape[1])<minlen:
#                 minlen = cqt.shape[1]
#                 print (minlen, end=', ')
            
            
            #print files processed
            print(index, end=", ")
            index+=1

            
print(len(cqts))
print(len(Y))   


0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 

In [16]:
hi = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
for i in range(len(Y)):
    hi[Y[i]]+=1
print (hi)

[50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]


### Constant-Q transform resampling
CQTs are going to vary in length based on the length of the audio file. One wait to deal with this is resampling the CQTs to the common size (371, 3710). 

** Calculate average song length to justify that 3710 is a reasonable middle ground because some will need upscalling while most downscaling **

** I don't think resampled arrays contain complex numbers anymore. **

In [4]:
#Compute all interpolation functions
interpol_f = []
for cqt in cqts:
    Xindex = np.linspace(0, 1, num=371)
    Yindex = np.linspace(0, 1, num=cqt.shape[1])
    f = interp2d(Xindex, Yindex, cqt.flatten(), kind='linear')
    interpol_f.append(f)
    
#Resample cqts
X = np.zeros((371, 3710), dtype=complex)
for i in range(len(cqts)):
    Xindex_rs = np.linspace(0, 1, num=371)
    Yindex_rs = np.linspace(0, 1, num=3710)
    X.append(np.reshape(interpol_f[i](Xindex_rs, Yindex_rs), (371, 3710)))
    
print(X)

  kx=kx, ky=ky, s=0.0)


### Constant-Q transform truncation
An alternative to resampling the cqts would be to truncate all soundfiles to the length of the shortest soundfiles.

** This seems like a less productive approach, but probably depends on duration deviation among soundfiles. **

##### Use this, interpolation is bad for retaining frequency

In [None]:
print(minlen)
for i in range(len(cqts)):
    #create truncated cqt array of shape (371, minlen)
    t_cqt = np.zeros((371, minlen))
    for j in range(371):
        for k in range(minlen):
            t_cqt[j][k]

### Library importing for deep learning

In [6]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, BatchNormalization, Dropout
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

### Train - Test split


In [17]:
#one-hot encode target vector
y_c = to_categorical(Y)
X_train, X_test, y_train, y_test = train_test_split((np.asarray(cqts)).reshape(len(cqts),371,645,1), np.asarray(y_c), test_size=0.33, random_state=42)
print(X_train.shape)
print(y_c)

(670, 371, 645, 1)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]


In [18]:
print(y_c.shape)

(1000, 20)


### Building the CNN

In [19]:
#Network topology
model = Sequential()

#3 convolutional layers
model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', input_shape=(371, 645, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(BatchNormalization())

model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', input_shape=(371, 645, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(BatchNormalization())

model.add(Conv2D(filters=64, kernel_size=(2,2), activation='relu', input_shape=(371, 645, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(BatchNormalization())

model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))

#Output layer
model.add(Dense(20, activation='softmax'))

#Compile model
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 369, 643, 64)      640       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 184, 321, 64)      0         
_________________________________________________________________
batch_normalization_7 (Batch (None, 184, 321, 64)      256       
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 182, 319, 64)      36928     
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 91, 159, 64)       0         
_________________________________________________________________
batch_normalization_8 (Batch (None, 91, 159, 64)       256       
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 90, 158, 64)      

### Model training

In [20]:
history = model.fit(X_train, y_train, validation_split=0.25, epochs=10, batch_size=30)


Train on 502 samples, validate on 168 samples
Epoch 1/10


  return ops.EagerTensor(value, ctx.device_name, dtype)


Epoch 2/10
Epoch 3/10
 30/502 [>.............................] - ETA: 2:32 - loss: 1.3290 - accuracy: 0.9050

KeyboardInterrupt: 

In [12]:
#Model evaluation
scores = model.evaluate(X_test, y_test, verbose=1)
print(scores)

[0.5556676138531078, 0.9499999284744263]
