# Classification of musical rhythms of Ecuador using Deep Learning
### 1 Data Exploration and Visualisation

In [1]:
# Load imports

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt

In [7]:
import pandas as pd
# Read the ';'-separated dataset index and preview its first rows
metadata = pd.read_csv(
    'audios/dataset.csv',
    sep=';',
    encoding='unicode_escape',
)
metadata.head()

Unnamed: 0,Id,Sonido,Clase,Clase_Id
0,1,1.wav,Pasillo,1
1,2,2.wav,Pasillo,1
2,3,3.wav,Pasillo,1
3,4,4.wav,Pasillo,1
4,5,5.wav,Pasillo,1


In [8]:
# Number of audio files available for each rhythm class
print(metadata["Clase"].value_counts())

Capishca      60
Tonada        60
Danzante      60
Pasillo       60
Alza          60
SanJuanito    60
Yumbo         60
Albazo        60
Pasacalle     60
Name: Clase, dtype: int64


In [12]:
import struct

class WavFileHelper():
    """Reads basic format properties straight from the header of a WAV file."""

    def read_file_properties(self, filename):
        """Return ``(num_channels, sample_rate, bit_depth)`` for a WAV file.

        The header is parsed at fixed byte offsets: the first 12 bytes are
        skipped and the next 36 bytes are interpreted as the format chunk.
        This assumes the format chunk immediately follows the 12-byte
        RIFF/WAVE descriptor; files with extra chunks before it would be
        misread.

        Parameters
        ----------
        filename : str or path-like
            Path of the WAV file to inspect.

        Returns
        -------
        tuple of int
            ``(num_channels, sample_rate, bit_depth)``.

        Raises
        ------
        OSError
            If the file cannot be opened.
        struct.error
            If the file is too short to contain the expected header bytes.
        """
        # The context manager closes the handle even if reading fails.
        with open(filename, "rb") as wave_file:
            wave_file.read(12)  # skip the leading descriptor bytes
            fmt = wave_file.read(36)

        # All fields are little-endian: 2-byte channel count, 4-byte sample
        # rate and 2-byte bits-per-sample at fixed offsets inside the chunk.
        num_channels = struct.unpack('<H', fmt[10:12])[0]
        sample_rate = struct.unpack('<I', fmt[12:16])[0]
        bit_depth = struct.unpack('<H', fmt[22:24])[0]

        return (num_channels, sample_rate, bit_depth)

In [13]:
# Load various imports 
import pandas as pd
import os
import librosa
import librosa.display

from wavfilehelper import WavFileHelper
# Collect (channels, sample rate, bit depth) for every file listed in the metadata
wavfilehelper = WavFileHelper()
audio_dir = os.path.abspath('audios/')
audiodata = [
    wavfilehelper.read_file_properties(os.path.join(audio_dir, str(sound)))
    for sound in metadata["Sonido"]
]

# Assemble the per-file properties into a pandas DataFrame
audiodf = pd.DataFrame(audiodata, columns=['num_channels','sample_rate','bit_depth'])

In [14]:
# Relative frequency of each channel count across the dataset
print(audiodf["num_channels"].value_counts(normalize=True))

2    0.97037
1    0.02963
Name: num_channels, dtype: float64


In [15]:
# Relative frequency of each sample rate across the dataset
print(audiodf["sample_rate"].value_counts(normalize=True))

48000    0.998148
44100    0.001852
Name: sample_rate, dtype: float64


In [16]:
# Relative frequency of each bit depth across the dataset
print(audiodf["bit_depth"].value_counts(normalize=True))

16    1.0
Name: bit_depth, dtype: float64


### 2 Data Preprocessing

In [22]:
import librosa 
from scipy.io import wavfile as wav
import numpy as np

# Compare the sample rate scipy reads from the file with the rate of the
# signal returned by librosa.load for the same file
filename = 'audios/38.wav'

scipy_sample_rate, scipy_audio = wav.read(filename)
librosa_audio, librosa_sample_rate = librosa.load(filename, mono=True)

for label, rate in (
    ('Original sample rate:', scipy_sample_rate),
    ('Librosa sample rate:', librosa_sample_rate),
):
    print(label, rate)

Original sample rate: 48000
Librosa sample rate: 22050


In [23]:
def extract_features(file_name):
    """Summarise an audio file as a fixed-length vector of mean MFCCs.

    The file is loaded with ``librosa.load`` (``kaiser_fast`` resampling),
    40 MFCCs are computed, and each coefficient is averaged over the second
    axis of the MFCC matrix (the frame axis, per librosa's documented
    ``(n_mfcc, t)`` layout).

    Parameters
    ----------
    file_name : str
        Path of the audio file to process.

    Returns
    -------
    numpy.ndarray or None
        1-D array holding the 40 averaged coefficients, or ``None`` when the
        file could not be loaded or processed.
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # Transposing puts coefficients on the last axis, so the mean over
        # axis 0 yields one value per coefficient.
        mfccsscaled = np.mean(mfccs.T, axis=0)

    except Exception as e:
        # Best effort: report the offending path and the reason, then signal
        # the failure to the caller with None instead of aborting the run.
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None

    return mfccsscaled

In [24]:
# Load various imports 
import pandas as pd
import os
import librosa

# Load the dataset index that maps each audio file to its rhythm class
metadata = pd.read_csv('audios/dataset.csv',delimiter=';',encoding = 'unicode_escape')

audio_dir = os.path.abspath('audios/')
features = []
skipped_files = []

# Extract the feature vector of every sound file listed in the metadata
for index, row in metadata.iterrows():
    file_name = os.path.join(audio_dir, str(row["Sonido"]))
    class_label = row["Clase"]
    data = extract_features(file_name)

    # extract_features signals failure with None; keeping such rows would
    # leave holes in the feature column when it is stacked into an array.
    if data is None:
        skipped_files.append(file_name)
        continue

    features.append([data, class_label])

# Convert into a pandas DataFrame
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(featuresdf), ' files')
if skipped_files:
    print('Skipped ', len(skipped_files), ' files that could not be parsed')

Finished feature extraction from  540  files


In [26]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Stack the per-file feature vectors and class names into numpy arrays
X = np.array(featuresdf["feature"].tolist())
y = np.array(featuresdf["class_label"].tolist())

# Map class names to integer ids, then one-hot encode those ids
le = LabelEncoder()
le.fit(y)
yy = to_categorical(le.transform(y))

In [27]:
# split the dataset 
from sklearn.model_selection import train_test_split 

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    yy,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Persist the train/test splits, the one-hot label matrix and the fitted
# label encoder with IPython's %store magic so that the next notebook can
# restore them (e.g. with `%store -r`).

%store x_train 
%store x_test 
%store y_train 
%store y_test 
%store yy 
%store le

Stored 'x_train' (ndarray)
Stored 'x_test' (ndarray)
Stored 'y_train' (ndarray)
Stored 'y_test' (ndarray)
Stored 'yy' (ndarray)
Stored 'le' (LabelEncoder)


### 3 Model Training and Evaluation

In [47]:
import numpy as np
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, Activation, Flatten
#from keras.models import Sequential
#from keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.utils import np_utils
from sklearn import metrics 





# One output unit per class: the width of the one-hot label matrix
num_labels = yy.shape[1]
filter_size = 2  # not referenced by the cells visible in this notebook

# Fully connected classifier: two 256-unit ReLU hidden layers, each followed
# by 50% dropout, over a 40-value input vector, ending in a softmax layer
model = Sequential([
    Dense(256, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),

    Dense(256),
    Activation('relu'),
    Dropout(0.5),

    Dense(num_labels),
    Activation('softmax'),
])

In [48]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [49]:
# Print the layer-by-layer architecture summary
model.summary()

# Baseline: accuracy of the still-untrained network on the test split
score = model.evaluate(x_test, y_test, verbose=0)
accuracy = score[1] * 100

print("Pre-training accuracy: %.4f%%" % (accuracy,))


Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 256)               10496     
_________________________________________________________________
activation_12 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 256)               65792     
_________________________________________________________________
activation_13 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 9)                

In [50]:
from tensorflow.python.keras.callbacks import ModelCheckpoint 
from datetime import datetime 
# Training hyper-parameters
num_epochs = 100
num_batch_size = 32

# Write the model to disk only when the monitored validation loss improves
checkpointer = ModelCheckpoint(
    filepath='saved_models/weights.best.basic_mlp.hdf5',
    verbose=1,
    save_best_only=True,
)

# NOTE(review): the test split doubles as validation data here, so checkpoint
# selection sees the test set -- consider a separate validation split.
start = datetime.now()
model.fit(
    x_train,
    y_train,
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_data=(x_test, y_test),
    callbacks=[checkpointer],
    verbose=1,
)
duration = datetime.now() - start

print("Training completed in time: ", duration)

Epoch 1/100
 1/14 [=>............................] - ETA: 0s - loss: 36.0215 - accuracy: 0.1562
Epoch 00001: val_loss improved from inf to 8.13176, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 2/100
Epoch 00002: val_loss improved from 8.13176 to 4.19648, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 3/100
 1/14 [=>............................] - ETA: 0s - loss: 13.4110 - accuracy: 0.1875
Epoch 00003: val_loss improved from 4.19648 to 2.12964, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 4/100
Epoch 00004: val_loss improved from 2.12964 to 1.75211, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 5/100
 1/14 [=>............................] - ETA: 0s - loss: 6.1006 - accuracy: 0.2500
Epoch 00005: val_loss improved from 1.75211 to 1.68142, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 6/100
 1/14 [=>............................] - ETA: 0s - loss: 4.6491 - accuracy: 0.1562
Epoch 00006: val_loss did not imp

 1/14 [=>............................] - ETA: 0s - loss: 0.7505 - accuracy: 0.6875
Epoch 00027: val_loss improved from 0.95766 to 0.90075, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 28/100
 1/14 [=>............................] - ETA: 0s - loss: 1.5803 - accuracy: 0.4375
Epoch 00028: val_loss improved from 0.90075 to 0.86008, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 29/100
 1/14 [=>............................] - ETA: 0s - loss: 1.0540 - accuracy: 0.6250
Epoch 00029: val_loss improved from 0.86008 to 0.79420, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 30/100
 1/14 [=>............................] - ETA: 0s - loss: 0.9302 - accuracy: 0.6875
Epoch 00030: val_loss improved from 0.79420 to 0.68122, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 31/100
Epoch 00031: val_loss improved from 0.68122 to 0.63575, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 32/100
 1/14 [=>............................]

Epoch 52/100
 1/14 [=>............................] - ETA: 0s - loss: 0.3021 - accuracy: 0.8750
Epoch 00052: val_loss did not improve from 0.15371
Epoch 53/100
 1/14 [=>............................] - ETA: 0s - loss: 0.3903 - accuracy: 0.8750
Epoch 00053: val_loss did not improve from 0.15371
Epoch 54/100
 1/14 [=>............................] - ETA: 0s - loss: 0.3308 - accuracy: 0.8750
Epoch 00054: val_loss improved from 0.15371 to 0.13593, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 55/100
 1/14 [=>............................] - ETA: 0s - loss: 0.2657 - accuracy: 0.9062
Epoch 00055: val_loss improved from 0.13593 to 0.13295, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 56/100
 1/14 [=>............................] - ETA: 0s - loss: 0.5140 - accuracy: 0.8438
Epoch 00056: val_loss improved from 0.13295 to 0.12021, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 57/100
 1/14 [=>............................] - ETA: 0s - loss: 0.3844 - 

 1/14 [=>............................] - ETA: 0s - loss: 0.0880 - accuracy: 1.0000
Epoch 00079: val_loss did not improve from 0.05762
Epoch 80/100
 1/14 [=>............................] - ETA: 0s - loss: 0.4052 - accuracy: 0.9062
Epoch 00080: val_loss improved from 0.05762 to 0.05651, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 81/100
 1/14 [=>............................] - ETA: 0s - loss: 0.3241 - accuracy: 0.9062
Epoch 00081: val_loss improved from 0.05651 to 0.05206, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 82/100
 1/14 [=>............................] - ETA: 0s - loss: 0.1363 - accuracy: 0.9688
Epoch 00082: val_loss improved from 0.05206 to 0.05072, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 83/100
Epoch 00083: val_loss improved from 0.05072 to 0.04758, saving model to saved_models\weights.best.basic_mlp.hdf5
Epoch 84/100
 1/14 [=>............................] - ETA: 0s - loss: 0.1330 - accuracy: 0.9375
Epoch 00084: val_

In [51]:
# Report accuracy on both splits; `score` ends up holding the test-set result
for label, inputs, targets in (
    ("Training Accuracy: ", x_train, y_train),
    ("Testing Accuracy: ", x_test, y_test),
):
    score = model.evaluate(inputs, targets, verbose=0)
    print(label, score[1])

Training Accuracy:  1.0
Testing Accuracy:  0.9907407164573669


In [52]:
import librosa 
import numpy as np 

def extract_feature(file_name):
    """Build the single-sample model input for one audio file.

    The file is loaded with ``librosa.load`` (``kaiser_fast`` resampling),
    40 MFCCs are computed and each coefficient is averaged over the second
    axis of the MFCC matrix. The resulting vector is wrapped in an outer
    array so it forms a batch containing one sample.

    Parameters
    ----------
    file_name : str
        Path of the audio file to process.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(1, 40)``, or ``None`` when the file could not be
        loaded or processed.
    """
    try:
        audio_data, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)
        mfccsscaled = np.mean(mfccs.T, axis=0)

    except Exception as e:
        # Report the offending path and the reason; a single None keeps the
        # failure value easy to test for, unlike a tuple that is always truthy.
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None

    # Add the leading batch axis expected for a one-sample prediction.
    return np.array([mfccsscaled])

In [53]:
def print_prediction(file_name):
    """Classify one audio file and print the result.

    Prints the predicted class name followed by the probability the model
    assigns to every class known to the label encoder. Relies on the
    notebook-level ``model``, ``le`` and ``extract_feature``.

    Parameters
    ----------
    file_name : str
        Path of the audio file to classify.

    Returns
    -------
    None
        Output is printed; nothing is returned.
    """
    prediction_feature = extract_feature(file_name)
    if prediction_feature is None:
        # Feature extraction failed, so there is nothing to feed the model.
        print("Could not extract features from:", file_name)
        return

    # One forward pass gives the per-class probabilities. The class index is
    # their argmax, which replaces the deprecated predict_classes and
    # predict_proba helpers.
    predicted_proba = model.predict(prediction_feature)[0]
    predicted_index = np.argmax(predicted_proba, axis=-1)

    predicted_class = le.inverse_transform(np.array([predicted_index]))
    print("The predicted class is:", predicted_class[0], '\n')

    for i, probability in enumerate(predicted_proba):
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(probability, '.32f'))

In [54]:
# Class: Pasillo
# Classify a sample whose expected class (per its file name) is Pasillo and
# print the predicted class together with the per-class probabilities.
filename = 'audio_modelo/Pasillo.wav'
print_prediction(filename)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
The predicted class is: Pasillo 

Instructions for updating:
Please use `model.predict()` instead.
Albazo 		 :  0.00618882663547992706298828125000
Alza 		 :  0.00551735376939177513122558593750
Capishca 		 :  0.01099091675132513046264648437500
Danzante 		 :  0.01667597144842147827148437500000
Pasacalle 		 :  0.01843697577714920043945312500000
Pasillo 		 :  0.89201587438583374023437500000000
SanJuanito 		 :  0.00701656518504023551940917968750
Tonada 		 :  0.00294312182813882827758789062500
Yumbo 		 :  0.04021449014544486999511718750000
