In [8]:
# the dataset link
# https://www.kaggle.com/hosseinmousavi/pcmir-database

In [9]:
import numpy as np
import pandas as pd
import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
%matplotlib inline
# to have a clean notebook
import warnings
warnings.filterwarnings('ignore')

In [10]:
# librosa decodes an audio file and returns its samples as a waveform array
test_filename='../input/pcmir-database/Persian Classical Music Instrument Recognition (PCMIR) Database/Ney/Ney (10).mp3'
plt.figure(figsize=(14,5))
# the sample rate is the number of audio samples per second; librosa.load resamples to 22050 Hz by default
data, sample_rate = librosa.load(test_filename)
# newer librosa releases dropped `waveplot` in favour of `waveshow`;
# use whichever one this installation provides so the cell runs on both
plot_waveform = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
plot_waveform(data, sr=sample_rate)
# all files are about 5 s long
ipd.Audio(test_filename)

In [11]:
# show the decoded samples together with the sample rate returned by librosa.load
(data, sample_rate)

In [12]:
# we can also read audio files with scipy
# from scipy.io import wavfile as wav
# wave_sample_rate, wave_audio=wav.read(test_filename)
# but the values will not be normalized to [-1, 1] and the audio may have 2 channels; librosa handles all of this for us, so we will use it

<h3>The raw waveform of the audio does not give us much information for classification, so we need to apply a series of transformations to it before it is ready for the model</h3>

<h1>1- fast fourier transform</h1>

In [13]:
# discrete Fourier transform of the whole clip
fft = np.fft.fft(data)

# the absolute value of each complex bin is its magnitude
spectrum = np.abs(fft)

# frequency axis: bin k sits at k * sample_rate / N, so the axis has to stop one
# bin short of sample_rate (endpoint=False); including the endpoint would stretch
# every frequency by a factor N / (N - 1)
f = np.linspace(0, sample_rate, len(spectrum), endpoint=False)

# the FFT of a real signal is mirrored around the middle, so only the first half is kept
half = len(spectrum) // 2
left_spectrum = spectrum[:half]
left_f = f[:half]

# plot spectrum (the plotted quantity is |FFT|, i.e. magnitude, not power)
plt.figure(figsize=(14,5))
plt.plot(left_f, left_spectrum, alpha=0.4)
plt.xlabel("Frequency")
plt.ylabel("Magnitude")
plt.title("Magnitude spectrum")

<h3>Here we moved from amplitude as a function of time to amplitude as a function of frequency, which is more informative, but the result is static because we lost the time information. To recover it we apply the STFT (many FFTs over short time windows), so that all the needed information ends up in one spectrogram</h3>

In [14]:
# STFT settings
n_fft = 2048       # number of samples analysed by each short-time FFT
hop_length = 512   # number of samples the window advances between two FFTs

# short-time Fourier transform of the clip
stft = librosa.stft(data, hop_length=hop_length, n_fft=n_fft)

# keep only the magnitude of the complex STFT values
spectrogram = np.abs(stft)

# draw the magnitude spectrogram
plt.figure(figsize=(14,5))
librosa.display.specshow(spectrogram, hop_length=hop_length, sr=sample_rate)
plt.ylabel("Frequency")
plt.xlabel("Time")
plt.colorbar()
plt.title("Spectrogram")

<h3>Here we have a spectrogram which represents magnitude and frequency over time, but it is not very readable, so we apply a logarithm to convert the amplitudes to decibels</h3>

In [15]:
# convert the magnitudes to decibels (log scale)
log_spectrogram = librosa.amplitude_to_db(spectrogram)

plt.figure(figsize=(14,5))
librosa.display.specshow(log_spectrogram, hop_length=hop_length, sr=sample_rate)
plt.ylabel("Frequency")
plt.xlabel("Time")
plt.colorbar()
plt.title("Spectrogram")
# the energy of the notes (red) sits in the low frequencies for the whole 5 s

<h3>Nothing is perfect: the problem with this spectrogram is that the same notes played on different instruments (e.g. guitar and violin) give pretty similar spectrograms, which is a problem for classification. The solution is to use MFCCs</h3>

In [16]:
# pass the signal and sample rate by keyword: recent librosa releases reject them
# as positional arguments, while the keyword form works on old and new versions
MFCCs = librosa.feature.mfcc(y=data, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mfcc=13)
# generally n_mfcc is between 13 and 40
# display MFCCs
plt.figure(figsize=(14,5))
librosa.display.specshow(MFCCs, sr=sample_rate, hop_length=hop_length)
plt.xlabel("Time")
plt.ylabel("MFCC coefficients")
plt.colorbar()
plt.title("MFCCs")

<h1>loading the data</h1>

In [17]:
import os
from tqdm import tqdm
def read_data(dataset_path, n_mfcc=13, n_fft=2048, hop_length=512):
    """Load every audio file under ``dataset_path`` and extract its MFCCs.

    Each sub-directory of ``dataset_path`` is treated as one class and every
    file inside it as one sample of that class. Plain files at the top level
    (such as ``ReadMe.txt``) are skipped.

    Parameters
    ----------
    dataset_path : str
        Root folder holding one sub-folder per class.
    n_mfcc : int, default 13
        Number of MFCC coefficients passed to ``librosa.feature.mfcc``.
    n_fft : int, default 2048
        FFT window size passed to ``librosa.feature.mfcc``.
    hop_length : int, default 512
        Hop length passed to ``librosa.feature.mfcc``.

    Returns
    -------
    x : list of np.ndarray
        Transposed MFCC matrix of each file (one row per frame).
    x_mean : list of np.ndarray
        Mean over the frames of each file's MFCCs (one value per coefficient).
    y : np.ndarray
        One-hot encoded class labels, one row per file.
    """
    x, x_mean, y = [], [], []

    # Only directories are classes; this skips ReadMe.txt (and any other stray
    # file) without failing when it is absent. Sorting makes the sample order,
    # and therefore any seeded train/test split, independent of the order in
    # which the filesystem lists entries.
    classes = sorted(
        entry for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )

    for classe in classes:
        print(classe)
        class_dir = os.path.join(dataset_path, classe)
        for file in tqdm(sorted(os.listdir(class_dir))):
            file_path = os.path.join(class_dir, file)
            signal, sample_rate = librosa.load(file_path)
            # keyword arguments: recent librosa versions do not accept the
            # signal and sample rate positionally
            MFCCs = librosa.feature.mfcc(
                y=signal, sr=sample_rate,
                n_fft=n_fft, hop_length=hop_length, n_mfcc=n_mfcc,
            )
            # both representations are returned so they can be compared:
            # the transposed matrix and its mean over the frames
            mfcc = MFCCs.T
            x.append(mfcc)
            x_mean.append(np.mean(mfcc, axis=0))
            y.append(classe)

    # pin the dtype so the one-hot matrix is numeric regardless of the
    # pandas default, which differs between pandas releases
    y = np.array(pd.get_dummies(y, dtype=np.uint8))
    return x, x_mean, y

In [18]:
# root folder of the PCMIR dataset; read_data treats each entry in it as a class
dataset_path = "../input/pcmir-database/Persian Classical Music Instrument Recognition (PCMIR) Database"
x, x_mean, y = read_data(dataset_path=dataset_path)

In [19]:
# some clips are slightly longer than 5 s, so their MFCC matrices have a few extra frames;
# trim every clip to the shortest one so all inputs share the same shape.
# The minimum is taken from the data itself instead of starting from a hard-coded frame count.
min_length = min((len(mfcc) for mfcc in x), default=0)
x = [mfcc[:min_length] for mfcc in x]

In [20]:
# print the frame count of any clip that does not have exactly 219 frames
for frames in x:
    n_frames = len(frames)
    if n_frames != 219:
        print(n_frames)

<h1>ANN</h1>

In [21]:
# report the dimensions of both feature sets and of the one-hot label matrix
x_dims = (len(x), len(x[0]), len(x[0][0]))
x_mean_dims = (len(x_mean), len(x_mean[0]))
print(f'the shape of x is {x_dims}')
print(f'the shape of x_mean is {x_mean_dims}')
print(f'the shape of y is {y.shape}')

In [22]:
from sklearn.model_selection import train_test_split
# hold out 33% of the clips for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    np.array(x),
    y,
    test_size=0.33,
    random_state=42,
)

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
import tensorflow.keras as keras

In [24]:
# fully connected network on the flattened MFCC matrix;
# L2 weight penalties and dropout are there to limit overfitting
model = Sequential()
model.add(Flatten(input_shape=(X_train.shape[1], X_train.shape[2])))
for units in (256, 128, 64):
    model.add(Dense(units, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)))
    model.add(Dropout(0.3))
# one softmax output per class
model.add(Dense(y.shape[1], activation="softmax"))

In [25]:
# multi-class setup: one-hot labels with categorical cross-entropy
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [26]:
# train on the full MFCC matrices, evaluating on the held-out split after every epoch
history = model.fit(
    X_train,
    y_train,
    epochs=300,
    batch_size=256,
    validation_data=(X_test, y_test),
)

In [27]:
def plot_history(history):
    """Draw accuracy and loss curves for training and validation.

    ``history`` must expose a ``history`` dict with the keys ``accuracy``,
    ``val_accuracy``, ``loss`` and ``val_loss`` (as the object returned by
    ``model.fit`` does here). Two stacked panels are drawn and shown.
    """
    curves = history.history
    fig, (acc_ax, err_ax) = plt.subplots(2)

    # upper panel: accuracy per epoch
    acc_ax.plot(curves["accuracy"], label="train accuracy")
    acc_ax.plot(curves["val_accuracy"], label="test accuracy")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.legend(loc="lower right")
    acc_ax.set_title("Accuracy eval")

    # lower panel: loss per epoch
    err_ax.plot(curves["loss"], label="train error")
    err_ax.plot(curves["val_loss"], label="test error")
    err_ax.set_ylabel("Error")
    err_ax.set_xlabel("Epoch")
    err_ax.legend(loc="upper right")
    err_ax.set_title("Error eval")

    plt.show()
plot_history(history)

In [28]:
# same 67/33 split and seed as before, this time on the time-averaged MFCC vectors
X_train_mean, X_test_mean, y_train_mean, y_test_mean = train_test_split(
    np.array(x_mean),
    y,
    test_size=0.33,
    random_state=42,
)

In [29]:
# same dense architecture as the first ANN, fed with the averaged MFCC vector of each clip
model2 = Sequential()
# take the input width from the data instead of hard-coding 13, so the model
# keeps working if the number of MFCC coefficients is changed
model2.add(Dense(256, activation='relu', input_shape=(X_train_mean.shape[1],), kernel_regularizer=keras.regularizers.l2(0.001)))
model2.add(Dropout(0.3))
model2.add(Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)))
model2.add(Dropout(0.3))
model2.add(Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)))
model2.add(Dropout(0.3))
# one softmax output per class
model2.add(Dense(y.shape[1], activation="softmax"))
model2.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')
history2 = model2.fit(X_train_mean, y_train_mean, validation_data=(X_test_mean, y_test_mean), batch_size=256, epochs=300)

In [30]:
# the ANN trained on the averaged MFCCs worked much better
plot_history(history=history2)

<h1>Let's use a CNN</h1>

In [31]:
from tensorflow.keras.layers import Conv2D,MaxPooling2D,BatchNormalization

In [32]:
# Conv2D expects a 4-D array, so append a trailing axis of size 1 to each split
X_train_cnn = np.expand_dims(X_train, axis=-1)
X_test_cnn = np.expand_dims(X_test, axis=-1)
(X_train_cnn.shape, X_test_cnn.shape)

In [33]:
# batch normalization helps the model converge much faster
cnn = Sequential()

# 1st convolution layer: the only one that has to declare the input shape
cnn.add(Conv2D(32, (3,3), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2], 1)))
cnn.add(MaxPooling2D((3,3), strides=(2,2), padding='same'))
cnn.add(BatchNormalization())
cnn.add(Dropout(0.3))
# 2nd convolution layer (input shape is inferred from the previous layer)
cnn.add(Conv2D(32, (3,3), activation='relu'))
cnn.add(MaxPooling2D((3,3), strides=(2,2), padding='same'))
cnn.add(BatchNormalization())
cnn.add(Dropout(0.3))
# 3rd convolution layer (input shape is inferred from the previous layer)
cnn.add(Conv2D(32, (2,2), activation='relu'))
cnn.add(MaxPooling2D((2,2), strides=(2,2), padding='same'))
cnn.add(BatchNormalization())
cnn.add(Dropout(0.3))
# dense head with L2 penalty and dropout
cnn.add(Flatten())
cnn.add(Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)))
cnn.add(Dropout(0.3))

# output: one softmax unit per class
cnn.add(Dense(y_train.shape[1], activation='softmax'))


In [34]:
# same loss/optimizer as the dense models; trained on the 4-D inputs for 200 epochs
cnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
cnn_history = cnn.fit(
    X_train_cnn,
    y_train,
    epochs=200,
    batch_size=256,
    validation_data=(X_test_cnn, y_test),
)

In [35]:
# the CNN also does very well and converges faster than the ANN trained on mfcc_mean
plot_history(history=cnn_history)

<h1>Let's try an LSTM</h1>

In [42]:
from tensorflow.keras.layers import LSTM,Bidirectional
lstm = Sequential()

# stack of 2 bidirectional LSTMs.
# The input shape is declared on the Bidirectional wrapper, which is the layer
# actually added to the Sequential model, rather than on the wrapped LSTM.
lstm.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(X_train.shape[1], X_train.shape[2])))
lstm.add(Bidirectional(LSTM(64)))
lstm.add(Dropout(0.3))

# dense layer with kernel regularizer and dropout to limit overfitting
lstm.add(Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)))
lstm.add(Dropout(0.3))

# output: one softmax unit per class
lstm.add(Dense(y_train.shape[1], activation='softmax'))

In [43]:
# the recurrent model consumes the 3-D MFCC arrays directly
lstm.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
lstm_history = lstm.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=256,
    validation_data=(X_test, y_test),
)

In [44]:
# accuracy and loss curves of the bidirectional LSTM
plot_history(history=lstm_history)

<h2>Conclusion: I found that the CNN gives the best accuracy and converges faster. When using MFCCs with a plain ANN, it is better to feed it the mean of every MFCC row (one averaged value per coefficient)</h2>