In [1]:
import pandas as pd
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, BatchNormalization, Bidirectional, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import numpy as np
import scipy

In [2]:
df = pd.read_csv('./ESC-50-master/meta/esc50.csv')

category_to_class = {
    'dog': 'hayvan',
    'chirping_birds': 'hayvan',
    'vacuum_cleaner': 'arkaplan', 
    'thunderstorm': 'arkaplan',
    'door_wood_knock': 'arkaplan',
    'can_opening': 'arkaplan',
    'crow': 'arkaplan',
    'clapping': 'insan',
    'fireworks': 'arkaplan',
    'chainsaw': 'arac',
    'airplane': 'arac',
    'mouse_click': 'arkaplan',
    'pouring_water': 'arkaplan',
    'train': 'arac',
    'sheep': 'hayvan',
    'water_drops': 'arkaplan',
    'church_bells': 'arkaplan',
    'clock_alarm': 'arkaplan',
    'keyboard_typing': 'arkaplan',
    'wind': 'arkaplan',
    'footsteps': 'insan',
    'frog': 'arkaplan',
    'cow': 'hayvan',
    'brushing_teeth': 'arkaplan',
    'car_horn': 'arac',
    'crackling_fire': 'arkaplan',
    'helicopter': 'arac',
    'drinking_sipping': 'arkaplan',
    'rain': 'arkaplan',
    'insects': 'arkaplan',
    'laughing': 'insan',
    'hen': 'hayvan',
    'engine': 'arac',
    'breathing': 'insan',
    'crying_baby': 'insan', 
    'hand_saw': 'arac',
    'coughing': 'insan',
    'glass_breaking': 'arkaplan',
    'snoring' : 'insan',
    'toilet_flush': 'arkaplan',
    'pig': 'hayvan',
    'washing_machine': 'arac',
    'clock_tick': 'arkaplan',
    'sneezing' : 'insan',
    'rooster': 'hayvan',
    'sea_waves': 'arkaplan',
    'siren': 'arac',
    'cat': 'hayvan',
    'door_wood_creaks': 'arkaplan',
    'crickets': 'arkaplan',
}

df['class'] = df['category'].map(category_to_class)

In [3]:
df

Unnamed: 0,filename,fold,target,category,esc10,src_file,take,class
0,1-100032-A-0.wav,1,0,dog,True,100032,A,hayvan
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A,hayvan
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A,arkaplan
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B,arkaplan
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A,arkaplan
...,...,...,...,...,...,...,...,...
1995,5-263831-B-6.wav,5,6,hen,False,263831,B,hayvan
1996,5-263902-A-36.wav,5,36,vacuum_cleaner,False,263902,A,arkaplan
1997,5-51149-A-25.wav,5,25,footsteps,False,51149,A,insan
1998,5-61635-A-8.wav,5,8,sheep,False,61635,A,hayvan


In [4]:
x_data = []

for _, row in df.iterrows():

    file_name = f"./ESC-50-master/audio/{row['filename']}"
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
    audio_16k = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
    
    x_data.append(audio_16k.copy())
    
x_data = np.array(x_data)

In [5]:
df['class_categorical'] = df['class'].map({'insan':0, "hayvan":1, "arac":2, "arkaplan":3})
y_cat = tf.keras.utils.to_categorical(df['class_categorical'])
y_cat

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)

In [6]:
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from sklearn.preprocessing import * 

In [7]:
def normalize(img):
    '''
    Normalizes an array 
    (subtract mean and divide by standard deviation)
    '''
    eps = 0.001
    if np.std(img) != 0:
        img = (img - np.mean(img)) / np.std(img)
    else:
        img = (img - np.mean(img)) / eps
    return img


In [9]:
NUM_FEATURES = 64
from scipy.fftpack import dct

x_train_mfcc_64 = librosa.feature.mfcc(y=x_data, sr=16000, n_mfcc=64)
x_train_mfcc_48 = librosa.feature.mfcc(y=x_data, sr=16000, n_mfcc=49)
x_train_mfcc_128 = librosa.feature.mfcc(y=x_data, sr=16000, n_mfcc=128)
x_train_mfcc_32 = librosa.feature.mfcc(y=x_data, sr=16000, n_mfcc=32)

x_train_stft_2048 = librosa.feature.chroma_stft(y = x_data , sr= 16000)
x_train_stft_512 = librosa.feature.chroma_stft(y = x_data , sr= 16000, n_fft=512)
x_train_stft_256 = librosa.feature.chroma_stft(y = x_data , sr= 16000, n_fft=256)
x_train_stft_64 = librosa.feature.chroma_stft(y = x_data , sr= 16000, n_fft=64)
x_train_stft_1024 = librosa.feature.chroma_stft(y = x_data , sr= 16000, n_fft=1024)

x_train_melspectrogram_512 = librosa.feature.melspectrogram(y=x_data, sr=16000, n_fft = 512)
x_train_melspectrogram_1024 = librosa.feature.melspectrogram(y=x_data, sr=16000, n_fft = 1024)
x_train_melspectrogram_2048 = librosa.feature.melspectrogram(y=x_data, sr=16000, n_fft = 2048)
x_train_melspectrogram_256 = librosa.feature.melspectrogram(y=x_data, sr=16000, n_fft = 256)
x_train_melspectrogram_64 = librosa.feature.melspectrogram(y=x_data, sr=16000, n_fft = 64)
x_train_melspectrogram_32 = librosa.feature.melspectrogram(y=x_data, sr=16000, n_fft = 32)

x_train_dct = dct(x_data)



############
x_train_mfcc_64_ = normalize(x_train_mfcc_64)
x_train_mfcc_48_ = normalize(x_train_mfcc_48)
x_train_mfcc_128_ = normalize(x_train_mfcc_128)
x_train_mfcc_32_ = normalize(x_train_mfcc_32)

x_train_stft_2048_ = normalize(x_train_stft_2048)
x_train_stft_512_ = normalize(x_train_stft_512)
x_train_stft_1024_ = normalize(x_train_stft_1024)
x_train_stft_256_ = normalize(x_train_stft_256)
x_train_stft_64_ = normalize(x_train_stft_64)

x_train_melspectrogram_512_ = normalize(x_train_melspectrogram_512)
x_train_melspectrogram_1024_ = normalize(x_train_melspectrogram_1024)
x_train_melspectrogram_2048_ = normalize(x_train_melspectrogram_2048)
x_train_melspectrogram_256_ = normalize(x_train_melspectrogram_256)
x_train_melspectrogram_64_ = normalize(x_train_melspectrogram_64)
x_train_melspectrogram_32_ = normalize(x_train_melspectrogram_32)
                                        
x_train_dct_ = normalize(x_train_dct)
x_train_dct = x_train_dct.resize((x_train_dct.shape[0],1,x_train_dct.shape[1]))
x_train_dct_ = x_train_dct_.resize((x_train_dct_.shape[0],1,x_train_dct_.shape[1]))

In [24]:
n

15

In [25]:
from sklearn.metrics import *

feat_type = 'mfcc'
NUM_FEATURES = 64

names = [
    'MFCC-64',
    'MFCC-48',
    'MFCC-128',
    'MFCC-32',
    'STFT-2048',
    'STFT-512',
    'STFT-256',
    'STFT-64',
    'STFT-1024',
    'Mel-Spectrogram-512',
    'Mel-Spectrogram-1024',
    'Mel-Spectrogram-2048',
    'Mel-Spectrogram-256',
    'Mel-Spectrogram-64',
    'Mel-Spectrogram-32',
    'Discrete Cosine Transform',
    'MFCC-64-normalized',
    'MFCC-48-normalized',
    'MFCC-128-normalized',
    'MFCC-32-normalized',
    'STFT-2048-normalized',
    'STFT-512-normalized',
    'STFT-256-normalized',
    'STFT-64-normalized',
    'STFT-1024-normalized',
    'Mel-Spectrogram-512-normalized',
    'Mel-Spectrogram-1024-normalized',
    'Mel-Spectrogram-2048-normalized',
    'Mel-Spectrogram-256-normalized',
    'Mel-Spectrogram-64-normalized',
    'Mel-Spectrogram-32-normalized',   
    'Discrete Cosine Transform-normalized',
]

report = []

for n, dat_o in enumerate([
        x_train_mfcc_64,
        x_train_mfcc_48,
        x_train_mfcc_128,
        x_train_mfcc_32,    
        x_train_stft_2048,
        x_train_stft_512,
        x_train_stft_256,
        x_train_stft_64,
        x_train_stft_1024,
        x_train_melspectrogram_512,
        x_train_melspectrogram_1024,
        x_train_melspectrogram_2048,
        x_train_melspectrogram_256,
        x_train_melspectrogram_64,
        x_train_melspectrogram_32,
        x_train_dct,
        x_train_mfcc_64_,
        x_train_mfcc_48_,
        x_train_mfcc_128_,
        x_train_mfcc_32_,
        x_train_stft_2048_,
        x_train_stft_512_,
        x_train_stft_256_,
        x_train_stft_64_,
        x_train_stft_1024_,
        x_train_melspectrogram_512_,
        x_train_melspectrogram_1024_,
        x_train_melspectrogram_2048_,
        x_train_melspectrogram_256_,
        x_train_melspectrogram_64_,
        x_train_melspectrogram_32_,
        x_train_dct_,
           ]):
    
    for n_lstm in [5,30]:
     for d1 in [32]:
      for d2 in [32]:
       for bs in [32]:
        for rs in [True, False]:
        
            try:
                if rs:
                    dat = dat_o.reshape((dat_o.shape[0], dat_o.shape[2], dat_o.shape[1]))
                else:
                    dat = dat_o.copy()
            except:
                continue
                
            ind_test = df[df['fold']==5].index.tolist()
            ind_val = df[df['fold']==4].index.tolist()
            ind_tr = df[df['fold'].isin([1,2,3])].index.tolist()

            
            x_val = dat[ind_val]
            x_train = dat[ind_tr]
            x_test = dat[ind_test]

            y_train = y_cat[ind_tr]
            y_val = y_cat[ind_val]
            y_test = y_cat[ind_test]


            yoff = np.zeros(len(x_test))
            
            inp = Input(shape=(x_val.shape[1], x_val.shape[2]))
            x1 = LSTM(n_lstm, return_sequences = True)(inp)
            x1 = LSTM(n_lstm)(x1)
            x1 = Dense(d1,activation='relu')(x1)
            x1 = Dense(d2,activation='relu')(x1)
            out = Dense(y_cat.shape[1],activation='softmax')(x1)
            model = Model(inputs=inp, outputs=out)

            es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose = 0)
            optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
            model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
            history = model.fit(x_train, y_train, batch_size=bs, epochs=500, validation_data=(x_val, y_val), verbose = 0, callbacks = [es])

            pred_train = model.predict(x_train)
            pred_val = model.predict(x_val)
            pred_test = model.predict(x_test)

            curacc_train = accuracy_score(np.argmax(y_train, axis=1), np.argmax(pred_train, axis=1))
            curacc_val = accuracy_score(np.argmax(y_val, axis=1), np.argmax(pred_val, axis=1))
            curacc_test = accuracy_score(np.argmax(y_test, axis=1), np.argmax(pred_test, axis=1))

            yoff = np.argmax(pred_val, axis=1)
            #print([curacc_train, curacc_val])

            name = names[n]
            
            r = {}
            r['name'] = name
            r['accuracy_test'] = curacc_test
            r['accuracy_val'] = curacc_val
            r['accuracy_train'] = curacc_train
            
            r['num_LSTM'] = n_lstm
            r['num_Dense_1'] = d1
            r['num_Dense_2'] = d2
            r['batch_size'] = bs
            
            r['reshape'] = rs
            
            report.append(r.copy())
            
            print([curacc_train, curacc_val, curacc_test, n, n_lstm, d1, d2 , bs, rs])

[0.5, 0.5, 0.5, 0, 5, 32, 32, 32, True]
[0.5616666666666666, 0.5025, 0.5, 0, 5, 32, 32, 32, False]
[0.51, 0.4925, 0.5025, 0, 30, 32, 32, 32, True]
[0.565, 0.515, 0.5025, 0, 30, 32, 32, 32, False]
[0.49833333333333335, 0.5025, 0.5, 1, 5, 32, 32, 32, True]
[0.5, 0.5, 0.5, 1, 5, 32, 32, 32, False]
[0.5775, 0.53, 0.4675, 1, 30, 32, 32, 32, True]
[0.5, 0.5, 0.5, 1, 30, 32, 32, 32, False]
[0.5, 0.5, 0.5, 2, 5, 32, 32, 32, True]
[0.5, 0.5, 0.5, 2, 5, 32, 32, 32, False]
[0.5333333333333333, 0.495, 0.5025, 2, 30, 32, 32, 32, True]
[0.5308333333333334, 0.5, 0.505, 2, 30, 32, 32, 32, False]
[0.5, 0.5, 0.5, 3, 5, 32, 32, 32, True]
[0.5, 0.5, 0.5, 3, 5, 32, 32, 32, False]
[0.575, 0.495, 0.5025, 3, 30, 32, 32, 32, True]
[0.5358333333333334, 0.495, 0.5025, 3, 30, 32, 32, 32, False]
[0.5241666666666667, 0.5075, 0.4775, 4, 5, 32, 32, 32, True]
[0.5208333333333334, 0.51, 0.51, 4, 5, 32, 32, 32, False]
[0.5, 0.5, 0.5, 4, 30, 32, 32, 32, True]
[0.6175, 0.4825, 0.4975, 4, 30, 32, 32, 32, False]
[0.5, 0.5, 

In [27]:
pd.DataFrame(report).sort_values('accuracy_test')

Unnamed: 0,name,accuracy_test,accuracy_val,accuracy_train,num_LSTM,num_Dense_1,num_Dense_2,batch_size,reshape
6,MFCC-48,0.4675,0.5300,0.577500,30,32,32,32,True
35,STFT-1024,0.4750,0.4850,0.555000,30,32,32,32,False
16,STFT-2048,0.4775,0.5075,0.524167,5,32,32,32,True
27,STFT-256,0.4800,0.4425,0.531667,30,32,32,32,False
39,Mel-Spectrogram-512,0.4850,0.5200,0.547500,30,32,32,32,False
...,...,...,...,...,...,...,...,...,...
67,MFCC-48-normalized,0.5475,0.5375,0.640000,30,32,32,32,False
62,MFCC-64-normalized,0.5500,0.5775,0.625000,30,32,32,32,True
74,MFCC-32-normalized,0.5500,0.5325,0.562500,30,32,32,32,True
63,MFCC-64-normalized,0.5700,0.5400,0.617500,30,32,32,32,False


In [None]:
for n, dat in enumerate([x_train_mfcc_64,
           x_train_mfcc_128,
           x_train_mfcc_32,
           x_train_stft_2048,
           x_train_stft_512,
           x_train_stft_1024,
           x_train_melspectrogram_512,
           x_train_melspectrogram_1024,
           x_train_melspectrogram_2048,
           x_train_dct
           ]):
    
    print(dat.shape)

In [None]:
for fold in df['fold'].unique():
    
    ind_val = df[df['fold']==fold].index
    ind_tr = df[df['fold']!=fold].index
    
    x_val = x[ind_val]
    x_train = x[ind_tr]
    
    y_train = y_cat[ind_tr]
    y_val = y_cat[ind_val]


    if transpose:
        x_train_copy = x_train.reshape((x_train.shape[0], x_train.shape[2], x_train.shape[1]))
        x_val_copy = x_val.reshape((x_val.shape[0], x_val.shape[2], x_val.shape[1]))
        x_test_copy = x_test.reshape((x_test.shape[0], x_test.shape[2], x_test.shape[1]))

    else:
        x_train_copy = x_train.copy()
        x_val_copy = x_val.copy()
        x_test_copy = x_test.copy()

    tf.random.set_seed(3)


    inp = Input(shape=(x_train_copy.shape[1],x_train_copy.shape[2]))

    if mitr == 0:
        #x = Conv1D(25, kernel_size=2)(inp)
        x = Conv1D(64,5,strides=1,activation="relu",)(inp)
        x = Attention()([x,x])
        if ump:
            x = MaxPooling1D()(x)
        x = Bidirectional(LSTM(l))(x)
        x = Dense(d,activation='relu')(x)
        x = Dense(d,activation='relu')(x)

    elif mitr == 1:
        #x = Conv1D(25, kernel_size=2)(inp)
        x = LSTM(l,return_sequences= True )(inp)
        if ump:
            x = MaxPooling1D()(x)
        x = LSTM(l)(x)
        x = Dense(d,activation='relu')(x)
        x = Dense(d,activation='relu')(x)

    elif mitr == 2:
        #x = Conv1D(25, kernel_size=2)(inp)
        x = Bidirectional(LSTM(l,return_sequences= True ))(inp)
        if ump:
            x = MaxPooling1D()(x)
        x = Bidirectional(LSTM(l))(x)
        x = Dense(d,activation='relu')(x)
        x = Dense(d,activation='relu')(x)

    elif mitr == 3:
        #x = Conv1D(25, kernel_size=2)(inp)
        x = Bidirectional(LSTM(l,return_sequences= True ))(inp)
        x = Attention()([x,x])
        if ump:
            x = MaxPooling1D()(x)
        x = Bidirectional(LSTM(l))(x)
        x = Dense(d,activation='relu')(x)
        x = Dense(d,activation='relu')(x)


    out = Dense(y_cat.shape[1],activation='softmax')(x)

    model = Model(inputs=inp, outputs=out)

    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose = 0)

    optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
    model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
    history = model.fit(x_train_copy, y_train, batch_size=bs, epochs=500, validation_data=(x_val_copy, y_val), verbose =0, callbacks = [es])


    pred_train = model.predict(x_train_copy)
    pred_val = model.predict(x_val_copy)
    pred_test = model.predict(x_test_copy)

    curacc_train = accuracy_score(np.argsort(y_train,axis=1)[:,-1], np.argsort(pred_train,axis=1)[:,-1])
    curacc_val = accuracy_score(np.argsort(y_val,axis=1)[:,-1], np.argsort(pred_val,axis=1)[:,-1])
    curacc_test = accuracy_score(np.argsort(y_test,axis=1)[:,-1], np.argsort(pred_test,axis=1)[:,-1])


In [None]:
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
    
from sklearn.metrics import *
bestacc = 0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Input, Conv1D, AveragePooling1D, Attention
from tensorflow.keras.optimizers import *
        


for l in [50,100,200]:
  for d in [32]:
   for bs in [4,16]:
    for lr in [0.001]:
       for transpose in [ False]:
        for bn in [False]:
         for mitr in range(4):
          for ump in [True,False]:
            for split_name, split_df in [
                                        ('mfcc_normalized', normalized_padded_inputs),
                                        ('spectrogram_normalized', normalized_padded_inputs_spec),
                                        ]:


                x_train = split_df[tr]
                x_val = split_df[val]
                x_test = split_df[test]

                y_train = y_cat[tr]
                y_val = y_cat[val]
                y_test = y_cat[test]
    

                if transpose:
                    x_train_copy = x_train.reshape((x_train.shape[0], x_train.shape[2], x_train.shape[1]))
                    x_val_copy = x_val.reshape((x_val.shape[0], x_val.shape[2], x_val.shape[1]))
                    x_test_copy = x_test.reshape((x_test.shape[0], x_test.shape[2], x_test.shape[1]))

                else:
                    x_train_copy = x_train.copy()
                    x_val_copy = x_val.copy()
                    x_test_copy = x_test.copy()

                tf.random.set_seed(3)

                
                inp = Input(shape=(x_train_copy.shape[1],x_train_copy.shape[2]))

                if mitr == 0:
                    #x = Conv1D(25, kernel_size=2)(inp)
                    x = Conv1D(64,5,strides=1,activation="relu",)(inp)
                    x = Attention()([x,x])
                    if ump:
                        x = MaxPooling1D()(x)
                    x = Bidirectional(LSTM(l))(x)
                    x = Dense(d,activation='relu')(x)
                    x = Dense(d,activation='relu')(x)

                elif mitr == 1:
                    #x = Conv1D(25, kernel_size=2)(inp)
                    x = LSTM(l,return_sequences= True )(inp)
                    if ump:
                        x = MaxPooling1D()(x)
                    x = LSTM(l)(x)
                    x = Dense(d,activation='relu')(x)
                    x = Dense(d,activation='relu')(x)
                    
                elif mitr == 2:
                    #x = Conv1D(25, kernel_size=2)(inp)
                    x = Bidirectional(LSTM(l,return_sequences= True ))(inp)
                    if ump:
                        x = MaxPooling1D()(x)
                    x = Bidirectional(LSTM(l))(x)
                    x = Dense(d,activation='relu')(x)
                    x = Dense(d,activation='relu')(x)
                    
                elif mitr == 3:
                    #x = Conv1D(25, kernel_size=2)(inp)
                    x = Bidirectional(LSTM(l,return_sequences= True ))(inp)
                    x = Attention()([x,x])
                    if ump:
                        x = MaxPooling1D()(x)
                    x = Bidirectional(LSTM(l))(x)
                    x = Dense(d,activation='relu')(x)
                    x = Dense(d,activation='relu')(x)

                    
                out = Dense(y_cat.shape[1],activation='softmax')(x)

                model = Model(inputs=inp, outputs=out)

                es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose = 0)

                optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
                model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
                history = model.fit(x_train_copy, y_train, batch_size=bs, epochs=500, validation_data=(x_val_copy, y_val), verbose =0, callbacks = [es])


                pred_train = model.predict(x_train_copy)
                pred_val = model.predict(x_val_copy)
                pred_test = model.predict(x_test_copy)

                curacc_train = accuracy_score(np.argsort(y_train,axis=1)[:,-1], np.argsort(pred_train,axis=1)[:,-1])
                curacc_val = accuracy_score(np.argsort(y_val,axis=1)[:,-1], np.argsort(pred_val,axis=1)[:,-1])
                curacc_test = accuracy_score(np.argsort(y_test,axis=1)[:,-1], np.argsort(pred_test,axis=1)[:,-1])

                model.save('my_model.h5')

                import os

                def get_size(path):
                    size = os.path.getsize(path)
                    if size < 1024:
                        return f"{size} bytes"
                    elif size < 1024*1024:
                        return f"{round(size/1024, 2)} KB"
                    elif size < 1024*1024*1024:
                        return f"{round(size/(1024*1024), 2)} MB"
                    elif size < 1024*1024*1024*1024:
                        return f"{round(size/(1024*1024*1024), 2)} GB"

                

                print('______')
                print(['Train:', curacc_train, 'Val:', curacc_val, 'Test:', curacc_test])
                cm_val = confusion_matrix(np.argmax(y_val,axis=1), np.argmax(pred_val,axis=1))
                #print(cm_val)

                cm_test = confusion_matrix(np.argmax(y_test,axis=1), np.argmax(pred_test,axis=1))
                #print(cm_test)
                print([split_name, mitr, ump, l, d, bs, lr , outact, transpose, bn, 'Model Size:', get_size('my_model.h5')])
   
'''
['Train:', 0.6758333333333333, 'Val:', 0.5725, 'Test:', 0.59]
['mfcc_normalized', 0, True, 50, 32, 4, 0.001, 'softmax', False, False, 'Model Size:', '899.37 KB']
'''

In [None]:

sr = 16000 # Sampling rate
#duration = 5
hop_length = 347 # to make time steps 128
fmin = 20
fmax = sr // 2
n_mels = 128
n_fft = n_mels * 20

def create_dataset(map_to_class,
                  NUM_FEATURES):

    df = pd.read_csv('./ESC-50-master/meta/esc50.csv')
    
    category_to_class = {
    'dog': 'hayvan',
    'chirping_birds': 'hayvan',
    'vacuum_cleaner': 'arkaplan', 
    'thunderstorm': 'arkaplan',
    'door_wood_knock': 'arkaplan',
    'can_opening': 'arkaplan',
    'crow': 'arkaplan',
    'clapping': 'insan',
    'fireworks': 'arkaplan',
    'chainsaw': 'arac',
    'airplane': 'arac',
    'mouse_click': 'arkaplan',
    'pouring_water': 'arkaplan',
    'train': 'arac',
    'sheep': 'hayvan',
    'water_drops': 'arkaplan',
    'church_bells': 'arkaplan',
    'clock_alarm': 'arkaplan',
    'keyboard_typing': 'arkaplan',
    'wind': 'arkaplan',
    'footsteps': 'insan',
    'frog': 'arkaplan',
    'cow': 'hayvan',
    'brushing_teeth': 'arkaplan',
    'car_horn': 'arac',
    'crackling_fire': 'arkaplan',
    'helicopter': 'arac',
    'drinking_sipping': 'arkaplan',
    'rain': 'arkaplan',
    'insects': 'arkaplan',
    'laughing': 'insan',
    'hen': 'hayvan',
    'engine': 'arac',
    'breathing': 'insan',
    'crying_baby': 'insan', 
    'hand_saw': 'arac',
    'coughing': 'insan',
    'glass_breaking': 'arkaplan',
    'snoring' : 'insan',
    'toilet_flush': 'arkaplan',
    'pig': 'hayvan',
    'washing_machine': 'arac',
    'clock_tick': 'arkaplan',
    'sneezing' : 'insan',
    'rooster': 'hayvan',
    'sea_waves': 'arkaplan',
    'siren': 'arac',
    'cat': 'hayvan',
    'door_wood_creaks': 'arkaplan',
    'crickets': 'arkaplan',
    }


    if map_to_class:
        df['class'] = df['category'].map(category_to_class)
    else:
        df['class'] = df['category']
        
    def features_extractor(file_name, NUM_FEATURES):
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
        audio_16k = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
        mfccs_features = librosa.feature.mfcc(y=audio_16k, sr=16000, n_mfcc=NUM_FEATURES)
        mfccs_features = mfccs_features.T
        
        spectrogram = librosa.feature.melspectrogram(audio, 
                                             sr=16000,
                                             n_mels=n_mels,
                                             hop_length=hop_length,
                                             n_fft=n_fft,
                                             fmin=fmin,
                                             fmax=fmax)
        spectrogram = librosa.power_to_db(spectrogram).astype(np.float32)

        dct = scipy.fftpack.dct(audio_16k)
        dct_64 = scipy.fftpack.dct(audio_16k, n = 64)
        dct_128 = scipy.fftpack.dct(audio_16k, n = 128)
        dct_256 = scipy.fftpack.dct(audio_16k, n = 256)
        dct_512 = scipy.fftpack.dct(audio_16k, n = 512)
        
        return mfccs_features, spectrogram.T, dct, dct_64, dct_128, dct_256, dct_512


    extracted_features=[]
    spectograms=[]
    dct_list=[]
    dct_64_list=[]
    dct_128_list=[]
    dct_256_list=[]
    dct_512_list=[]

    for _, row in df.iterrows():

        file_name = f"./ESC-50-master/audio/{row['filename']}"

        data, spectrogram, dct, dct_64, dct_128, dct_256, dct_512 =features_extractor(file_name, NUM_FEATURES)

        
        extracted_features.append([data, row['class'], row['fold']])
        spectograms.append([spectrogram, row['class'], row['fold']])
        
        #original_features.append(original_data.tolist())
        
    extracted_features_df=pd.DataFrame(extracted_features,columns=['feature', 'class', 'fold'])
    spectograms_df=pd.DataFrame(spectograms,columns=['feature', 'class', 'fold'])


    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        extracted_features_df['feature'].tolist(), padding="post")
    
    padded_inputs_spec = tf.keras.preprocessing.sequence.pad_sequences(
        spectograms_df['feature'].tolist(), padding="post")


    y=np.array(extracted_features_df['class'].tolist())

    
    return extracted_features_df, spectograms_df, y, padded_inputs, padded_inputs_spec



def create_dataset(map_to_class,
                  NUM_FEATURES):

    df = pd.read_csv('./ESC-50-master/meta/esc50.csv')
    
    category_to_class = {
    'dog': 'hayvan',
    'chirping_birds': 'hayvan',
    'vacuum_cleaner': 'arkaplan', 
    'thunderstorm': 'arkaplan',
    'door_wood_knock': 'arkaplan',
    'can_opening': 'arkaplan',
    'crow': 'arkaplan',
    'clapping': 'insan',
    'fireworks': 'arkaplan',
    'chainsaw': 'arac',
    'airplane': 'arac',
    'mouse_click': 'arkaplan',
    'pouring_water': 'arkaplan',
    'train': 'arac',
    'sheep': 'hayvan',
    'water_drops': 'arkaplan',
    'church_bells': 'arkaplan',
    'clock_alarm': 'arkaplan',
    'keyboard_typing': 'arkaplan',
    'wind': 'arkaplan',
    'footsteps': 'insan',
    'frog': 'arkaplan',
    'cow': 'hayvan',
    'brushing_teeth': 'arkaplan',
    'car_horn': 'arac',
    'crackling_fire': 'arkaplan',
    'helicopter': 'arac',
    'drinking_sipping': 'arkaplan',
    'rain': 'arkaplan',
    'insects': 'arkaplan',
    'laughing': 'insan',
    'hen': 'hayvan',
    'engine': 'arac',
    'breathing': 'insan',
    'crying_baby': 'insan', 
    'hand_saw': 'arac',
    'coughing': 'insan',
    'glass_breaking': 'arkaplan',
    'snoring' : 'insan',
    'toilet_flush': 'arkaplan',
    'pig': 'hayvan',
    'washing_machine': 'arac',
    'clock_tick': 'arkaplan',
    'sneezing' : 'insan',
    'rooster': 'hayvan',
    'sea_waves': 'arkaplan',
    'siren': 'arac',
    'cat': 'hayvan',
    'door_wood_creaks': 'arkaplan',
    'crickets': 'arkaplan',
    }

    raw_data = extracted_features
    if map_to_class:
        df['class'] = df['category'].map(category_to_class)
    else:
        df['class'] = df['category']
        
    def load_dataset(file_name, NUM_FEATURES):
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
        audio_16k = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
        
        return audio_16k

    for _, row in df.iterrows():

        file_name = f"./ESC-50-master/audio/{row['filename']}"

        data = load_dataset(file_name)

        extracted_features.append([data, row['class'], row['fold']])
        spectograms.append([spectrogram, row['class'], row['fold']])
        
        #original_features.append(original_data.tolist())
        
    extracted_features_df=pd.DataFrame(extracted_features,columns=['feature', 'class', 'fold'])
    spectograms_df=pd.DataFrame(spectograms,columns=['feature', 'class', 'fold'])


    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        extracted_features_df['feature'].tolist(), padding="post")
    
    padded_inputs_spec = tf.keras.preprocessing.sequence.pad_sequences(
        spectograms_df['feature'].tolist(), padding="post")


    y=np.array(extracted_features_df['class'].tolist())

    
    return extracted_features_df, spectograms_df, y, padded_inputs, padded_inputs_spec

In [None]:
#/home/gltkn/Desktop/sekiza/akustik_data/ESC-50-master/audio/1-137-A-32.wav

file_name = './ESC-50-master/audio/1-137-A-32.wav'
audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
audio_16k = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)

In [None]:
scipy.fftpack.dct(audio_16k)

In [None]:
extracted_features_df, spectograms_df, y, padded_inputs, padded_inputs_spec = create_dataset(True, 64)

In [None]:
padded_inputs_spec.max()

In [None]:
padded_inputs.max()

In [None]:
extracted_features_df['class'].unique()

In [None]:
y_cat = tf.keras.utils.to_categorical(extracted_features_df['class'].map(dict(zip(extracted_features_df['class'].unique(), range(4)))).tolist() , num_classes=len(np.unique(y)))
y_cat

In [None]:
def normalize(img):
    '''
    Normalizes an array 
    (subtract mean and divide by standard deviation)
    '''
    eps = 0.001
    if np.std(img) != 0:
        img = (img - np.mean(img)) / np.std(img)
    else:
        img = (img - np.mean(img)) / eps
    return img


normalized_padded_inputs = normalize(padded_inputs)
normalized_padded_inputs_spec = normalize(padded_inputs_spec)

In [None]:
tr = np.where(extracted_features_df['fold']<4)[0]
val = np.where(extracted_features_df['fold']==4)[0]
test = np.where(extracted_features_df['fold']==5)[0]

In [None]:
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
    
from sklearn.metrics import *
bestacc = 0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Input, Conv1D, AveragePooling1D, Attention
from tensorflow.keras.optimizers import *
        


for l in [50,100,200]:
  for d in [32]:
   for bs in [4,16]:
    for lr in [0.001]:
       for transpose in [ False]:
        for bn in [False]:
         for mitr in range(4):
          for ump in [True,False]:
            for split_name, split_df in [
                                        ('mfcc_normalized', normalized_padded_inputs),
                                        ('spectrogram_normalized', normalized_padded_inputs_spec),
                                        ]:


                x_train = split_df[tr]
                x_val = split_df[val]
                x_test = split_df[test]

                y_train = y_cat[tr]
                y_val = y_cat[val]
                y_test = y_cat[test]
    

                if transpose:
                    x_train_copy = x_train.reshape((x_train.shape[0], x_train.shape[2], x_train.shape[1]))
                    x_val_copy = x_val.reshape((x_val.shape[0], x_val.shape[2], x_val.shape[1]))
                    x_test_copy = x_test.reshape((x_test.shape[0], x_test.shape[2], x_test.shape[1]))

                else:
                    x_train_copy = x_train.copy()
                    x_val_copy = x_val.copy()
                    x_test_copy = x_test.copy()

                tf.random.set_seed(3)

                
                inp = Input(shape=(x_train_copy.shape[1],x_train_copy.shape[2]))

                if mitr == 0:
                    #x = Conv1D(25, kernel_size=2)(inp)
                    x = Conv1D(64,5,strides=1,activation="relu",)(inp)
                    x = Attention()([x,x])
                    if ump:
                        x = MaxPooling1D()(x)
                    x = Bidirectional(LSTM(l))(x)
                    x = Dense(d,activation='relu')(x)
                    x = Dense(d,activation='relu')(x)

                elif mitr == 1:
                    #x = Conv1D(25, kernel_size=2)(inp)
                    x = LSTM(l,return_sequences= True )(inp)
                    if ump:
                        x = MaxPooling1D()(x)
                    x = LSTM(l)(x)
                    x = Dense(d,activation='relu')(x)
                    x = Dense(d,activation='relu')(x)
                    
                elif mitr == 2:
                    #x = Conv1D(25, kernel_size=2)(inp)
                    x = Bidirectional(LSTM(l,return_sequences= True ))(inp)
                    if ump:
                        x = MaxPooling1D()(x)
                    x = Bidirectional(LSTM(l))(x)
                    x = Dense(d,activation='relu')(x)
                    x = Dense(d,activation='relu')(x)
                    
                elif mitr == 3:
                    #x = Conv1D(25, kernel_size=2)(inp)
                    x = Bidirectional(LSTM(l,return_sequences= True ))(inp)
                    x = Attention()([x,x])
                    if ump:
                        x = MaxPooling1D()(x)
                    x = Bidirectional(LSTM(l))(x)
                    x = Dense(d,activation='relu')(x)
                    x = Dense(d,activation='relu')(x)

                    
                out = Dense(y_cat.shape[1],activation='softmax')(x)

                model = Model(inputs=inp, outputs=out)

                es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose = 0)

                optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
                model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
                history = model.fit(x_train_copy, y_train, batch_size=bs, epochs=500, validation_data=(x_val_copy, y_val), verbose =0, callbacks = [es])


                pred_train = model.predict(x_train_copy)
                pred_val = model.predict(x_val_copy)
                pred_test = model.predict(x_test_copy)

                curacc_train = accuracy_score(np.argsort(y_train,axis=1)[:,-1], np.argsort(pred_train,axis=1)[:,-1])
                curacc_val = accuracy_score(np.argsort(y_val,axis=1)[:,-1], np.argsort(pred_val,axis=1)[:,-1])
                curacc_test = accuracy_score(np.argsort(y_test,axis=1)[:,-1], np.argsort(pred_test,axis=1)[:,-1])

                model.save('my_model.h5')

                import os

                def get_size(path):
                    size = os.path.getsize(path)
                    if size < 1024:
                        return f"{size} bytes"
                    elif size < 1024*1024:
                        return f"{round(size/1024, 2)} KB"
                    elif size < 1024*1024*1024:
                        return f"{round(size/(1024*1024), 2)} MB"
                    elif size < 1024*1024*1024*1024:
                        return f"{round(size/(1024*1024*1024), 2)} GB"

                

                print('______')
                print(['Train:', curacc_train, 'Val:', curacc_val, 'Test:', curacc_test])
                cm_val = confusion_matrix(np.argmax(y_val,axis=1), np.argmax(pred_val,axis=1))
                #print(cm_val)

                cm_test = confusion_matrix(np.argmax(y_test,axis=1), np.argmax(pred_test,axis=1))
                #print(cm_test)
                print([split_name, mitr, ump, l, d, bs, lr , outact, transpose, bn, 'Model Size:', get_size('my_model.h5')])
   
'''
['Train:', 0.6758333333333333, 'Val:', 0.5725, 'Test:', 0.59]
['mfcc_normalized', 0, True, 50, 32, 4, 0.001, 'softmax', False, False, 'Model Size:', '899.37 KB']
'''

In [None]:
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten


inp = Input(shape=(x_train_copy.shape[1],x_train_copy.shape[2]))
#x = Conv1D(25, kernel_size=2)(inp)
x = LSTM(l,return_sequences= True )(inp)
x = Dropout(do)(x)
x = MaxPooling1D()(x)
x = LSTM(l)(x)
x = Dropout(do)(x)
x = Dense(d,activation='relu')(x)
x = Dropout(do)(x)
x = Dense(d,activation='relu')(x)
x = Dropout(do)(x)
out = Dense(y_cat.shape[1],activation=outact)(x)
model = Model(inputs=inp, outputs=out)

model.summary()

In [None]:
model.save('my_model.h5')

import os

def get_size(path):
    size = os.path.getsize(path)
    if size < 1024:
        return f"{size} bytes"
    elif size < 1024*1024:
        return f"{round(size/1024, 2)} KB"
    elif size < 1024*1024*1024:
        return f"{round(size/(1024*1024), 2)} MB"
    elif size < 1024*1024*1024*1024:
        return f"{round(size/(1024*1024*1024), 2)} GB"
    
get_size('my_model.h5')

In [None]:

pred_val = model.predict(x_val)
pred_test = model.predict(x_test)

curacc_val = accuracy_score(np.argsort(y_val,axis=1)[:,-1], np.argsort(pred_val,axis=1)[:,-1])
curacc_test = accuracy_score(np.argsort(y_test,axis=1)[:,-1], np.argsort(pred_test,axis=1)[:,-1])

cm_val = confusion_matrix(np.argmax(y_val,axis=1), np.argmax(pred_val,axis=1))
print(cm_val)

cm_test = confusion_matrix(np.argmax(y_test,axis=1), np.argmax(pred_test,axis=1))
print(cm_test)
print([curacc_val, curacc_test, mtc, bs, d ,l ,NUM_FEATURES])

In [None]:
cm_val = confusion_matrix(np.argmax(y_val,axis=1), np.argmax(pred_val,axis=1))
print(cm_val)

cm_test = confusion_matrix(np.argmax(y_test,axis=1), np.argmax(pred_test,axis=1))
print(cm_test)

In [None]:
y_test.sum(axis=0)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    import itertools
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fig = plt.figure(figsize=(10,7))
    

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
from sklearn.metrics import confusion_matrix

class_names = dict(zip(range(len(labelencoder.classes_)),labelencoder.classes_))

y_true = np.argsort(y,axis=1)[:,-1]
y_pred = np.argsort(ypreds,axis=1)[:,-1]

y_true
y_pred

cnf_matrix = confusion_matrix([class_names[x] for x in y_true],
                 [class_names[x] for x in y_pred],
                 labels=labelencoder.classes_)

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=labelencoder.classes_,
                      title='Confusion matrix, without normalization')