In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio

import os
from tqdm import tqdm_notebook
from scipy.io import wavfile
from copy import deepcopy

In [2]:
#path_dataset = '../input/'
path_dataset = '/home/edoardobucheli/Datasets/FSDKaggle2018'

path_train = os.path.join(path_dataset,'audio_train_16k')
path_test = os.path.join(path_dataset,'audio_test_16k')

In [3]:
train_data = pd.read_csv(os.path.join(path_dataset,'train_post_competition.csv'))
test_data = pd.read_csv(os.path.join(path_dataset,'test_post_competition_scoring_clips.csv'))

In [4]:
train_data.head()

Unnamed: 0,fname,label,manually_verified,freesound_id,license
0,00044347.wav,Hi-hat,0,28739,Attribution
1,001ca53d.wav,Saxophone,1,358827,Attribution
2,002d256b.wav,Trumpet,0,10897,Creative Commons 0
3,0033e230.wav,Glockenspiel,1,325017,Attribution
4,00353774.wav,Cello,1,195688,Attribution


In [5]:
test_data.head()

Unnamed: 0,fname,label,usage,freesound_id,license
0,00326aa9.wav,Oboe,Private,355125,Attribution
1,0038a046.wav,Bass_drum,Private,90621,Creative Commons 0
2,007759c4.wav,Saxophone,Private,13406,Creative Commons 0
3,008afd93.wav,Saxophone,Private,358962,Attribution
4,00ae03f6.wav,Chime,Private,78203,Attribution


In [6]:
classes = np.unique(train_data['label'])
n_classes = len(classes)

num_to_label = dict([[v,k] for v,k in enumerate(classes)])
label_to_num = dict([[k,v] for v,k in enumerate(classes)])

In [7]:
data_manual = train_data[train_data['manually_verified']==1]
data_auto = train_data[train_data['manually_verified']==0]

In [8]:
filenames_all = train_data['fname'].tolist()
labels_all = [label_to_num[f] for f in train_data['label']]

filenames_manual = data_manual['fname'].tolist()
labels_manual = [label_to_num[f] for f in data_manual['label']]

filenames_auto = data_auto['fname'].tolist()
labels_auto = [label_to_num[f] for f in data_auto['label']]

filenames_test = test_data['fname'].tolist()
labels_test = [label_to_num[f] for f in test_data['label']]

In [9]:
sr = 16000
length = 64000

### Load Test Data

In [10]:
x = np.zeros((1600,64000))

for i, file in enumerate(tqdm_notebook(filenames_test)):
    wave,_ = librosa.load(os.path.join(path_test,file),sr=sr,dtype = np.float32) 
    wave,_ = librosa.effects.trim(wave)
    
    if len(wave)<=length:

        amount = length-len(wave)
        wave = np.pad(wave,(int(np.ceil(amount/2)),int(np.floor(amount/2))),'constant')

    elif len(wave) > length:
        max_start = len(wave)-length
        
        start = np.random.randint(0,max_start)
        end = start + length
        
        wave = wave[start:end]
        
    wave = wave/np.max(np.absolute(wave+1e-10))
    x[i] = wave    

HBox(children=(IntProgress(value=0, max=1600), HTML(value='')))




### Import Libraries for Models

In [11]:
import keras
import os 
import random
import numpy as np
import kapre

from keras.layers import Input
from keras.layers.convolutional import Conv1D
from keras.layers.core import Dense, Reshape, Permute
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.pooling import GlobalAveragePooling1D
from keras import metrics
from utils import util
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers.core import Dropout 
from keras import backend as K
from keras.layers.convolutional import Conv2D, Conv2DTranspose
from keras.layers.pooling import GlobalAveragePooling2D, GlobalMaxPooling2D, MaxPooling2D
from keras.layers import Add, Average, Concatenate, Multiply, Lambda, BatchNormalization, Activation, TimeDistributed
from keras import regularizers
from kapre.time_frequency import Melspectrogram, Spectrogram

Using TensorFlow backend.


In [12]:
def ccategorical_crossentropy(y_true, y_pred):
    
    y_true_v = K.greater(K.sum(y_true, axis=-1),1.1)
    y_true = y_true%1
    
    y_pred = K.clip(y_pred, K.epsilon(), 1)
    loss = -K.sum(y_true*K.log(y_pred),axis=-1)

    m = K.max(loss)*0.8
    
    loss = loss 
    
    el = 1-(K.cast(K.greater(loss,m),'float32')*K.cast(y_true_v,'float32'))
    loss = loss*el
    return loss     

### Waveform Model 16k

In [13]:
model_filename = './COCAI_Models/dcase2018_task2_cochlearai_saved_models_16000_wav-1-fold_wav_model_1fold_0.8537611783271962.h5'

In [15]:
model = keras.models.load_model(model_filename,
                                custom_objects={'Melspectrogram':kapre.time_frequency.Melspectrogram,
                                                'Spectrogram':kapre.time_frequency.Spectrogram,
                                                'ccategorical_crossentropy':ccategorical_crossentropy})

In [16]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 1, None, 1)   0           input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 1, None, 1)   4           reshape_1[0][0]                  
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 1, None, 1)   4           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
concatenat

In [21]:
new_output_name = 'reshape_16'

new_output_layer = model.get_layer(new_output_name).output

model_headless = keras.Model(inputs = model.input, outputs = new_output_layer)

In [22]:
model_headless.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 1, None, 1)   0           input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 1, None, 1)   4           reshape_1[0][0]                  
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 1, None, 1)   4           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
concatenat

In [23]:
y_hat = model_headless.predict(x)

In [25]:
y_hat = np.reshape(y_hat,(1600,512))

y_hat.shape

(1600, 512)

In [48]:
np.mean(labels_test==np.argmax(y_hat,axis = 1))

0.855

### Mel Model 16k

In [2]:
model2_filename = './COCAI_Models/dcase2018_task2_cochlearai_saved_models_16000_mel-1-fold_mel_model_1fold_0.8858495528669121.h5'

In [33]:
model2 = keras.models.load_model(model2_filename,
                                   custom_objects={'Melspectrogram':kapre.time_frequency.Melspectrogram,
                                                   'Spectrogram':kapre.time_frequency.Spectrogram,
                                                   'ccategorical_crossentropy':ccategorical_crossentropy})

In [7]:
model2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, None, 1)      0           input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, None, 1)      4           reshape_1[0][0]                  
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 1, None)      0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
trainable_

In [20]:
y_hat2 = model2.predict(x)

In [21]:
np.mean(labels_test==np.argmax(y_hat2,axis = 1))

0.881875

In [38]:
edo = model2.layers[7:]

In [27]:
model2.layers.pop()

<keras.layers.merge.Average at 0x7f08fcc90cf8>

In [37]:
model2.layers

[<keras.engine.input_layer.InputLayer at 0x7f0871353ac8>,
 <keras.layers.core.Reshape at 0x7f0871353be0>,
 <keras.layers.normalization.BatchNormalization at 0x7f08713536d8>,
 <keras.layers.core.Reshape at 0x7f0871353c18>,
 <kapre.time_frequency.Melspectrogram at 0x7f0871353ef0>,
 <keras.layers.core.Permute at 0x7f0871353f98>,
 <keras.layers.normalization.BatchNormalization at 0x7f0871353978>,
 <keras.layers.core.Permute at 0x7f08711104a8>,
 <keras.layers.convolutional.Conv2D at 0x7f0871110518>,
 <keras.layers.merge.Concatenate at 0x7f08711106a0>,
 <keras.layers.normalization.BatchNormalization at 0x7f08711106d8>,
 <keras.layers.core.Activation at 0x7f08711107f0>,
 <keras.layers.convolutional.Conv2D at 0x7f0871110828>,
 <keras.layers.normalization.BatchNormalization at 0x7f08711109b0>,
 <keras.layers.core.Activation at 0x7f0871110ac8>,
 <keras.layers.convolutional.Conv2D at 0x7f0871110b00>,
 <keras.layers.pooling.GlobalAveragePooling2D at 0x7f0871110c88>,
 <keras.layers.core.Reshape at 

In [26]:
edo

[<keras.engine.input_layer.InputLayer at 0x7f08fd4cd0b8>,
 <keras.layers.core.Reshape at 0x7f08fd4cd0f0>]

In [22]:
from keras.utils import Sequence

class DataGenerator(Sequence):
    def __init__(self,x_set,y_set,
                 batch_size = 128,input_shape = (80,frames),
                 n_classes=80, sr = 44100, seconds = 4):
        
        self.x, self.y = x_set,y_set
        self.batch_size = batch_size
        self.input_shape = input_shape
        self.n_classes = n_classes
        self.sr = sr
        self.file_length = sr*seconds
        #self.on_epoch_end()
    
    def __len__(self):
        return int(np.ceil(len(self.x)/self.batch_size))
    
    def __getitem__(self,idx):
        
        spects = self.x[idx*self.batch_size:(idx+1)*self.batch_size]
        batch_y = self.y[idx*self.batch_size:(idx+1)*self.batch_size]
        
        batch_x = self.__generate_data(spects)
        
        #print(batch_x.shape)
        #print(batch_y.shape)
        
        return batch_x,batch_y
    
    def __generate_data(self,spects):
        n_mels,frames = self.input_shape
        x_batch = np.zeros((len(spects),n_mels,frames))
        
        for i, spect in enumerate(spects):
            freq_res,time_res = spect.shape
            
            max_start = time_res-frames
            if max_start == 0:
                start = 0
            else:
                start = np.random.randint(0,max_start)
            end = start+frames
            
            x_batch[i] = spect[:,start:end]
        
        return x_batch

In [41]:
train_generator_c = DataGenerator(X_train_c,y_train_c)
val_generator_c = DataGenerator(X_val_c,y_val_c)

In [40]:
X_train_c,X_val_c,y_train_c,y_val_c = train_test_split(X_curated,labels_manual,test_size=0.1, random_state=7)

In [37]:
X_curated = []

for i,tag in enumerate(train_data['manually_verified']):
    if tag == 1:
        X_curated.append(X_all[i])

### Train Only with Curated Data