In [17]:
from keras.models import Sequential
from keras.layers import Convolution1D,Activation,Flatten,Dense,Dropout
from keras import optimizers

In [9]:
import librosa
import pandas as pd
import numpy as np

In [11]:
# Training metadata: one row per clip (columns include fname, label, manually_verified).
train = pd.read_csv(filepath_or_buffer='data/train.csv')

In [12]:
# Peek at the first five rows to sanity-check the columns that were loaded.
train.head(n=5)

Unnamed: 0,fname,label,manually_verified
0,00044347.wav,Hi-hat,0
1,001ca53d.wav,Saxophone,1
2,002d256b.wav,Trumpet,0
3,0033e230.wav,Glockenspiel,1
4,00353774.wav,Cello,1


In [26]:
class Config(object):
    """Settings shared by the data generator, the model builders and the training loop.

    Args:
        sampling_rate: Sample rate (Hz) every clip is resampled to when loaded.
        audio_duration: Length, in seconds, every clip is cropped/padded to.
        n_classes: Number of target classes used for one-hot encoding labels.
        use_mfcc: If true, clips are converted to MFCC features.
        n_mfcc: Number of MFCC coefficients (only used when ``use_mfcc``).
        use_mel_spec: If true (and ``use_mfcc`` is false), clips are converted
            to mel spectrograms.
        n_mels: Number of mel bands (only used when ``use_mel_spec``).
        n_folds: Number of cross-validation folds.
        max_epochs: Upper bound on training epochs per fold.

    Attributes:
        audio_length: Number of samples per clip (``sampling_rate * audio_duration``).
        dim: Shape of one example as produced by the data generator.
    """
    def __init__(self,
                 sampling_rate=16000, audio_duration=2,
                 n_classes=41, use_mfcc=False, n_mfcc=20,
                 use_mel_spec=False, n_mels=128,
                 n_folds=10, max_epochs=50):
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.n_classes = n_classes
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc
        self.use_mel_spec = use_mel_spec
        self.n_mels = n_mels
        self.n_folds = n_folds
        self.max_epochs = max_epochs

        self.audio_length = self.sampling_rate * self.audio_duration

        # Frame count of a librosa spectrogram for a clip of audio_length samples.
        # NOTE(review): assumes librosa's default hop_length of 512 with centred
        # frames -- update if the feature extraction passes a different hop_length.
        n_frames = 1 + int(self.audio_length // 512)
        if self.use_mfcc:
            # The generator appends a trailing channel axis to MFCC features.
            self.dim = (self.n_mfcc, n_frames, 1)
        elif self.use_mel_spec:
            # The generator leaves mel spectrograms two-dimensional.
            self.dim = (self.n_mels, n_frames)
        else:
            # Raw waveform with a single channel.
            self.dim = (self.audio_length, 1)

In [None]:
class DataGenerator(d_utils.Sequence):
    """Batch generator that loads audio clips from disk and turns them into model inputs.

    Each batch is built by loading the clips with librosa, cropping or padding
    them to ``config.audio_length`` samples, and applying either MFCC / mel
    spectrogram extraction or ``preprocessing_fn``.

    NOTE(review): ``d_utils`` and ``n_utils`` are not imported in the visible
    cells; presumably both alias ``keras.utils`` -- confirm and add the import.

    Args:
        config: Settings object; must provide ``sampling_rate``, ``audio_length``
            and ``dim``, plus ``n_classes`` when ``labels`` is given. The optional
            flags ``use_mfcc`` / ``use_mel_spec`` select the feature extraction.
        data_dir: Directory prefix concatenated directly with each ID, so it
            must end with a path separator.
        list_IDs: Sequence of file names relative to ``data_dir``.
        labels: Optional mapping from ID to integer class index. When omitted,
            batches contain inputs only.
        batch_size: Maximum number of clips per batch.
        preprocessing_fn: Callable applied to the raw waveform when neither
            MFCC nor mel features are requested.
    """
    def __init__(self, config, data_dir, list_IDs, labels=None,
                 batch_size=64, preprocessing_fn=lambda x: x):
        self.config = config
        self.data_dir = data_dir
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.preprocessing_fn = preprocessing_fn
        self.on_epoch_end()
        self.dim = self.config.dim

    def __len__(self):
        """Return the number of batches per epoch (the last one may be short)."""
        return int(np.ceil(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Build and return batch number ``index``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the sample order; no shuffling is done, so order matches ``list_IDs``."""
        self.indexes = np.arange(len(self.list_IDs))

    def transform_data(self, data):
        """Convert a fixed-length waveform into the representation fed to the model.

        Returns MFCCs with a trailing channel axis, a mel spectrogram, or the
        output of ``preprocessing_fn`` with a trailing channel axis, depending
        on the config flags.
        """
        sr = self.config.sampling_rate
        # The flags are read with a default so configs that do not define them
        # fall through to the raw-waveform path instead of raising AttributeError.
        if getattr(self.config, 'use_mfcc', False):
            # The signal is passed as ``y=`` because newer librosa releases
            # only accept it as a keyword argument.
            data = librosa.feature.mfcc(y=data, sr=sr,
                                        n_mfcc=self.config.n_mfcc)
            data = np.expand_dims(data, axis=-1)
        elif getattr(self.config, 'use_mel_spec', False):
            data = librosa.feature.melspectrogram(y=data, sr=sr,
                                                  n_mels=self.config.n_mels)
        else:
            data = self.preprocessing_fn(data)[:, np.newaxis]
        return data

    def adjust_audio_length(self, data, input_length):
        """Return ``data`` cropped or zero-padded to exactly ``input_length`` samples.

        Longer clips get a random crop window; shorter clips are placed at a
        random position inside a zero-filled buffer.
        """
        n_samples = len(data)
        if n_samples > input_length:
            # +1 so the final window (ending on the last sample) can be chosen too.
            offset = np.random.randint(n_samples - input_length + 1)
            return data[offset:offset + input_length]
        if n_samples < input_length:
            pad_total = input_length - n_samples
            # +1 so the clip may also sit flush against the end of the buffer.
            offset = np.random.randint(pad_total + 1)
            return np.pad(data, (offset, pad_total - offset), "constant")
        return data

    def __data_generation(self, list_IDs_temp):
        """Load, length-normalise and transform the given IDs into one batch.

        Returns ``(X, y_one_hot)`` when labels were supplied, otherwise ``X``.
        """
        cur_batch_size = len(list_IDs_temp)
        X = np.empty((cur_batch_size, *self.dim))

        input_length = self.config.audio_length
        for i, ID in enumerate(list_IDs_temp):
            file_path = self.data_dir + ID

            # Read and resample the audio.
            data, _ = librosa.core.load(file_path, sr=self.config.sampling_rate,
                                        res_type='kaiser_fast')

            # Fix the clip length (random crop / random padding), then
            # apply feature extraction or the preprocessing function.
            data = self.adjust_audio_length(data, input_length)
            data = self.transform_data(data)

            X[i,] = data

        if self.labels is None:
            return X

        y = np.array([self.labels[ID] for ID in list_IDs_temp], dtype=int)
        return X, n_utils.to_categorical(y, num_classes=self.config.n_classes)

In [None]:
def train_test_model(train, config, quick_run=False):
    """Cross-validate a model and save per-fold predictions and submission files.

    For every fold a fresh model is trained, the checkpoint with the best
    ``val_loss`` is restored, and predictions for the whole of ``train`` and for
    the module-level ``test`` frame are written to ``predictions_1d_conv/``
    together with a top-3 submission CSV.

    Args:
        train: DataFrame with a ``label_idx`` column; its index values are
            appended to ``'data/audio_train/'`` to locate each clip.
        config: Settings object providing ``n_folds`` and ``max_epochs`` plus
            whatever ``DataGenerator`` and the model builders read.
        quick_run: If true, run a smoke test on at most 2000 sampled rows with
            a tiny config and the dummy model; the passed ``config`` is ignored.

    Side effects:
        Creates ``predictions_1d_conv/``, deletes ``logs/predictions_1d_conv``
        if present, writes ``best_<fold>.h5`` checkpoints to the working
        directory and overwrites ``test['label']`` on the module-level ``test``.
    """
    if quick_run:
        # Cap the sample size so small frames do not make ``sample`` raise.
        train = train.sample(min(2000, len(train)))
        # The fold/epoch settings are assigned as attributes so this works even
        # when Config's constructor does not accept them as keyword arguments.
        config = Config(sampling_rate=100, audio_duration=1)
        config.n_folds = 2
        config.max_epochs = 1

    PREDICTION_FOLDER = "predictions_1d_conv"
    if not os.path.exists(PREDICTION_FOLDER):
        os.mkdir(PREDICTION_FOLDER)
    if os.path.exists('logs/' + PREDICTION_FOLDER):
        shutil.rmtree('logs/' + PREDICTION_FOLDER)

    # NOTE(review): this is the legacy sklearn.cross_validation call style
    # (labels passed to the constructor, object iterated directly); confirm it
    # matches the StratifiedKFold that is actually imported.
    skf = StratifiedKFold(train.label_idx, n_folds=config.n_folds)

    # Training and prediction must see identically preprocessed audio, so a
    # single preprocessing function is used for every generator below.
    preprocessing_fn = normalize_audio

    for i, (train_split, val_split) in enumerate(skf):
        train_set = train.iloc[train_split]
        val_set = train.iloc[val_split]
        checkpoint = ModelCheckpoint('best_%d.h5'%i, monitor='val_loss', verbose=1, save_best_only=True)
        early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
        tb = TensorBoard(log_dir='./logs/' + PREDICTION_FOLDER + '/fold_%d'%i, write_graph=True)

        callbacks_list = [checkpoint, early, tb]
        print("Fold: ", i)
        print("#"*50)
        if not quick_run:
            model = get_conv_model(config)
        else:
            model = get_1d_dummy_model(config)
        train_generator = DataGenerator(config, 'data/audio_train/', train_set.index,
                                        train_set.label_idx, batch_size=64,
                                        preprocessing_fn=preprocessing_fn)
        val_generator = DataGenerator(config, 'data/audio_train/', val_set.index,
                                      val_set.label_idx, batch_size=64,
                                      preprocessing_fn=preprocessing_fn)

        history = model.fit_generator(train_generator, callbacks=callbacks_list, validation_data=val_generator,
                                      epochs=config.max_epochs, use_multiprocessing=True, workers=6, max_queue_size=20)

        # Continue with the best checkpoint rather than the last epoch's weights.
        model.load_weights('best_%d.h5'%i)

        # Save train predictions (all rows of ``train``, not only this fold).
        train_generator = DataGenerator(config, 'data/audio_train/', train.index, batch_size=128,
                                        preprocessing_fn=preprocessing_fn)
        predictions = model.predict_generator(train_generator, use_multiprocessing=True,
                                              workers=6, max_queue_size=20, verbose=1)
        np.save(PREDICTION_FOLDER + "/train_predictions_%d.npy"%i, predictions)

        # Save test predictions (``test`` is read from module scope).
        test_generator = DataGenerator(config, 'data/audio_test/', test.index, batch_size=128,
                                       preprocessing_fn=preprocessing_fn)
        predictions = model.predict_generator(test_generator, use_multiprocessing=True,
                                              workers=6, max_queue_size=20, verbose=1)
        np.save(PREDICTION_FOLDER + "/test_predictions_%d.npy"%i, predictions)

        # Make a submission file holding the three highest-scoring labels per clip.
        top_3 = np.array(LABELS)[np.argsort(-predictions, axis=1)[:, :3]]
        predicted_labels = [' '.join(list(x)) for x in top_3]
        test['label'] = predicted_labels
        test[['label']].to_csv(PREDICTION_FOLDER + "/predictions_%d.csv"%i)

In [21]:
def get_1d_dummy_model(config):
    """Build and compile a small 1-D convolutional classifier for smoke runs.

    Args:
        config: Settings object. ``config.dim`` is used as the input shape so
            the model accepts the batches built from the same config;
            ``config.n_classes`` sets the output width (41 when absent).

    Returns:
        A compiled ``Sequential`` model with a softmax output, trained with
        Adam on categorical cross-entropy.
    """
    # Fall back to the previously hard-coded class count for configs that do
    # not define n_classes.
    nb_class = getattr(config, 'n_classes', 41)

    model = Sequential()
    # The input shape comes from the config instead of undefined globals and a
    # fixed channel count of 3.
    model.add(Convolution1D(nb_filter=512, filter_length=1, input_shape=config.dim))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dropout(0.4))
    model.add(Dense(2048, activation='relu'))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(nb_class))
    model.add(Activation('softmax'))
    opt = optimizers.Adam()

    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])
    return model

In [24]:
def train_model(X_train, y_train):
    """Fit a fresh model on the given data and save it to ``cnn_audio_tagger.h5``.

    Args:
        X_train: Training inputs, passed straight to ``model.fit``.
        y_train: Training targets, passed straight to ``model.fit``.
    """
    model = get_model()
    # The targets are the ``y_train`` parameter (previously misspelled ``y_Train``).
    model.fit(X_train, y_train)
    model.save('cnn_audio_tagger.h5')

In [25]:
# Load every training clip as a waveform, using the same directory layout and
# resampling options as DataGenerator.
# NOTE(review): the lambda body was previously empty (a syntax error); loading
# the raw waveform is an assumption about the intended features -- confirm.
X_train = train['fname'].apply(
    lambda fname: librosa.core.load('data/audio_train/' + fname,
                                    sr=Config().sampling_rate,
                                    res_type='kaiser_fast')[0]
)
y_train = train[['label']]

In [None]:
# Train on the features and labels prepared in the previous cell.
train_model(X_train, y_train)