# Load the files into Colab

First we'll upgrade pandas, as it requires a runtime restart, so let's get this out of the way. The old version on Colab doesn't allow you to access series via df.col_name, and it's annoying in f-strings.

In [0]:
!pip install --upgrade pandas #need to upgrade otherwise colab doesn't understand the command df.column_name

Collecting pandas
[?25l  Downloading https://files.pythonhosted.org/packages/19/74/e50234bc82c553fecdbd566d8650801e3fe2d6d8c8d940638e3d8a7c5522/pandas-0.24.2-cp36-cp36m-manylinux1_x86_64.whl (10.1MB)
[K    100% |████████████████████████████████| 10.1MB 4.0MB/s 
Installing collected packages: pandas
  Found existing installation: pandas 0.23.4
    Uninstalling pandas-0.23.4:
      Successfully uninstalled pandas-0.23.4
Successfully installed pandas-0.24.2


Don't forget to press the 'RESTART RUNTIME' button above.

The first cells are for loading the big data files from google drive (as colab only keeps the files for 12 hours), and the .csv from my local drive or dropbox. We first authenticate.

In [1]:
# Install PyDrive, the Google Drive client used below to fetch the data archive.
!pip install PyDrive
import os, zipfile
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth, files
from oauth2client.client import GoogleCredentials

# Authenticate the current Colab user.
auth.authenticate_user()

# Build a Drive client from the application-default credentials; `drive` is
# reused by the download cell.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

Collecting PyDrive
[?25l  Downloading https://files.pythonhosted.org/packages/52/e0/0e64788e5dd58ce2d6934549676243dc69d982f198524be9b99e9c2a4fd5/PyDrive-1.3.1.tar.gz (987kB)
[K    100% |████████████████████████████████| 993kB 23.5MB/s 
Building wheels for collected packages: PyDrive
  Building wheel for PyDrive (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/fa/d2/9a/d3b6b506c2da98289e5d417215ce34b696db856643bad779f4
Successfully built PyDrive
Installing collected packages: PyDrive
Successfully installed PyDrive-1.3.1


Now download the files and unzip the archives. We have to use 7z for train_curated.zip since I believe it's either a bad file or
too big. Either way, unzip does not work on it, although it works for test.zip.

In [0]:
!mkdir /content/data
print('\nDownloading the data')
download = drive.CreateFile({'id': '1e5u2uDQ5mupV64KJbmWVG-JhlGKz9uUD'})
download.GetContentFile('/content/data_shell.tar.gz')
!tar -xzvf /content/data_shell.tar.gz -C /content/data

# Data Visualization and Basic Cleaning (can skip and go to Model directly)

First load some packages for Data Visualization.

In [0]:
import os
import wave  # needed by the frame-count and clip-inspection cells of this section
import warnings

import librosa  # needed by the MFCC cell of this section
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy.io import wavfile  # needed by the waveform-plot cell of this section

# Fixed seed so that the random samples shown in this section are reproducible.
np.random.seed(1989)

pd.options.display.max_columns = 20

warnings.filterwarnings("ignore", category=FutureWarning)

Now load the spreadsheet data to explore it a little bit.

In [0]:
dir_data = '/content/data/'
# Every following cell of this section reads `train` and `test`, so they have to
# be loaded here for the notebook to run from a fresh kernel.
# NOTE(review): assumes the extracted archive puts both CSVs directly under dir_data — confirm.
train = pd.read_csv(os.path.join(dir_data, 'train_curated.csv'))
test = pd.read_csv(os.path.join(dir_data, 'sample_submission.csv'))

Some files from the training set are corrupted (very few) and won't unzip; we erase those from the spreadsheet.

In [0]:
# Names of the entries found directly inside dir_data.
# NOTE(review): nothing is filtered out of `train` here yet — confirm whether that step is still needed.
files_in_folder = list(os.listdir(dir_data))

Look at the shape and number of examples and labels. 

In [0]:
# Row count and number of distinct label strings of the training spreadsheet.
n_train_examples = train.shape[0]
n_train_labels = len(set(train.labels))
print(f'Train set has {n_train_examples} examples and {n_train_labels} different labels')
train.sample(3)

In [0]:
# Every column after the first one of the sample submission is counted as a label.
n_test_examples = test.shape[0]
n_test_labels = test.columns[1:].size
print(f'Test set has {n_test_examples} examples and {n_test_labels} different labels')
test.sample(3)

We will for now exclude the sounds which have multiple labels.

In [0]:
# Keep only the rows whose `labels` string is exactly one of the submission's
# label columns. .copy() yields an independent frame, so adding columns to
# `train` later does not trigger pandas' SettingWithCopyWarning.
train = train[train['labels'].isin(test.columns[1:])].copy()
print(f'Removing multilabel examples. The Train set has now {train.shape[0]} examples and {len(set(train.labels))} different labels')

Let's visualize how many samples there are per label.

In [0]:
# Number of clips per label, as a one-column frame named 'counts'.
# size() does not depend on how many other columns `train` has, so this cell
# stays re-runnable after extra columns have been added to `train`.
category_group = train.groupby('labels').size().to_frame('counts')
print(f'The number of training clips per label range from {category_group.counts.min()} to {category_group.counts.max()}')
plot = category_group.sort_values(ascending=True, by='counts').plot(kind='barh', title='nbr training audio clips per label', figsize=(20,12))
plt.show()

Let's now visualize the distribution of their length:



We first only look at the top 25 categories

In [0]:
def count_frames(path):
    """Return the number of audio frames of the WAV file at `path`."""
    # The context manager closes the file as soon as the header has been read,
    # instead of leaving one open handle per clip.
    with wave.open(path) as wav_file:
        return wav_file.getnframes()

train['nframes'] = train['fname'].apply(lambda fname: count_frames(os.path.join(dir_data, 'curated', fname)))
test['nframes'] = test['fname'].apply(lambda fname: count_frames(os.path.join(dir_data, 'test', fname)))
# plot the distribution of the top 25 categories of the training set, since 74 is a little bit too much
idx_25_top = category_group.sort_values(ascending=True, by='counts').index[-25:]
_, ax = plt.subplots(figsize=(25,10))
sns.violinplot(data=train[train.labels.isin(idx_25_top)], x='labels', y='nframes', ax=ax)
plt.xticks(rotation=90)
plt.show()

And now compare the training and the test set.

In [0]:
# Clip-length histograms: training set on the top axis, test set on the bottom one.
fig, ax = plt.subplots(2,1, figsize=(20,8))
for frame, axis in zip((train, test), ax):
    frame.nframes.plot(kind='hist', bins=100, rwidth=0.5, ax=axis)
plt.show()

We can see that it's not the same scale, there are some outliers in the training set stretching the x-axis. We'll get rid of those.

In [0]:
# Clips with at least this many frames are treated as outliers.
MAX_NFRAMES = 1500000
# Report exactly the rows that the filter below removes (threshold included).
is_outlier = train['nframes'] >= MAX_NFRAMES
print(f'Here are the outliers: \n{train[is_outlier]}')
train = train[train['nframes'] < MAX_NFRAMES]
print('We remove it!')

Let's pick a random audio clip and look at its attributes.

In [0]:
import IPython

# Draw one random training row and resolve the path of its audio file.
rand_ex = train.sample(1)
fname = rand_ex['fname'].values[0]
path_audiofile = os.path.join(dir_data, 'curated', fname)

# Read the header inside a context manager so the file handle is closed afterwards.
with wave.open(path_audiofile) as wav:
    frame_rate = wav.getframerate()
    n_frames = wav.getnframes()

print(f'Filename is {fname}')
print(f'Sampling frame rate {frame_rate}')
print(f'Total frames {n_frames}')
print(f'Duration {n_frames / frame_rate} sec')
print(f'Label is {rand_ex.labels.values[0]} \n' )
# Last expression of the cell: renders an audio player for the clip.
IPython.display.Audio(path_audiofile)

In [0]:
rate, data = wavfile.read(path_audiofile)
# Whole waveform, drawn on the implicit current figure.
plt.plot(data)
# First 500 samples on a new wide figure, once as dots and once as a line.
plt.figure(figsize=(18,4))
first_samples = data[:500]
plt.plot(first_samples, '.')
plt.plot(first_samples, '-')
plt.show()

Calculate MFCC and visualize it.

In [0]:
SAMPLE_RATE = 44100
CLIP_SECONDS = 2
wav, _ = librosa.core.load(path_audiofile, sr=SAMPLE_RATE)
# Keep only the first CLIP_SECONDS seconds; the slice is expressed through
# SAMPLE_RATE so it stays correct if the rate is changed.
wav = wav[:CLIP_SECONDS * SAMPLE_RATE]
# The signal is passed by keyword (`y=`), the form accepted by every librosa version.
mfcc = librosa.feature.mfcc(y=wav, sr=SAMPLE_RATE, n_mfcc=40)
print(f'Shape of the MFCC is {mfcc.shape}')
_, ax = plt.subplots(figsize=(15, 5))
ax.imshow(mfcc, cmap='Spectral', interpolation='nearest')

Finally we now save this cleaned dataframe.

In [0]:
# index=False: the row index carries no information, and writing it would add a
# stray 'Unnamed: 0' column when the file is read back.
train.to_csv(os.path.join(dir_data, 'train_curated_clean.csv'), index=False)

# Construct The Model

First (Re)Load some Packages

In [0]:
import numpy as np
np.random.seed(1989)

import os, shutil

import librosa, scipy, wave
from scipy.io import wavfile

import pandas as pd

from sklearn.model_selection import StratifiedKFold
from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.callbacks import (EarlyStopping, LearningRateScheduler, ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras.layers import (Convolution1D, Dense, Dropout, GlobalAveragePooling1D, GlobalMaxPool1D, Input, MaxPool1D, concatenate) # for the 1D conv models
from keras.layers import (Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten, GlobalMaxPool2D, MaxPool2D, Activation, concatenate) # for the 2D models with MFCC
from keras.utils import Sequence, to_categorical
from keras import backend as K

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

Using TensorFlow backend.


Load the data, add some columns and set up some dictionaries.

In [0]:
# NOTE(review): the visualization section uses '/content/data/' as dir_data and
# saves the cleaned CSV there — confirm that the files read below really live in '/content'.
dir_data = '/content'
train = pd.read_csv(os.path.join(dir_data, 'train_curated_clean.csv'))
test = pd.read_csv(os.path.join(dir_data, 'sample_submission.csv'))

# Label strings in order of first appearance, and the reverse lookup label -> integer id.
labels = list(train.labels.unique())
label_to_idx = {label: i for i, label in enumerate(labels)}

# Index both frames by file name (returns new frames instead of mutating in place).
train = train.set_index("fname")
test = test.set_index("fname")
train["label_idx"] = train.labels.map(label_to_idx)

# One row per label with its number of clips; len(category_group) is the number of classes.
category_group = train.groupby('labels').size().to_frame('counts')

# these are some parameters for the size of the run.
tiny_run = 100
small_run = 1000
all_run = None

Some Preliminary Functions:

The Normalization function - Simple normalization in the range [0,1], and then shift to 0-mean [-0.5,0.5].

In [0]:
def audio_norm(data):
    """Min-max scale `data` to roughly [0, 1], then shift it by -0.5.

    A small constant (1e-6) is added to the value range so that a constant
    signal does not cause a division by zero. The input is not modified.
    """
    lowest, highest = np.min(data), np.max(data)
    value_range = highest - lowest + 1e-6
    scaled = (data - lowest) / value_range
    return scaled - 0.5

The Configuration class : In particular contains the default settings.

In [0]:
class Config(object):
    """Settings of one training run plus the input shape derived from them.

    Parameters
    ----------
    model_name : str or None
        Name of the model to build. When None, it defaults to 'dummy2d' if
        `use_mfcc` is true and to 'dummy1d' otherwise.
    sampling_rate : int
        Sampling rate, in Hz, the clips are resampled to.
    audio_duration : int
        Length, in seconds, every clip is cropped or padded to.
    use_mfcc : bool
        Feed MFCC features (2D input) instead of the raw waveform (1D input).
    n_mfcc : int
        Number of MFCC coefficients; only used when `use_mfcc` is true.
    n_classes : int
        Number of output classes.
    n_folds : int
        Number of cross-validation folds.
    learning_rate : float
        Learning rate handed to the optimizer.
    max_epochs : int
        Maximum number of training epochs.

    Attributes
    ----------
    audio_length : number of samples per clip (`audio_duration * sampling_rate`).
    dim : shape of one model input, without the batch axis.
    """

    def __init__(self, model_name = None, sampling_rate=16000, audio_duration=2, use_mfcc=False, n_mfcc=20, n_classes=len(category_group), n_folds=5, learning_rate=0.0001, max_epochs=10):
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc
        self.n_classes = n_classes
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs

        # The default name must be stored on the instance: assigning it to the
        # local `model_name` only would leave self.model_name set to None.
        if model_name is None:
            model_name = 'dummy2d' if use_mfcc else 'dummy1d'
        self.model_name = model_name

        self.audio_length = self.audio_duration * self.sampling_rate

        if self.use_mfcc:
            # One MFCC frame per 512 samples plus one.
            # NOTE(review): 512 presumably mirrors librosa's default hop_length — confirm.
            self.dim = (self.n_mfcc, 1 + int(np.floor(self.audio_length / 512)), 1)
        else:
            self.dim = (self.audio_length, 1)

The Datagenerator class : Inherits from Keras.utils.Sequence to efficiently feed the data.

I think that the normalization is done batch by batch, which is not good, as it's done the same way on the test set. 

In [0]:
class DataGenerator(Sequence): # Inherits from Keras.utils.Sequence for multiprocessing
    """Batch generator that loads audio clips from disk.

    Each clip is resampled to `config.sampling_rate`, randomly cropped or
    zero-padded to `config.audio_length` samples, and then either converted to
    MFCC features (`config.use_mfcc`) or passed through `preprocessing_fn`.

    Parameters
    ----------
    config : object providing sampling_rate, audio_length, use_mfcc, n_mfcc,
        n_classes and dim.
    dir_data : folder containing the audio files.
    list_IDs : indexable collection of file names (relative to `dir_data`).
    labels : mapping from file name to integer class id, or None to yield
        inputs only.
    batch_size : number of clips per batch (the last batch may be smaller).
    shuffle : reshuffle the clip order at every `on_epoch_end` call.
    preprocessing_fn : function applied to each raw waveform; not used when
        `config.use_mfcc` is true.
    """

    def __init__(self, config, dir_data, list_IDs, labels=None,
                 batch_size=64, shuffle=False, preprocessing_fn=lambda x: x):
        self.config = config
        self.dir_data = dir_data
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.preprocessing_fn = preprocessing_fn
        self.shuffle = shuffle
        self.dim = self.config.dim
        # Builds the initial ordering of the clips.
        self.on_epoch_end()

    # returns the number of batches in the Sequence (usually per 1 epoch)
    def __len__(self):
        return int(np.ceil(len(self.list_IDs) / self.batch_size))

    # returns a complete batch at the place index
    def __getitem__(self, index):
        batch_positions = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        batch_ids = [self.list_IDs[k] for k in batch_positions]
        return self.__data_generation(batch_ids)

    # called at the end of an epoch: reloads IDs
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _crop_or_pad(self, data):
        """Return `data` randomly cropped or zero-padded to config.audio_length samples."""
        input_length = self.config.audio_length
        if len(data) > input_length:
            # Too long: keep a window starting at a random offset.
            offset = np.random.randint(len(data) - input_length)
            return data[offset:offset+input_length]
        if len(data) < input_length:
            # Too short: place the clip at a random offset inside a zero-filled window.
            offset = np.random.randint(input_length - len(data))
            return np.pad(data, (offset, input_length-offset-len(data)), 'constant')
        return data

    def __data_generation(self, list_IDs_temp):
        """Load the given clips; return X, or (X, one-hot y) when labels are set."""
        cur_batch_size = len(list_IDs_temp)
        X = np.empty((cur_batch_size, *self.dim))

        for i, ID in enumerate(list_IDs_temp):
            file_path = os.path.join(self.dir_data, ID)

            # read and resample
            data, _ = librosa.core.load(file_path, sr=self.config.sampling_rate, res_type='kaiser_fast')

            # random crop or pad
            data = self._crop_or_pad(data)

            # normalize + other preprocesses
            if self.config.use_mfcc:
                # The signal is passed by keyword (`y=`), the form accepted by every librosa version.
                data = librosa.feature.mfcc(y=data, sr=self.config.sampling_rate, n_mfcc=self.config.n_mfcc)
                # Add a trailing channel axis. preprocessing_fn is deliberately
                # skipped here: the 2D models apply BatchNormalization instead.
                data = np.expand_dims(data, -1)
            else:
                data = self.preprocessing_fn(data)[:, np.newaxis]

            # save in the big array X
            X[i,] = data

        if self.labels is None:
            return X

        y = np.array([self.labels[ID] for ID in list_IDs_temp], dtype=int)
        return X, to_categorical(y, num_classes=self.config.n_classes)

Create a dummy model: Just to debug and test the pipeline. Name = dummy1d

In [0]:
def get_1d_dummy_model(config):
    """Build and compile the minimal 1D model used to debug the pipeline.

    The raw waveform of shape (config.audio_length, 1) goes through a global
    max pooling followed by a softmax layer with config.n_classes outputs.
    Compiled with Adam (config.learning_rate), categorical cross-entropy and
    the accuracy metric.
    """
    waveform = Input(shape=(config.audio_length, 1))
    pooled = GlobalMaxPool1D()(waveform)
    class_probs = Dense(config.n_classes, activation=softmax)(pooled)

    model = models.Model(inputs=waveform, outputs=class_probs)
    model.compile(
        optimizer=optimizers.Adam(config.learning_rate),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model

Create a better model: Still simple architecture though. Name = conv1d

In [0]:
def get_1d_conv_model(config):
    """Build and compile the 1D convolutional model working on raw waveforms.

    Three stages of (conv, conv, max-pool, dropout) are followed by two
    256-filter convolutions, a global max pooling, dropout, two dense layers
    and a softmax layer with config.n_classes outputs. Compiled with Adam
    (config.learning_rate), categorical cross-entropy and the accuracy metric.
    """
    # (number of filters, kernel size, pooling size) of each pooled stage.
    pooled_stages = [(16, 9, 16), (32, 3, 4), (32, 3, 4)]

    inp = Input(shape=(config.audio_length, 1))
    x = inp
    for n_filters, kernel_size, pool_size in pooled_stages:
        for _ in range(2):
            x = Convolution1D(n_filters, kernel_size, activation=relu, padding="valid")(x)
        x = MaxPool1D(pool_size)(x)
        x = Dropout(rate=0.1)(x)

    for _ in range(2):
        x = Convolution1D(256, 3, activation=relu, padding="valid")(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(rate=0.2)(x)

    x = Dense(64, activation=relu)(x)
    # NOTE(review): 1028 units is unusual (1024 may have been intended) — confirm.
    x = Dense(1028, activation=relu)(x)
    out = Dense(config.n_classes, activation=softmax)(x)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=optimizers.Adam(config.learning_rate),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model

Here is the dummy 2d model that applies to the audio data after MFCC. Name = dummy2d

In [0]:
def get_2d_dummy_model(config):
    """Build and compile the minimal 2D model used to debug the MFCC pipeline.

    The input of shape (config.dim[0], config.dim[1], 1) goes through a global
    max pooling followed by a softmax layer with config.n_classes outputs.
    Compiled with Adam (config.learning_rate), categorical cross-entropy and
    the accuracy metric.
    """
    n_rows, n_cols = config.dim[0], config.dim[1]

    features = Input(shape=(n_rows, n_cols, 1))
    pooled = GlobalMaxPool2D()(features)
    class_probs = Dense(config.n_classes, activation=softmax)(pooled)

    model = models.Model(inputs=features, outputs=class_probs)
    model.compile(
        optimizer=optimizers.Adam(config.learning_rate),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model

And the larger 2D conv model. Name = conv2d

In [0]:
def get_2d_conv_model(config):
    """Build and compile the 2D convolutional model working on MFCC inputs.

    Four identical stages (32-filter 4x10 convolution, batch normalization,
    ReLU, max pooling) are followed by a flatten, a 64-unit dense layer with
    batch normalization and ReLU, and a softmax layer with config.n_classes
    outputs. Compiled with Adam (config.learning_rate), categorical
    cross-entropy and the accuracy metric.
    """
    n_conv_stages = 4

    inp = Input(shape=(config.dim[0], config.dim[1], 1))
    x = inp
    for _ in range(n_conv_stages):
        x = Convolution2D(32, (4,10), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D()(x)

    x = Flatten()(x)
    x = Dense(64)(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    out = Dense(config.n_classes, activation=softmax)(x)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=optimizers.Adam(config.learning_rate),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model

# Run The Model!

Set-up the Paths for the data

In [0]:
def reset_dir(path):
    """Delete the folder `path` if it exists, then recreate it empty."""
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)

# Both output folders start empty. dir_log is created explicitly because the
# training loop saves model files into it.
dir_log = os.path.join(dir_data, 'logs')
reset_dir(dir_log)

dir_pred = os.path.join(dir_data, 'pred')
reset_dir(dir_pred)

# Folders holding the training and test audio clips.
dir_train = os.path.join(dir_data, 'curated')
dir_test = os.path.join(dir_data, 'test')

Creating the config and other necessary objects

In [0]:
# Settings of this run: 5-second clips at 44.1 kHz, 40 MFCC coefficients,
# the 'conv2d' model, 3 folds and at most 50 epochs.
config = Config(sampling_rate=44100, model_name = 'conv2d', audio_duration=5, n_folds=3, learning_rate=0.001, max_epochs=50, use_mfcc = True, n_mfcc = 40)

# Splitter used by the training loop to produce the cross-validation folds.
skf = StratifiedKFold(n_splits=config.n_folds)

size_of_run = all_run # choose in [tiny_run, small_run, all_run]

# Keep only the first size_of_run rows of each frame (all_run is None, which keeps every row).
train = train[:size_of_run]
test = test[:size_of_run]

Run the Loop

In [0]:
# Start from empty output folders. dir_log is created explicitly because
# model.save() below writes into it.
for folder in (dir_log, dir_pred):
    if os.path.isdir(folder):
        shutil.rmtree(folder)
    os.makedirs(folder)

# Maps config.model_name to its builder; any other name falls back to the 1D dummy model.
model_builders = {
    'conv1d': get_1d_conv_model,
    'conv2d': get_2d_conv_model,
    'dummy2d': get_2d_dummy_model,
}

def make_generator(directory, frame, with_labels=True):
    """Return a fresh DataGenerator over the clips indexed by `frame`, read from `directory`."""
    frame_labels = frame.label_idx if with_labels else None
    return DataGenerator(config, directory, frame.index, labels=frame_labels,
                         batch_size=64) #, preprocessing_fn=audio_norm)

for i, (train_split, val_split) in enumerate(skf.split(train.index, train.label_idx)):
    K.clear_session()

    train_set = train.iloc[train_split]
    val_set = train.iloc[val_split]

    # No ModelCheckpoint: it was causing issues with colab and broke in the middle
    # of training. The workaround is restore_best_weights from EarlyStopping.
    # NOTE(review): restore_best_weights presumably only applies when early
    # stopping actually triggers — confirm for the installed Keras version.
    early = EarlyStopping(monitor='val_loss', mode='min', patience=5, restore_best_weights=True)
    tb = TensorBoard(log_dir= os.path.join(dir_log, 'fold_%d'%i), write_graph=True)
    callbacks_list = [early, tb]

    print("#"*50)
    print(f'\nFold {i}')

    build_model = model_builders.get(config.model_name, get_1d_dummy_model)
    model = build_model(config)

    train_generator = make_generator(dir_train, train_set)
    val_generator = make_generator(dir_train, val_set)

    history = model.fit_generator(train_generator, callbacks=callbacks_list, validation_data=val_generator, epochs=config.max_epochs, use_multiprocessing=True, max_queue_size=20)

    # Persist the trained model of this fold. Reloading these weights right
    # away would be a no-op, so the in-memory model is used directly below.
    model.save(os.path.join(dir_log,'best_%d.h5'%i))

    # save train prediction for error analysis
    train_generator = make_generator(dir_train, train_set)
    predictions = model.predict_generator(train_generator, use_multiprocessing=True, max_queue_size=20, verbose=1)
    np.save(os.path.join(dir_pred, 'train_pred_%d.npy'%i), predictions)

    # save test prediction
    test_generator = make_generator(dir_test, test, with_labels=False)
    predictions = model.predict_generator(test_generator, use_multiprocessing=True, max_queue_size=20, verbose=1)
    np.save(os.path.join(dir_pred, 'test_pred_%d.npy'%i), predictions)
    pred_test_shape = predictions.shape

    # Make a submission file: the three most probable labels of each clip, best first.
    top_3 = np.array(labels)[np.argsort(-predictions, axis=1)[:, :3]]
    predicted_labels = [' '.join(list(x)) for x in top_3]
    test['label'] = predicted_labels
    test[['label']].to_csv(os.path.join(dir_pred, 'predictions_%d.csv'%i))

Ensemble the Predictions

In [0]:
# Per-fold test predictions written by the training loop.
pred_list = [np.load(os.path.join(dir_pred, 'test_pred_%d.npy'%i)) for i in range(config.n_folds)]

# Taking a geometric mean of the probabilities: product over the folds, then the n-th root.
prediction = pred_list[0].copy()
for pred in pred_list[1:]:
    prediction = prediction*pred
prediction_gm = prediction**(1./len(pred_list))

# Make a submission file: the three most probable labels of each clip, best first.
top_3 = np.array(labels)[np.argsort(-prediction_gm, axis=1)[:, :3]]

predicted_labels = [' '.join(list(x)) for x in top_3]

submission = pd.read_csv(os.path.join(dir_data, 'sample_submission.csv'))

# `test` may have been truncated to size_of_run rows before predicting; keep only
# the rows that have a prediction so the assignment below cannot fail on a
# length mismatch.
submission = submission.iloc[:len(predicted_labels)].copy()

submission['label'] = predicted_labels
submission[['fname', 'label']].to_csv(os.path.join(dir_data, 'submission.csv'), index=False)

We now save the logs, the predictions and the submission file.

In [0]:
from datetime import datetime

# Stamp of the form 'MMDD_HHMM', used as prefix of the archive names.
# strftime is used instead of slicing str(datetime.now()): that string has no
# '.' when the microsecond field is 0, which made the slicing return a wrong stamp.
datetime_now = datetime.now().strftime('%m%d_%H%M')

# Each step zips one output into /content/<stamp>_<kind>.zip and then asks
# Colab to download that archive. The zip commands use the literal /content
# path while the download path is built from dir_data, so the two only agree
# while dir_data is '/content'.

# save the logs
zipname = datetime_now + '_log.zip'
!zip -r /content/"$zipname" /content/logs
files.download(os.path.join(dir_data, zipname))

# save the preds
zipname = datetime_now + '_pred.zip'
!zip -r /content/"$zipname" /content/pred
files.download(os.path.join(dir_data, zipname))

# save the submission
zipname = datetime_now + '_submission.zip'
!zip -r /content/"$zipname" /content/submission.csv
files.download(os.path.join(dir_data, zipname))