<a href="https://colab.research.google.com/github/gheorghebg11/Shell/blob/master/Shell.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the files into Colab

The first cells are for loading the big data files from google drive (as colab only keeps the files for 12 hours), and the .csv from my local drive or dropbox. We first authenticate.

In [1]:
!pip install PyDrive
import os, zipfile
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth, files
from oauth2client.client import GoogleCredentials

auth.authenticate_user()

gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

Collecting PyDrive
[?25l  Downloading https://files.pythonhosted.org/packages/52/e0/0e64788e5dd58ce2d6934549676243dc69d982f198524be9b99e9c2a4fd5/PyDrive-1.3.1.tar.gz (987kB)
[K    100% |████████████████████████████████| 993kB 26.6MB/s 
Building wheels for collected packages: PyDrive
  Building wheel for PyDrive (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/fa/d2/9a/d3b6b506c2da98289e5d417215ce34b696db856643bad779f4
Successfully built PyDrive
Installing collected packages: PyDrive
Successfully installed PyDrive-1.3.1


Now download the files and un unzip the archives. Right now there are 4 folders for 4 different gas station brands.

In [2]:
!rm -r /content/data
!mkdir /content/data
print('\nDownloading the data')
download = drive.CreateFile({'id': '1e5u2uDQ5mupV64KJbmWVG-JhlGKz9uUD'})
download.GetContentFile('/content/data_shell.tar.gz')
!tar -xzf /content/data_shell.tar.gz -C /content/data
print('\nData successfully downloaded in /content/data')

rm: cannot remove '/content/data': No such file or directory

Downloading the data


# Data Visualization and Basic Cleaning

First load some packages for Data Visualization.

In [0]:
import numpy as np
np.random.seed(1989)

import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from IPython.display import display
pd.options.display.max_columns = 20

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

Explore the data a little bit. We look at the available categories (each in one separate folder) and create a dictionary for one-hot encoding.

In [4]:
dir_data = '/content/data/'
labels = list(os.listdir(dir_data))
labels_to_idx = {label: i for i, label in enumerate(labels)}
print(labels_to_idx)

#train = pd.read_csv(os.path.join(dir_data, 'train_curated.csv'))
#test = pd.read_csv(os.path.join(dir_data, 'sample_submission.csv'))

{'chevron': 0, 'phillips': 1, 'conoco': 2, 'shell': 3}


We will now loop through all files and do a few tasks:
- check for corrupted files (ending in .part)
- check for all available extensions, in order to exlcude non-images
- rename each file in the formal 'label_oldfilename' and move them all in one single folder.

In [5]:
extensions = set()

for label in labels:
    dir_label = os.path.join(dir_data, label)
    filenames = [name for name in os.listdir(dir_label) if os.path.isfile(os.path.join(dir_label, name))]
    
    for filename in filenames:
        new_filename = label + '_' + filename
        os.rename(os.path.join(dir_label, filename), os.path.join(dir_data,new_filename))
        
        # check for the extension
        extension = filename.split('.')[-1]
        if extension not in extensions:
            extensions.add(extension)
    
    # erase the empty folder
    os.rmdir(os.path.join(dir_data, label))
    #shutil.rmtree(os.path.join(dir_data, label))
    
print(f'We found the extensions {extensions}')

# create a list with all filenames MAYBE PUT IN PANDAS ?
filenames = [name for name in os.listdir(dir_data) if os.path.isfile(os.path.join(dir_data, name))]

We found the extensions {'jpeg', 'jpg', 'png', 'part'}


We will now erase the files ending in .part as they are corrupted images that failed during data mining.

In [6]:
corrupted_files = 0
for filename in filenames:
    if filename.endswith('.part'):
        corrupted_files = corrupted_files + 1
        os.remove(os.path.join(dir_data, filename))
        filenames.remove(filename)
print(f'We removed {corrupted_files} corrupted files out of {len(filenames) + corrupted_files} total files.')

We removed 7 corrupted files out of 776 total files.


We now create a Pandas df with the data to explore it a little bit. 
#TODO

Look at the shape and number of examples and labels. 

In [0]:
print(f'Train set has {train.shape[0]} examples and {len(set(train.labels))} different labels')
train.sample(3)

In [0]:
print(f'Test set has {test.shape[0]} examples and {len(test.columns[1:])} different labels')
test.sample(3)

We will for now exclude the sounds which have multiple labels.

In [0]:
train = train[train['labels'].isin(test.columns[1:])]
print(f'Removing multilabel examples. The Train set has now {train.shape[0]} examples and {len(set(train.labels))} different labels')

Let's visualize how many samples there are per label.

In [0]:
category_group = train.groupby(['labels']).count()
category_group.columns = ['counts']
print(f'The number of training clips per label range from {category_group.counts.min()} to {category_group.counts.max()}')
plot = category_group.sort_values(ascending=True, by='counts').plot(kind='barh', title='nbr training audio clips per label', figsize=(20,12))
plt.show()

Let's now visualize the distribution of their length:



We first only look at the top 25 categories

In [0]:
train['nframes'] = train['fname'].apply(lambda fname : wave.open(os.path.join(dir_data, 'curated', fname)).getnframes())
test['nframes'] = test['fname'].apply(lambda fname : wave.open(os.path.join(dir_data, 'test', fname)).getnframes())
# plot the distribution of top 25 catetegories of the training set, since 74 is a little bit too much
idx_25_top = category_group.sort_values(ascending=True, by='counts').index[-25:]
_, ax = plt.subplots(figsize=(25,10))
sns.violinplot(data = train[train.labels.isin(idx_25_top)], x='labels', y='nframes')
plt.xticks(rotation=90)
plt.show()

And now compare the training and the test set.

In [0]:
fig, ax = plt.subplots(2,1, figsize=(20,8))
train.nframes.plot(kind='hist', bins=100, rwidth=0.5, ax = ax[0])
test.nframes.plot(kind='hist', bins=100, rwidth=0.5, ax = ax[1])
plt.show()

Let's pick a random audio clip and look at its attributes.

In [0]:
rand_ex = train.sample(1)
fname = rand_ex['fname'].values[0]
path_audiofile = os.path.join(dir_data, 'curated', fname)
wav = wave.open(path_audiofile)
print(f'Filename is {fname}')
print(f'Sampling frame rate {wav.getframerate()}')
print(f'Total frames {wav.getnframes()}')
print(f'Duration {wav.getnframes() / wav.getframerate()} sec')
print(f'Label is {rand_ex.labels.values[0]} \n' )
import IPython
IPython.display.Audio(path_audiofile)

Finally we now save this cleaned dataframe.

In [0]:
train.to_csv(os.path.join(dir_data, 'train_curated_clean.csv'))

# Construct The Model

First (Re)Load some Packages

In [0]:
import numpy as np
np.random.seed(1989)

import os, shutil, cv2
import pandas as pd

from albumentations import (Compose, HorizontalFlip, CLAHE, HueSaturationValue, RandomCrop,
    RandomBrightness, RandomContrast, RandomGamma, ToFloat, ShiftScaleRotate)


from sklearn.model_selection import StratifiedKFold
from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.callbacks import (EarlyStopping, LearningRateScheduler, ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras.layers import (Convolution1D, Dense, Dropout, GlobalAveragePooling1D, GlobalMaxPool1D, Input, MaxPool1D, concatenate) # for the 1D conv models
from keras.layers import (Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten, GlobalMaxPool2D, MaxPool2D, Activation, concatenate) # for the 2D models with MFCC
from keras.utils import Sequence, to_categorical
from keras import backend as K

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

Load the data, add some columns and set up some dictionnaries.

In [0]:
dir_data = '/content/data'
n_classes = len(labels)
#train = pd.read_csv(os.path.join(dir_data, 'train_curated_clean.csv'))
#test = pd.read_csv(os.path.join(dir_data, 'sample_submission.csv'))

#train.set_index("fname", inplace=True)
#test.set_index("fname", inplace=True)
#train["label_idx"] = train.labels.apply(lambda x: label_to_idx[x])

In [29]:
filenames_and_labels = [(filename,filename.split('_')[0]) for filename in filenames]

print(filenames_and_labels)

train = pd.DataFrame(filenames_and_labels, columns=['fname', 'label'])
train.head()

[('conoco_401.jpg', 'conoco'), ('chevron_292.jpg', 'chevron'), ('shell_194.jpg', 'shell'), ('shell_88.jpg', 'shell'), ('shell_14.jpg', 'shell'), ('conoco_283.jpg', 'conoco'), ('shell_276.jpg', 'shell'), ('chevron_37.jpg', 'chevron'), ('chevron_84.jpg', 'chevron'), ('chevron_44.jpg', 'chevron'), ('chevron_60.jpg', 'chevron'), ('chevron_119.jpg', 'chevron'), ('shell_278.jpg', 'shell'), ('phillips_154.jpg', 'phillips'), ('chevron_352.jpg', 'chevron'), ('conoco_114.jpg', 'conoco'), ('shell_191.jpg', 'shell'), ('conoco_370.jpg', 'conoco'), ('chevron_324.jpg', 'chevron'), ('conoco_356.jpg', 'conoco'), ('conoco_280.jpg', 'conoco'), ('shell_388.jpg', 'shell'), ('conoco_1.jpg', 'conoco'), ('shell_355.jpg', 'shell'), ('phillips_114.jpg', 'phillips'), ('conoco_136.png', 'conoco'), ('shell_73.jpg', 'shell'), ('conoco_203.jpg', 'conoco'), ('shell_3.jpg', 'shell'), ('shell_49.jpg', 'shell'), ('chevron_385.jpg', 'chevron'), ('shell_209.jpg', 'shell'), ('phillips_369.jpg', 'phillips'), ('phillips_39.j

Unnamed: 0,fname,label
0,conoco_401.jpg,conoco
1,chevron_292.jpg,chevron
2,shell_194.jpg,shell
3,shell_88.jpg,shell
4,shell_14.jpg,shell


The Configuration class : In particular contains the default settings.

In [0]:
class Config(object):
	def __init__(self, model_name = None, n_classes=n_classes, image_size=(64,64,3), n_folds=1, learning_rate=0.0001, max_epochs=10):

		self.n_classes = n_classes
		self.image_size = image_size
		self.n_folds = n_folds
		self.learning_rate = learning_rate
		self.max_epochs = max_epochs
		self.model_name = model_name

The Datagenerator class : Inherits from Keras.utils.Sequence to efficiently feed the data.

I think that the normalization is done batch by batch, which is not good, as it's done the same way on the test set. 

In [0]:
class DataGenerator(Sequence): # Inherits from Keras.utils.Sequence for multiprocessing

    def __init__(self, config, dir_data, list_IDs, labels=None,
					batch_size=16, shuffle=False, augmentation = None, preprocessing_fn=lambda x: x):
    
        self.config = config
        self.dir_data = dir_data
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.augment = augmentation
        self.preprocessing_fn = preprocessing_fn
        self.shuffle = shuffle ## DOES IT WORK TO HAVE TRUE ?
        self.on_epoch_end()

    # returns the number of batches in the Sequence (usually per 1 epoch)
    def __len__(self):
        return int(np.ceil(len(self.list_IDs) / self.batch_size))

    # returns a complete batch at the place index
    def __getitem__(self,index):
        indexes_temp = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes_temp]
        return self.__data_generation(list_IDs_temp)

    # called at the end of an epoch: reloads IDs
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            print('Got to the end of an epoch. Shuffling the dataset')
            np.random.shuffle(self.indexes)
    
    def __data_generation(self, list_IDs_temp):
        cur_batch_size = len(list_IDs_temp)
        X = np.empty((cur_batch_size, *self.config.image_size)) ### WHAT DIM FOR IMAGES ?
        
        for i,ID in enumerate(list_IDs_temp):
            file_path = os.path.join(self.dir_data, ID)
            
            # read the image
            img = cv2.cvtColor(cv2.imread(file_path), cv2.COLOR_RGB2BGR)
            
            
            # other preprocesses (not augmenting though)
            img = self.preprocessing_fn(img)  ## REMOVE THAT, SET NONE BY DEFAULT AND DO AN IF self.pre: ...etc
            
            # augment and save in the big array X
            if self.augment:
                X[i,] = self.augment(image=img)["image"]
        
        if self.labels is not None:
            y = np.empty(cur_batch_size, dtype=float)
            for i, ID in enumerate(list_IDs_temp):
                y[i] = self.labels[ID]
            
            return X, to_categorical(y, num_classes=self.config.n_classes)
        else:
            return X

Here is the dummy 2d model that applies to the audio data after MFCC. Name = dummy2d

In [0]:
def get_2d_dummy_model(config):
    
    nclass = config.n_classes
    
    inp = Input(shape=(config.dim[0],config.dim[1],1))
    x = GlobalMaxPool2D()(inp)
    out = Dense(nclass, activation=softmax)(x)

    model = models.Model(inputs=inp, outputs=out)
    opt = optimizers.Adam(config.learning_rate)

    model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['acc'])
    return model

And the larger 2D conv model. Name = conv2d

In [0]:
def get_2d_conv_model(config):
    
    nclass = config.n_classes
    
    inp = Input(shape=(config.dim[0],config.dim[1],1))
    x = Convolution2D(32, (4,10), padding="same")(inp)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = MaxPool2D()(x)
    
    x = Convolution2D(32, (4,10), padding="same")(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = MaxPool2D()(x)
    
    x = Convolution2D(32, (4,10), padding="same")(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = MaxPool2D()(x)
    
    x = Convolution2D(32, (4,10), padding="same")(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = MaxPool2D()(x)

    x = Flatten()(x)
    x = Dense(64)(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    out = Dense(nclass, activation=softmax)(x)

    model = models.Model(inputs=inp, outputs=out)
    opt = optimizers.Adam(config.learning_rate)

    model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['acc'])
    return model

# Run The Model!

Set-up the Paths for the data

In [0]:
dir_log = os.path.join(dir_data, 'logs')
if os.path.isdir(dir_log):
        shutil.rmtree(dir_log)

dir_pred = os.path.join(dir_data, 'pred')
if os.path.isdir(dir_pred):
    shutil.rmtree(dir_pred)
os.mkdir(dir_pred)

dir_train = os.path.join(dir_data)
#dir_test = os.path.join(dir_data, 'test')

Creating the config and other necessary objects

In [0]:
config = Config(n_folds=3, learning_rate=0.001, max_epochs=50)

skf = StratifiedKFold(n_splits=config.n_folds)

augmentation_train = Compose([ 
    RandomCrop(16,16),
    HorizontalFlip(p=0.5),
    RandomContrast(limit=0.2, p=0.5),
    RandomGamma(gamma_limit=(80, 120), p=0.5),
    RandomBrightness(limit=0.2, p=0.5),
    HueSaturationValue(hue_shift_limit=5, sat_shift_limit=20, val_shift_limit=10, p=.9),
    # CLAHE(p=1.0, clip_limit=2.0),
    ShiftScaleRotate(
        shift_limit=0.0625, scale_limit=0.1, 
        rotate_limit=15, border_mode=cv2.BORDER_REFLECT_101, p=0.8), 
    ToFloat(max_value=255)
])

augmentation_test = Compose([
    RandomCrop(16,16),
    # CLAHE(p=1.0, clip_limit=2.0),
    ToFloat(max_value=255)
])


#train = train[:size_of_run]
#test = test[:size_of_run]

Run the Loop

In [24]:
if os.path.isdir(dir_log):
        shutil.rmtree(dir_log)
if os.path.isdir(dir_pred):
    shutil.rmtree(dir_pred)
os.mkdir(dir_pred)

for i, (train_split, val_split) in enumerate(skf.split(train.index, train.label_idx)):
    K.clear_session()
    
    train_set = train.iloc[train_split]
    val_set = train.iloc[val_split]
    
    callbacks_list= []
    
    # the checkpoint is causing issues with colab and breaks in the middle of training. A workaround is the restore_best_weights from EarlyStopping
    #checkpoint = ModelCheckpoint(os.path.join(dir_log,'best_%d.h5'%i), monitor='val_loss', verbose=1, save_best_only=True)
    #callbacks_list.append(checkpoint)  
    
    early = EarlyStopping(monitor='val_loss', mode='min', patience=5, restore_best_weights=True)
    callbacks_list.append(early)
    
    tb = TensorBoard(log_dir= os.path.join(dir_log, 'fold_%d'%i), write_graph=True)
    callbacks_list.append(tb)    
    
    print("#"*50)
    print(f'\nFold {i}')
    
    if config.model_name == 'conv1d':
        model = get_1d_conv_model(config)
    elif config.model_name == 'conv2d':
        model = get_2d_conv_model(config)
    elif config.model_name == 'dummy2d':
        model = get_2d_dummy_model(config)
    else:
        model = get_1d_dummy_model(config)
    
    train_generator = DataGenerator(config, dir_train, train_set.index, labels=train_set.label_idx,
					batch_size=64) #, preprocessing_fn=audio_norm)
    val_generator = DataGenerator(config, dir_train, val_set.index, labels=val_set.label_idx,
					batch_size=64) #, preprocessing_fn=audio_norm)

    history = model.fit_generator(train_generator, callbacks=callbacks_list, validation_data=val_generator, epochs=config.max_epochs, use_multiprocessing=True, max_queue_size=20)
    
    model.save(os.path.join(dir_log,'best_%d.h5'%i))
    
    # training done, now load best model (at what epoch it was best) and predict
    model.load_weights(os.path.join(dir_log,'best_%d.h5'%i))
    
    # save train prediction for error analysis
    train_generator = DataGenerator(config, dir_train, train_set.index, labels=train_set.label_idx,
					batch_size=64) #, preprocessing_fn=audio_norm)
    
    predictions = model.predict_generator(train_generator, use_multiprocessing=True, max_queue_size=20, verbose=1)
    np.save(os.path.join(dir_pred, 'train_pred_%d.npy'%i), predictions)
    
    '''
    # save test prediction
    test_generator = DataGenerator(config, dir_test, test.index, labels=None,
					batch_size=64) #, preprocessing_fn=audio_norm)
    
    predictions = model.predict_generator(test_generator, use_multiprocessing=True, max_queue_size=20, verbose=1)
    np.save(os.path.join(dir_pred, 'test_pred_%d.npy'%i), predictions)
    pred_test_shape = predictions.shape

    # Make a submission file
    top_3 = np.array(labels)[np.argsort(-predictions, axis=1)[:, :3]]
    predicted_labels = [' '.join(list(x)) for x in top_3]
    test['label'] = predicted_labels
    test[['label']].to_csv(os.path.join(dir_pred, 'predictions_%d.csv'%i))
    '''

NameError: ignored

Ensemble the Predictions

In [0]:
pred_list = []

for i in range(config.n_folds):
    pred_list.append(np.load(os.path.join(dir_pred, 'test_pred_%d.npy'%i)))
    
prediction = np.ones_like(pred_list[0])

# Taking a geometric mean of the probabilities
for pred in pred_list:
    prediction = prediction*pred
prediction_gm = prediction**(1./len(pred_list))

# Make a submission file
top_3 = np.array(labels)[np.argsort(-prediction_gm, axis=1)[:, :3]]

predicted_labels = [' '.join(list(x)) for x in top_3]

submission = pd.read_csv(os.path.join(dir_data, 'sample_submission.csv'))

submission['label'] = predicted_labels
submission[['fname', 'label']].to_csv(os.path.join(dir_data, 'submission.csv'), index=False)

We now save the logs, pr

In [0]:
from datetime import datetime
datetime_now = str(datetime.now()).replace('-','').replace(':','').replace(' ','')
datetime_now = datetime_now[:datetime_now.find('.')]
datetime_now = datetime_now[4:8] + '_' + datetime_now[-6:-2]

# save the logs
zipname = datetime_now + '_log.zip'
!zip -r /content/"$zipname" /content/logs
files.download(os.path.join(dir_data, zipname))

# save the preds
zipname = datetime_now + '_pred.zip'
!zip -r /content/"$zipname" /content/pred
files.download(os.path.join(dir_data, zipname))

# save the submission
zipname = datetime_now + '_submission.zip'
!zip -r /content/"$zipname" /content/submission.csv
files.download(os.path.join(dir_data, zipname))