In [9]:
%load_ext autoreload
%autoreload 2
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from keras import backend as K
from keras.layers import (Activation, BatchNormalization, Conv2D, Dense,
                          Dropout, Flatten, GlobalAveragePooling2D,
                          MaxPooling2D)
from keras.models import Sequential
from keras.optimizers import Adadelta, Adam
from keras.preprocessing.image import ImageDataGenerator
from matplotlib import pyplot as plt

import helper
import kfold_keras
from helper import (dict_product, filter_guided, filter_lee,
                    plot_training_history, prepare_data)
from statoil_models import Simple_CNN
from statoil_models import vgg16_finetune

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Data Augmentation 

In this notebook we perform a grid search to find the optimal data augmentation parameter. We try a range of different settings and check the performance on the validation set. 

##  Load Data

we are using the data treatment that yielded the best results so far: Guided Prefilter, with no scaling

In [3]:
data_folder = Path("data")
train_file = data_folder / 'train.json'
test_file = data_folder / 'test.json'

train = pd.read_json(train_file) 
print('loaded training '+str(len(train)))


y=train['is_iceberg'].values ## convert pandas series to numpy array
X=prepare_data(train,dim=[0,1,2],filter_function=filter_guided,rnd=False,scale=None)
input_shape= X[0].shape
print('filter: guided')

loaded training 1604
filter: guided


## data augmentation for the VGG16
We only use the fine tuned VGG16 model to test the data augmentation parameter

Setting up the grid search parameter

In [7]:
options = {"data_format": ['channels_last'], "rotation_range": [0,20,45],
           "horizontal_flip":[True,False],"vertical_flip": [True,False],
          "zoom_range":[0,0.15,0.3],"fill_mode":['nearest'],"featurewise_center":[False]}

grid =dict_product(options)
    

In [11]:
results_vgg16 = [] 
for name,datagen in enumerate(grid):
    model=vgg16_finetune(input_shape)
    (cvscores, roc_aucs,episodes,val_loss) = kfold_keras.k_fold_keras_early_stop(model,X,y
                                                ,k=5,name=str(name),train_at_end=False
                                                ,datagen=ImageDataGenerator(datagen),batch_size=128,patience=20)
    results_vgg16.append(np.mean(val_loss))

0                   : 100%|██████████| 5/5 [03:50<00:00, 45.57s/it, Acc=90.9, Epi=54.6, ROC_AUC=0.97, vloss=0.234] 
1                   : 100%|██████████| 5/5 [02:15<00:00, 28.75s/it, Acc=90.6, Epi=30.6, ROC_AUC=0.969, vloss=0.233]
2                   : 100%|██████████| 5/5 [03:16<00:00, 33.85s/it, Acc=90.5, Epi=46.6, ROC_AUC=0.969, vloss=0.235]
3                   : 100%|██████████| 5/5 [02:59<00:00, 37.55s/it, Acc=90.7, Epi=40.2, ROC_AUC=0.969, vloss=0.234]
4                   : 100%|██████████| 5/5 [03:11<00:00, 36.73s/it, Acc=90.8, Epi=45.6, ROC_AUC=0.969, vloss=0.235]
5                   : 100%|██████████| 5/5 [02:23<00:00, 28.41s/it, Acc=90.6, Epi=33.2, ROC_AUC=0.97, vloss=0.236] 
6                   : 100%|██████████| 5/5 [03:05<00:00, 38.31s/it, Acc=91.5, Epi=43.6, ROC_AUC=0.97, vloss=0.232] 
7                   : 100%|██████████| 5/5 [02:52<00:00, 32.57s/it, Acc=91, Epi=39.2, ROC_AUC=0.97, vloss=0.235]  
8                   : 100%|██████████| 5/5 [02:53<00:00, 32.84s/it, Acc=9

In [None]:
grid =dict_product(options)
best_aug_vgg16 = list(grid)[np.argmin(results_vgg16)]
print (f'The best run was {np.argmin(results_vgg16)} with the following settings:')
best_aug_vgg16

The above data augmentation settings lead to the best improvement for the VGG16 model

## data augmentation for the larger CNN best settings

Setting up the grid search parameter
using the small CNN to evaluate impact

In [None]:
options = {"data_format": ['channels_last'], "rotation_range": [0,20,45],
           "horizontal_flip":[True,False],"vertical_flip": [True,False],
          "zoom_range":[0,0.15,0.3],"fill_mode":['nearest'],"featurewise_center":[False]}


grid =dict_product(options)
   

In [None]:
results = [] 
for name,datagen in enumerate(grid):
    model = Larger_CNN(input_shape,width=4)
    (cvscores, roc_aucs,episodes,val_loss) = kfold_keras.k_fold_keras_early_stop(model,X,y
                                                ,k=5,name=str(name),train_at_end=False
                                                ,datagen=ImageDataGenerator(datagen),patience=20)
    results.append(np.mean(val_loss))

In [None]:
grid =dict_product(options)
results = np.where(np.isnan(results), 1, results)
best_aug_small_cnn = list(grid)[np.argmin(results)]
print (f'The best run was {np.argmin(results)} with the following settings:')
best_aug_small_cnn

## data augmentation for the small CNN best settings

Setting up the grid search parameter
using the small CNN to evaluate impact

In [4]:
options = {"data_format": ['channels_last'], "rotation_range": [0,20,45],
           "horizontal_flip":[True,False],"vertical_flip": [True,False],
          "zoom_range":[0,0.15,0.3],"fill_mode":['nearest'],"featurewise_center":[False]}

grid =dict_product(options)
   

In [5]:
results = [] 
for name,datagen in enumerate(grid):
    model = Simple_CNN(input_shape,width=2)
    (cvscores, roc_aucs,episodes,val_loss) = kfold_keras.k_fold_keras_early_stop(model,X,y
                                                ,k=5,name=str(name),train_at_end=False
                                                ,datagen=ImageDataGenerator(datagen),patience=20)
    results.append(np.mean(val_loss))

0                   : 100%|██████████| 5/5 [01:17<00:00, 14.71s/it, Acc=86.7, Epi=57.2, ROC_AUC=0.946, vloss=0.313]
1                   : 100%|██████████| 5/5 [02:19<00:00, 27.39s/it, Acc=71.5, Epi=66.2, ROC_AUC=0.772, vloss=7.55]
2                   : 100%|██████████| 5/5 [02:02<00:00, 20.70s/it, Acc=70.2, Epi=58.6, ROC_AUC=0.739, vloss=6.78]
3                   : 100%|██████████| 5/5 [02:43<00:00, 31.36s/it, Acc=87.2, Epi=71.8, ROC_AUC=0.951, vloss=0.323]
4                   : 100%|██████████| 5/5 [02:11<00:00, 25.22s/it, Acc=85.4, Epi=61.6, ROC_AUC=0.933, vloss=0.328]
5                   : 100%|██████████| 5/5 [02:27<00:00, 28.84s/it, Acc=76.9, Epi=55.4, ROC_AUC=0.85, vloss=3.93]  
6                   : 100%|██████████| 5/5 [00:30<00:00,  5.97s/it, Acc=53.1, Epi=23.2, ROC_AUC=0.5, vloss=16.2]
7                   : 100%|██████████| 5/5 [01:25<00:00, 13.64s/it, Acc=71, Epi=69.6, ROC_AUC=0.742, vloss=6.71]  
8                   : 100%|██████████| 5/5 [01:17<00:00, 14.98s/it, Acc=87.7, 

In [6]:
grid =dict_product(options)
results = np.where(np.isnan(results), 1, results)
best_aug_small_cnn = list(grid)[np.argmin(results)]
print (f'The best run was {np.argmin(results)} with the following settings:')
best_aug_small_cnn

The best run was 10 with the following settings:


{'data_format': 'channels_last',
 'featurewise_center': False,
 'fill_mode': 'nearest',
 'horizontal_flip': False,
 'rotation_range': 0,
 'vertical_flip': False,
 'zoom_range': 0.15}

In [12]:
grid =dict_product(options)
best_aug_small_cnn = list(grid)[19]

We will use the above data augmentation setup.While the score is slightly worse than the top selection, it applies more kinds of transformations to the data, making the resulting model more generalising.