In [4]:
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)

In [5]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from keras import backend as K
from matplotlib import pyplot as plt

import kfold_keras
from helper import extract_moments, filter_guided, prepare_data

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Soft voting ensemble
this notebook prepares the final model by combining the best one from the previous steps

## Load the data
using the best pre-processing as determined in the earlier steps: Guided filtering and no scaling

In [6]:
data_folder = Path("data")
train_file = data_folder / 'train.json'
test_file = data_folder / 'test.json'

train = pd.read_json(train_file) 
print('loaded training '+str(len(train)))
test = pd.read_json(test_file) 
print('loaded test '+str(len(train)))


loaded training 1604
loaded test 1604


## load and train the best models
### VGG16 transfer learning

This best model used the non-augmented data for best results, and achieved the best results after 5 training episodes.

In [7]:
y=train['is_iceberg'].values 
X=prepare_data(train,dim=[0,1,2],filter_function=filter_guided,rnd=False,scale=None)
input_shape= X[0].shape

In [8]:
from keras.preprocessing.image import ImageDataGenerator

aug_param = {'data_format': 'channels_last',
 'featurewise_center': False,
 'fill_mode': 'nearest',
 'horizontal_flip': False,
 'rotation_range': 20,
 'vertical_flip': True,
 'zoom_range': 0.15}
datagen = ImageDataGenerator(aug_param)


In [14]:
from statoil_models import vgg16_finetune
vgg16_model= vgg16_finetune(input_shape)


_,vgg16_ensemble= kfold_keras.k_fold_keras_early_stop(vgg16_model,X,y,k=5,name='VGG', batch_size=128,datagen=datagen,
                                                  train_at_end=True,patience =20)


VGG                 : 100%|██████████| 5/5 [04:39<00:00, 52.98s/it, Acc=90.7, Epi=43, ROC_AUC=0.969, vloss=0.236]


Created ensemble.


### CNN small
- the small CNN from scratch had single a fully connected layer with 256 nodes, and batch normalization
- data augmentation significantly improved the performance of the model

In [12]:
from statoil_models import Simple_CNN,vgg16_finetune
from keras.preprocessing.image import ImageDataGenerator
aug_param = {'data_format': 'channels_last',
 'featurewise_center': False,
 'fill_mode': 'nearest',
 'horizontal_flip': False,
 'rotation_range': 20,
 'vertical_flip': True,
 'zoom_range': 0.15}

small_cnn= Simple_CNN(input_shape,width=2)

In [13]:
datagen = ImageDataGenerator(aug_param)
_, small_cnn_ensemble = kfold_keras.k_fold_keras_early_stop(small_cnn,X,y,k=5,name='small_CNN',
                                                  train_at_end=True,datagen=datagen,patience =20)

small_CNN           : 100%|██████████| 5/5 [01:49<00:00, 22.24s/it, Acc=87, Epi=50.8, ROC_AUC=0.949, vloss=0.32]   

Created ensemble.





### Random Forest on extracted features

In [15]:
train_exfeat = extract_moments(train)
test_exfeat = extract_moments(test)
# drop the missing angle values
train_angle=train.where(train['inc_angle'] != 'na')
y_exfeat=train_angle['is_iceberg'].dropna(how='all').values
X_exfeat=train_angle.drop(['is_iceberg','band_1','band_2','id'],axis=1).dropna(how='all').values

from sklearn.ensemble import RandomForestClassifier

rnd_frst = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

rnd_frst.fit(X_exfeat,y_exfeat)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Soft Voting classifier

The predictions of the best models are combined by averaging predictions of the 3 best models

In [16]:
X_test=prepare_data(test,dim=[0,1,2],filter_function=filter_guided,rnd=False,scale=None)
X_exfeat_test = test_exfeat.drop(['band_1','band_2','id'],axis=1).dropna(how='all').values

In [17]:
rnd_frst_proba=rnd_frst.predict_proba(X_exfeat_test)[:,1]

In [18]:
cnn_proba=small_cnn_ensemble.predict(X_test)

In [19]:
vgg16_proba=vgg16_ensemble.predict(X_test)

In [20]:
soft_voting= (rnd_frst_proba+np.squeeze(cnn_proba)+np.squeeze(vgg16_proba))/3

In [21]:
submission=pd.DataFrame({'id':test['id'],'is_iceberg':rnd_frst_proba})
submission.to_csv('submission_rnd_frst.csv',index=False)
submission=pd.DataFrame({'id':test['id'],'is_iceberg':np.squeeze(cnn_proba)})
submission.to_csv('submission_cnn.csv',index=False)
submission=pd.DataFrame({'id':test['id'],'is_iceberg':np.squeeze(vgg16_proba)})
submission.to_csv('submission_vgg16.csv',index=False)
submission=pd.DataFrame({'id':test['id'],'is_iceberg':soft_voting})
submission.to_csv('submission_ensemble.csv',index=False)

## Fine tuning the score

Unfortunately there was a data leak in this Kaggle competition. It came to general knowledge that a given label the tends to have similar values for the incidence angles, both on the testing and training set. To gauge the impact of this leak, we also applied this gained knowledge to our result and resubmitted the test submission. Just by applying the following rules, we were able to improve upon the final score (from 0.309 to 0.2652):

- Compare the incidence angle of the testing set instance
- if a similar angle exist  and the predictions agree change the label to the one from the training set, and use a high confidence prediction.
- If the label do not agree use majority voting to determine a new label, again use the high probabilities 

In [22]:
submission_tuned=pd.DataFrame({'id':test['id'],'is_iceberg':soft_voting})
max_prob=0.95
min_prob=0.05
agree=0
disagree=0
not_changed=0
for idx,angle in enumerate(test['inc_angle']):
        iceberg=train['is_iceberg'].loc[abs(train_angle['inc_angle'] - angle) < 0.001]
        if len(iceberg) > 0:
            #if np.mean(iceberg) not in [0.0, 1.0]:
                #print(np.mean(iceberg))
            is_iceberg = round(np.mean(iceberg))
            if round(soft_voting[idx]) == is_iceberg:
                agree+=1
                submission_tuned.at[idx,'is_iceberg'] =max_prob if is_iceberg else min_prob
            else:
                if np.mean(iceberg) != 0.5:
                    submission_tuned.at[idx,'is_iceberg'] =max_prob if is_iceberg else min_prob
                    disagree+=1
                else:
                    not_changed+=1
        else:
            not_changed+=1
print(agree,disagree,not_changed)
submission_tuned.to_csv('submission_ensemble_tuned.csv',index=False)

2477 400 5547
