In [15]:
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)

In [16]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from keras import backend as K
from matplotlib import pyplot as plt

import helper

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Using scikit learn models

Here we explore various scikit learn models to detect an iceberg:
1. using the image data that is band_1 and band_2
2. using the extracted features (the statistical moments) and the incidence angle

the image data does 

## loading the data

In [17]:
data_folder = Path("data")
train_file = data_folder / 'train.json'
test_file = data_folder / 'test.json'

train = pd.read_json(train_file) 
print('loaded training '+str(len(train)))
#test = pd.read_json(test_file) 
#print('loaded test '+str(len(train)))

y=train['is_iceberg'].values ## convert pandas series to numpy array
X=helper.prepare_data(train,dim=[0,1],filter_function=helper.filter_guided,rnd=False,scale=None)
input_shape= X[0].shape
print('filter: guided ',input_shape)

loaded training 1604
filter: guided  (75, 75, 2)


- flattening the data for scikit learn models

In [18]:
X=X.reshape(1604,75*75*2)
X.shape

(1604, 11250)

# Image data
These grid searches use only the image data

## Randomforest

In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,make_scorer

n_estimators_list = [100,1000,3000]
max_depth_list = [5,10,15]
param_grid = {'n_estimators': n_estimators_list, 'max_depth' : max_depth_list}
estimator = RandomForestClassifier()    
gridRF = GridSearchCV(estimator, param_grid, scoring=['accuracy','roc_auc','neg_log_loss'],  n_jobs=12,  cv=5,
                      verbose=2,refit='neg_log_loss')

In [66]:
gridRF.fit(X,y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] max_depth=5, n_estimators=100 ...................................
[CV] max_depth=5, n_estimators=100 ...................................
[CV] max_depth=5, n_estimators=100 ...................................
[CV] max_depth=5, n_estimators=100 ...................................
[CV] max_depth=5, n_estimators=100 ...................................
[CV] max_depth=5, n_estimators=1000 ..................................
[CV] max_depth=5, n_estimators=1000 ..................................
[CV] max_depth=5, n_estimators=1000 ..................................
[CV] max_depth=5, n_estimators=1000 ..................................
[CV] max_depth=5, n_estimators=1000 ..................................
[CV] max_depth=5, n_estimators=3000 ..................................
[CV] max_depth=5, n_estimators=3000 ..................................
[CV] .................... max_depth=5, n_estimators=100, total=   4.5s
[CV] max_depth=5,

[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.8min


[CV] .................. max_depth=10, n_estimators=1000, total= 1.2min
[CV] max_depth=10, n_estimators=3000 .................................
[CV] .................. max_depth=10, n_estimators=1000, total= 1.3min
[CV] max_depth=15, n_estimators=100 ..................................
[CV] .................. max_depth=10, n_estimators=1000, total= 1.3min
[CV] max_depth=15, n_estimators=100 ..................................
[CV] ................... max_depth=15, n_estimators=100, total=   8.8s
[CV] max_depth=15, n_estimators=100 ..................................
[CV] ................... max_depth=15, n_estimators=100, total=   9.0s
[CV] max_depth=15, n_estimators=100 ..................................
[CV] ................... max_depth=15, n_estimators=100, total=   9.3s
[CV] max_depth=15, n_estimators=100 ..................................
[CV] ................... max_depth=15, n_estimators=100, total=   8.9s
[CV] max_depth=15, n_estimators=1000 .................................
[CV] .

[Parallel(n_jobs=12)]: Done  45 out of  45 | elapsed:  7.3min remaining:    0.0s
[Parallel(n_jobs=12)]: Done  45 out of  45 | elapsed:  7.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=12,
       param_grid={'n_estimators': [100, 1000, 3000], 'max_depth': [5, 10, 15]},
       pre_dispatch='2*n_jobs', refit='neg_log_loss',
       return_train_score='warn',
       scoring=['accuracy', 'roc_auc', 'neg_log_loss'], verbose=2)

In [67]:
results=gridRF.cv_results_


In [68]:
print ( 'The best estimator scored:')
scores_names=['mean_test_roc_auc','mean_test_accuracy','mean_test_neg_log_loss']
best=np.argmax(results['mean_test_neg_log_loss'])
for s in scores_names:
    print(s,round(results[s][best],4))

The best estimator scored:
mean_test_roc_auc 0.8773
mean_test_accuracy 0.7936
mean_test_neg_log_loss -0.4796


In [71]:
gridRF.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [73]:
rndforest= RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## K NN

In [75]:
n_neighbors_list = [3,5,7]
weights_list = ['uniform', 'distance']
p_list = [1,2]
param_grid = {'n_neighbors': n_neighbors_list, 'weights' : weights_list,'p':p_list}
estimator = KNeighborsClassifier(n_jobs=3)    
gridKNN = GridSearchCV(estimator, param_grid, scoring=['accuracy','roc_auc','neg_log_loss'],  n_jobs=4,  cv=5,
                      verbose=2,refit='neg_log_loss')

gridKNN.fit(X,y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] n_neighbors=3, p=1, weights=uniform .............................
[CV] n_neighbors=3, p=1, weights=uniform .............................
[CV] n_neighbors=3, p=1, weights=uniform .............................
[CV] n_neighbors=3, p=1, weights=uniform .............................
[CV] .............. n_neighbors=3, p=1, weights=uniform, total=  12.2s
[CV] n_neighbors=3, p=1, weights=uniform .............................
[CV] .............. n_neighbors=3, p=1, weights=uniform, total=  12.1s
[CV] n_neighbors=3, p=1, weights=distance ............................
[CV] .............. n_neighbors=3, p=1, weights=uniform, total=  12.8s
[CV] n_neighbors=3, p=1, weights=distance ............................
[CV] .............. n_neighbors=3, p=1, weights=uniform, total=  13.1s
[CV] n_neighbors=3, p=1, weights=distance ............................
[CV] ............. n_neighbors=3, p=1, weights=distance, total=  13.3s
[CV] n_neighbors

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  8.9min


[CV] .............. n_neighbors=5, p=2, weights=uniform, total=  11.5s
[CV] n_neighbors=5, p=2, weights=distance ............................
[CV] .............. n_neighbors=5, p=2, weights=uniform, total=  11.7s
[CV] n_neighbors=5, p=2, weights=distance ............................
[CV] ............. n_neighbors=5, p=2, weights=distance, total=  11.7s
[CV] n_neighbors=5, p=2, weights=distance ............................
[CV] ............. n_neighbors=5, p=2, weights=distance, total=  11.4s
[CV] n_neighbors=7, p=1, weights=uniform .............................
[CV] ............. n_neighbors=5, p=2, weights=distance, total=  11.9s
[CV] n_neighbors=7, p=1, weights=uniform .............................
[CV] ............. n_neighbors=5, p=2, weights=distance, total=  12.4s
[CV] n_neighbors=7, p=1, weights=uniform .............................
[CV] ............. n_neighbors=5, p=2, weights=distance, total=  12.4s
[CV] n_neighbors=7, p=1, weights=uniform .............................
[CV] .

[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed: 14.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=3, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit='neg_log_loss',
       return_train_score='warn',
       scoring=['accuracy', 'roc_auc', 'neg_log_loss'], verbose=2)

In [76]:
results=gridKNN.cv_results_
print ( 'The best estimator scored:')
scores_names=['mean_test_roc_auc','mean_test_accuracy','mean_test_neg_log_loss']
best=np.argmax(results['mean_test_neg_log_loss'])
for s in scores_names:
    print(s,round(results[s][best],4))

The best estimator scored:
mean_test_roc_auc 0.808
mean_test_accuracy 0.7113
mean_test_neg_log_loss -1.0175


In [77]:
gridKNN.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=3, n_neighbors=7, p=1,
           weights='distance')

In [78]:
kNN= KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=3, n_neighbors=3, p=2,
           weights='uniform')

# Using only extracted features
In this section we use only the extracted moments and the incidence angle data. The instances that have missing values for the incidence angle are dropped


## Using only extracted features

In [80]:
train_angle = train.where(train['inc_angle'] != 'na')
y=train_angle['is_iceberg'].dropna(how='all')
X=train_angle.drop(['is_iceberg','band_1','band_2','id'],axis=1).dropna(how='all')

In [81]:
n_estimators_list = [100,1000,2000,3000]
max_depth_list = [5,10,12,15,20]
param_grid = {'n_estimators': n_estimators_list, 'max_depth' : max_depth_list}
estimator = RandomForestClassifier()    
gridRF_extract = GridSearchCV(estimator, param_grid, scoring=['accuracy','roc_auc','neg_log_loss'],  n_jobs=12,  cv=5,
                      verbose=2,refit='neg_log_loss')

In [82]:
gridRF_extract.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] max_depth=5, n_estimators=100 ...................................
[CV] max_depth=5, n_estimators=100 ...................................
[CV] max_depth=5, n_estimators=100 ...................................
[CV] max_depth=5, n_estimators=100 ...................................
[CV] max_depth=5, n_estimators=100 ...................................
[CV] max_depth=5, n_estimators=1000 ..................................
[CV] max_depth=5, n_estimators=1000 ..................................
[CV] max_depth=5, n_estimators=1000 ..................................
[CV] max_depth=5, n_estimators=1000 ..................................
[CV] max_depth=5, n_estimators=2000 ..................................
[CV] max_depth=5, n_estimators=2000 ..................................
[CV] max_depth=5, n_estimators=1000 ..................................
[CV] .................... max_depth=5, n_estimators=100, total=   0.2s
[CV] max_depth=

[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    4.9s


[CV] ................... max_depth=5, n_estimators=2000, total=   4.0s
[CV] max_depth=10, n_estimators=1000 .................................
[CV] ................... max_depth=5, n_estimators=2000, total=   4.2s
[CV] max_depth=10, n_estimators=2000 .................................
[CV] ................... max_depth=5, n_estimators=2000, total=   4.2s
[CV] max_depth=10, n_estimators=2000 .................................
[CV] .................. max_depth=10, n_estimators=1000, total=   2.4s
[CV] max_depth=10, n_estimators=2000 .................................
[CV] .................. max_depth=10, n_estimators=1000, total=   2.3s
[CV] max_depth=10, n_estimators=2000 .................................
[CV] ................... max_depth=5, n_estimators=3000, total=   5.9s
[CV] max_depth=10, n_estimators=2000 .................................
[CV] .................. max_depth=10, n_estimators=1000, total=   2.3s
[CV] max_depth=10, n_estimators=3000 .................................
[CV] .

[CV] ................... max_depth=20, n_estimators=100, total=   0.3s
[CV] max_depth=20, n_estimators=1000 .................................
[CV] ................... max_depth=20, n_estimators=100, total=   0.3s
[CV] max_depth=20, n_estimators=1000 .................................
[CV] .................. max_depth=15, n_estimators=2000, total=   5.2s
[CV] max_depth=20, n_estimators=1000 .................................
[CV] .................. max_depth=15, n_estimators=2000, total=   5.6s
[CV] max_depth=20, n_estimators=2000 .................................
[CV] .................. max_depth=15, n_estimators=2000, total=   5.6s
[CV] max_depth=20, n_estimators=2000 .................................
[CV] .................. max_depth=20, n_estimators=1000, total=   2.5s
[CV] max_depth=20, n_estimators=2000 .................................
[CV] .................. max_depth=20, n_estimators=1000, total=   2.8s
[CV] max_depth=20, n_estimators=2000 .................................
[CV] .

[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:   40.1s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=12,
       param_grid={'n_estimators': [100, 1000, 2000, 3000], 'max_depth': [5, 10, 12, 15, 20]},
       pre_dispatch='2*n_jobs', refit='neg_log_loss',
       return_train_score='warn',
       scoring=['accuracy', 'roc_auc', 'neg_log_loss'], verbose=2)

In [83]:
gridRF_extract.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [84]:
results=gridRF_extract.cv_results_
print ( 'The best estimator scored:')
scores_names=['mean_test_roc_auc','mean_test_accuracy','mean_test_neg_log_loss']
best=np.argmax(results['mean_test_neg_log_loss'])
for s in scores_names:
    print(s,round(results[s][best],4))

The best estimator scored:
mean_test_roc_auc 0.9539
mean_test_accuracy 0.9109
mean_test_neg_log_loss -0.2823
