In [2]:
import os
import numpy as np 
import pandas as pd 

# Data Load

In [16]:
sub_path = "submissions"
all_files = os.listdir(sub_path)

# Read and concatenate submissions
outs = [pd.read_csv(os.path.join(sub_path, f), index_col=0) for f in all_files]
concat_sub = pd.concat(outs, axis=1)
cols = list(map(lambda x: "is_iceberg_" + str(x), range(len(concat_sub.columns))))
concat_sub.columns = cols
concat_sub.reset_index(inplace=True)
concat_sub.head()


['submission_nonorm.csv', 'submission_shiftzoom.csv', 'submission_newseed.csv', 'submission_original.csv', 'submission_noaugment.csv', 'submission_densenoaugment.csv']


Unnamed: 0,id,is_iceberg_0,is_iceberg_1,is_iceberg_2,is_iceberg_3,is_iceberg_4,is_iceberg_5
0,5941774d,0.014893,0.020135,0.030152,0.173533,0.133513,0.217252
1,4023181e,0.738189,0.651737,0.52581,0.782985,0.57295,0.58987
2,b20200e4,0.000181,0.000674,0.000444,2e-06,0.005628,2e-06
3,e7f018bb,0.986621,0.982486,0.98212,0.983651,0.971778,0.957472
4,4371c8c3,0.891878,0.820381,0.7718,0.630755,0.831036,0.337472


In [4]:
# check correlation
concat_sub.corr()

Unnamed: 0,is_iceberg_0,is_iceberg_1,is_iceberg_2,is_iceberg_3,is_iceberg_4,is_iceberg_5
is_iceberg_0,1.0,0.988481,0.979019,0.82672,0.969011,0.877193
is_iceberg_1,0.988481,1.0,0.990653,0.867007,0.97798,0.895832
is_iceberg_2,0.979019,0.990653,1.0,0.873313,0.980502,0.903381
is_iceberg_3,0.82672,0.867007,0.873313,1.0,0.868697,0.841379
is_iceberg_4,0.969011,0.97798,0.980502,0.868697,1.0,0.888093
is_iceberg_5,0.877193,0.895832,0.903381,0.841379,0.888093,1.0


In [5]:
# get the data fields ready for stacking
concat_sub['is_iceberg_max'] = concat_sub.iloc[:, 1:6].max(axis=1)
concat_sub['is_iceberg_min'] = concat_sub.iloc[:, 1:6].min(axis=1)
concat_sub['is_iceberg_mean'] = concat_sub.iloc[:, 1:6].mean(axis=1)
concat_sub['is_iceberg_median'] = concat_sub.iloc[:, 1:6].median(axis=1)

In [6]:
# set up cutoff threshold for lower and upper bounds, easy to twist 
cutoff_lo = 0.8
cutoff_hi = 0.2

# Mean Stacking

In [7]:
concat_sub['is_iceberg'] = concat_sub['is_iceberg_mean']
concat_sub[['id', 'is_iceberg']].to_csv('stack_mean.csv', 
                                        index=False, float_format='%.6f')

**LB 0.1698** , decent first try - still some gap comparing with our top-line model performance in stack.

# Median Stacking

In [8]:
concat_sub['is_iceberg'] = concat_sub['is_iceberg_median']
concat_sub[['id', 'is_iceberg']].to_csv('stack_median.csv', 
                                        index=False, float_format='%.6f')

**LB 0.1575**, very close with our top-line model performance, but we want to see some improvement at least.

# PushOut + Median Stacking 

Pushout strategy is a bit agressive given what it does...

In [9]:
concat_sub['is_iceberg'] = np.where(np.all(concat_sub.iloc[:,1:6] > cutoff_lo, axis=1), 1, 
                                    np.where(np.all(concat_sub.iloc[:,1:6] < cutoff_hi, axis=1),
                                             0, concat_sub['is_iceberg_median']))
concat_sub[['id', 'is_iceberg']].to_csv('stack_pushout_median.csv', 
                                        index=False, float_format='%.6f')

**LB 0.1940**, not very impressive results given the base models in the pipeline...

# MinMax + Mean Stacking

MinMax seems more gentle and it outperforms the previous one given its peformance score.

In [10]:
concat_sub['is_iceberg'] = np.where(np.all(concat_sub.iloc[:,1:6] > cutoff_lo, axis=1), 
                                    concat_sub['is_iceberg_max'], 
                                    np.where(np.all(concat_sub.iloc[:,1:6] < cutoff_hi, axis=1),
                                             concat_sub['is_iceberg_min'], 
                                             concat_sub['is_iceberg_mean']))
concat_sub[['id', 'is_iceberg']].to_csv('stack_minmax_mean.csv', 
                                        index=False, float_format='%.6f')

**LB 0.1622**, need to stack with Median to see the results.

# MinMax + Median Stacking 

In [11]:
concat_sub['is_iceberg'] = np.where(np.all(concat_sub.iloc[:,1:6] > cutoff_lo, axis=1), 
                                    concat_sub['is_iceberg_max'], 
                                    np.where(np.all(concat_sub.iloc[:,1:6] < cutoff_hi, axis=1),
                                             concat_sub['is_iceberg_min'], 
                                             concat_sub['is_iceberg_median']))
concat_sub[['id', 'is_iceberg']].to_csv('stack_minmax_median.csv', 
                                        index=False, float_format='%.6f')

**LB 0.1488** - **Great!** This is an improvement to our top-line model performance (LB 0.1538). But can we do better?

# MinMax + BestBase Stacking

In [14]:
# load the model with best base performance
sub_base = pd.read_csv('submissions/submission_shiftzoom.csv')

In [15]:
concat_sub['is_iceberg_base'] = sub_base['is_iceberg']
concat_sub['is_iceberg'] = np.where(np.all(concat_sub.iloc[:,1:6] > cutoff_lo, axis=1), 
                                    concat_sub['is_iceberg_max'], 
                                    np.where(np.all(concat_sub.iloc[:,1:6] < cutoff_hi, axis=1),
                                             concat_sub['is_iceberg_min'], 
                                             concat_sub['is_iceberg_base']))
concat_sub[['id', 'is_iceberg']].to_csv('stack_minmax_bestbase.csv', 
                                        index=False, float_format='%.6f')

**LB 0.1463** - **Yes!** This is a decent score given none of the models in our ensemble pipeline has achieved thus better. I am sure there are more twisted ways to boost the score further, so will keep updating or just leave to more Kagglers to discover!


### P.S. As I wrote along this work, deeply I think, building strong & roboust model is always the key component, stacking only comes last with the promise to surprise, sometimes, in an unpleasant direction@ 




# Weighted mean Stacking

In [17]:
print all_files

['submission_nonorm.csv', 'submission_shiftzoom.csv', 'submission_newseed.csv', 'submission_original.csv', 'submission_noaugment.csv', 'submission_densenoaugment.csv']


In [20]:
concat_sub['is_iceberg'] = (concat_sub['is_iceberg_0']*(1.-0.1602) + concat_sub['is_iceberg_1']*(1.-0.1568) \
                                        + concat_sub['is_iceberg_2']*(1.-0.1601) + concat_sub['is_iceberg_3']*(1.-0.1692) \
                                        + concat_sub['is_iceberg_4']*(1.-0.1932) + concat_sub['is_iceberg_5']*(1.-0.1972)) \
                                        / (6-0.1602-0.1568-0.1601-0.1692-0.1932-0.1972)

In [21]:
concat_sub[['id', 'is_iceberg']].to_csv('stack_weighted_mean.csv', 
                                        index=False, float_format='%.6f')