This file is just a temporary placeholder for my attempts to do actual stacking using the set of public kernels made popular by Victor Paslay in his LB 0.287 rank averages kernel.  The idea is, we should be able to make some judgment about the appropriate way to combine these models without having to use up a submission every time we try a new idea.  Maybe a plain rank average is the best we can do.  Maybe my regularized log-odds average is the right approach.  Maybe it's better to use a real fitted stacker instead of just taking averages.  Hopefully we can get a better idea of the answer by using out-of-fold or pseudo-out-of-fold training set predictions than by making random stabs at the public leaderboard.  Anyhow, I've created a set of data for this: it's still an early version and hasn't been fully calibrated to reflect the guesstimated relative degree of overfitting in the different public kernels, but it's a start.  This particular kernel was forked from the one I used to generate the OOF data for one of those public kernels (which was itself a stacker), and much of the code remains the same, so it's doing a logistic fit to probabilities, which in my view is not the best way to do things.  But, again, it's a start.

In [None]:
import numpy as np 
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from numba import jit
from sklearn.linear_model import LogisticRegression

# List the entries under the input directory. The result is not stored, so
# this only serves as a visual check when run as the last line of a cell.
os.listdir('../input')

In [None]:
# Directory holding the per-model prediction files, and the files to combine.
# Per the file names and the introduction, these are out-of-fold (or
# pseudo-out-of-fold) training-set predictions from several public kernels.
indir = '../input/kaggleportosegurocnoof/'
infiles = [
    'forza_pascal_oof.csv',
    'gp_pseu_val.csv',
    'rgf_valid.csv',
    'stacker_oof_preds_1.csv',
    'xgb_valid.csv',
]

# Join every file on `id` into one wide frame. The first file is taken as-is;
# each later file is merged in (pandas' default inner join), and any column
# name it shares with the accumulated frame gets the file's position appended
# (e.g. `target1`, `target2`, ...) while the accumulated side keeps its name.
for i, f in enumerate(infiles):
    indf = pd.read_csv(indir + f)
    if i == 0:
        oof = indf
    else:
        oof = pd.merge(oof, indf, on='id', suffixes=('', str(i)))

# Give the first file's column a numbered name too, so it matches the others.
# NOTE(review): assumes each file's prediction column is called `target` --
# confirm against the CSV headers.
oof.rename(columns={'target': 'target0'}, inplace=True)
oof.head()

In [None]:
# Competition training data; later cells read its `id` and `target` columns.
train_path = '../input/porto-seguro-safe-driver-prediction/train.csv'
train = pd.read_csv(train_path)

In [None]:
# Attach the true label to every row of model predictions. This is an inner
# join on `id`, so only ids present in both frames are kept.
labels = train[['id', 'target']]
df = labels.merge(oof, on='id')
df.head(10)

In [None]:
# Stacker inputs: everything except the row identifier and the true label,
# i.e. the per-model prediction columns.
X = df.drop(columns=['id', 'target'])
X.head()

In [None]:
# True labels the stacker is fit against, row-aligned with X (both come from
# df).
y = df['target']
y.head(10)

In [None]:
# Row identifiers, aligned with X and y.
# NOTE(review): nothing in the visible code uses `ids` afterwards (the export
# cell reads df['id'] directly) -- confirm whether it is still needed.
ids = df['id']
ids.head()

In [None]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    """Return a Gini-style ranking score of ``y_prob`` against ``y_true``.

    The labels are reordered by ascending score and then scanned from the
    highest score down.  For every label the running count of "negative"
    mass already seen (``1 - label`` of the higher-scored rows) is added,
    weighted by the label itself.  The result is
    ``1 - 2 * accumulated / (positives * (n - positives))``, which is 1.0
    when every positive outscores every negative and -1.0 for the reverse.

    Ties in ``y_prob`` are resolved by whatever order ``np.argsort`` yields.
    A division by zero occurs when the labels sum to 0 or to ``n``.

    Assumes ``y_true`` holds 0/1 labels -- TODO confirm with callers.
    """
    labels = np.asarray(y_true)
    order = np.argsort(y_prob)
    labels = labels[order]
    n = len(labels)

    positives = 0        # sum of labels seen so far
    negatives_above = 0  # sum of (1 - label) over higher-scored rows
    mis_ranked = 0       # label-weighted count of higher-scored negatives

    # Explicit index loop, highest score first.
    idx = n - 1
    while idx >= 0:
        label = labels[idx]
        positives += label
        mis_ranked += label * negatives_above
        negatives_above += 1 - label
        idx -= 1

    return 1 - 2 * mis_ranked / (positives * (n - positives))

In [None]:
# Set up folds
# K shuffled cross-validation folds with a fixed seed, so the split is
# reproducible from run to run.
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)

# Seed NumPy's global random generator as well.
# NOTE(review): nothing in the visible code draws from it directly.
np.random.seed(0)

# Buffer for the out-of-fold predictions, sharing df's index and the name
# `target`.  It is multiplied by 0.0 (not 0) so the buffer is float from the
# start: if `target` is an integer column, `0*df['target']` would be an
# integer Series, and writing float probabilities into it through `.iloc`
# relies on a silent upcast that recent pandas versions deprecate.
y_valid_pred = 0.0 * df['target']

In [None]:
# Second-level model: a logistic regression with scikit-learn's default
# settings. The same instance is refit on each fold by the loop that follows.
stacker = LogisticRegression()

In [None]:
# Cross-validated stacking: for each fold, fit the stacker on the other folds'
# rows and predict the held-out rows, so every row ends up with a prediction
# from a model that never saw it.
for i, (train_index, test_index) in enumerate(kf.split(df)):
    print( "\nFold ", i)

    # Rows used for fitting in this fold
    X_train = X.iloc[train_index, :].copy()
    y_train = y.iloc[train_index].copy()

    # Rows held out for scoring in this fold
    X_valid = X.iloc[test_index, :].copy()
    y_valid = y.iloc[test_index].copy()

    # Refit from scratch, then keep the second predict_proba column (the
    # probability of the higher-sorted class, i.e. 1 for 0/1 labels)
    stacker.fit(X_train, y_train)
    pred = stacker.predict_proba(X_valid)[:, 1]
    print( "  Gini = ", eval_gini(y_valid, pred) )

    # Store the held-out predictions at their original row positions
    y_valid_pred.iloc[test_index] = pred

In [None]:
# Score the pooled out-of-fold predictions from all folds against the true
# labels. The value is not stored; it shows only as the cell's result.
print( "\nGini for full training set:" )
eval_gini(df['target'], y_valid_pred)

In [None]:
# Two-column frame (`id`, `target`) holding the stacker's out-of-fold
# prediction for each training row, written to CSV with six decimals and no
# index column.
val = pd.DataFrame({
    'id': df['id'].values,
    'target': y_valid_pred.values,
})
val.to_csv('next_level_oof_preds.csv', float_format='%.6f', index=False)

In [None]:
# Preview the first rows of the exported predictions.
val.head()