In [259]:
# Load in our libraries
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold


def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

def calcginiindex(array):
    array = array.flatten()
    array += 0.0000001
    array = np.sort(array)
    index = np.arange(1,array.shape[0]+1)
    n = array.shape[0]
    return ((np.sum((2*index - n - 1)*array))/(n * np.sum(array)))

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

# Data

In [189]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

features = train.drop(['target', 'id'], axis = 1)
targets = train.target.values

unwanted = train.columns[train.columns.str.startswith('ps_calc_')]

train = train.drop(unwanted, axis = 1)
test = test.drop(unwanted, axis = 1)

# Ensembling and stacking models
### SklearnHelper For RF, ET, AD, GB

In [200]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 42 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict_proba(self, x):
        return self.clf.predict_proba(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
# Class to extend XGboost classifer

### Get predictions/new features from SklearnHelper

In [304]:
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=42)

def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((kfold, ntest))

    
    for i, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        y_te = y_train[test_index]
        
        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict_proba(x_te)[:, 1]
        train_pred = clf.predict_proba(x_tr)[:, 1]
        oof_test_skf[i, :] = clf.predict_proba(x_test)[:, 1]
        print("Fold :", i,"Train Gini:", gini_normalized(train_pred, y_tr) ,"Valid Gini:", gini_normalized(oof_train[test_index], y_te))
        

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

### Get predictions/new features from LightGBM model

In [238]:
def get_oof_lgb(x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((kfold, ntest))


    for i, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
        X_train, X_valid = X[train_index], X[test_index]
        y_train, y_valid = y[train_index], y[test_index]

        # create dataset for lightgbm
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        

        clf = lgb.train(lgb_params,
                    lgb_train,
                    num_boost_round=4000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=130,
                    feval=gini_lgb, verbose_eval=500,
                    learning_rates = lambda iter: 0.025 * (0.999 ** iter)
                    )

        oof_train[test_index] = (clf.predict(X_valid) > 0.5)
        oof_test_skf[i, :] = (clf.predict(x_test) > 0.5)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

### Get predictions/new features from XGBoost model

In [239]:
def get_oof_xgb(x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((kfold, ntest))


    for i, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
        X_train, X_valid = X[train_index], X[test_index]
        y_train, y_valid = y[train_index], y[test_index]
        # Convert our data into XGBoost format
        d_train = xgb.DMatrix(X_train, y_train)
        d_valid = xgb.DMatrix(X_valid, y_valid)
        d_test = xgb.DMatrix(x_test.values)
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        # Train the model! We pass in a max of 2,000 rounds (with early stopping after 100)
        # and the custom metric (maximize=True tells xgb that higher metric is better)
        clf = xgb.train(xgb_params, d_train, 4000, watchlist, early_stopping_rounds=130, feval=gini_xgb, maximize=True, verbose_eval=500,
                       learning_rates = lambda iter, ed: 0.025 * (0.998 ** iter))

        oof_train[test_index] = (clf.predict(d_valid) > 0.5)
        oof_test_skf[i, :] = (clf.predict(d_test) > 0.5)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Data prep

In [218]:
X = train.drop(['id', 'target'], axis = 1).values
y = train.target.values

test_id = test.id.values
test = test.drop('id', axis = 1)

AttributeError: 'DataFrame' object has no attribute 'id'

# Random Forest

In [256]:
best_gini = 0
rf_params = {
    'n_jobs': 6,
    'n_estimators': 40,
    'warm_start': True, 
    'max_features': 37,
    'max_depth': 15
}
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)

print('RANDOM FOREST')
rf_oof_train, rf_oof_test = get_oof(rf,X, y, test) # Random Forest
curr_gini = calcginiindex(rf_oof_test)
if curr_gini > best_gini:
    print('New best gini:', curr_gini)
    best_gini = curr_gini
    best_params = pd.DataFrame(rf_params, index=[1])
else:
    print('Curr gini:', curr_gini)

RANDOM FOREST
('Fold :', 0, 'Train Gini:', 0.31587930721959806, 'Valid Gini:', 0.033468833637298273)
('Fold :', 1, 'Train Gini:', 0.30676553131541462, 'Valid Gini:', 0.31803738981901836)
('Fold :', 2, 'Train Gini:', 0.30675264653999368, 'Valid Gini:', 0.31752042090380284)
('Fold :', 3, 'Train Gini:', 0.30746164616239946, 'Valid Gini:', 0.31649170078492367)
('Fold :', 4, 'Train Gini:', 0.30875467048488037, 'Valid Gini:', 0.31592191628930205)
('New best gini:', 0.31200791921807464)


# DecisionTree

In [260]:
best_gini = 0
dt_params = {
    'max_features': 37,
    'max_depth': 20,
    'max_leaf_nodes':130
}
dt = SklearnHelper(clf=DecisionTreeClassifier, seed=SEED, params=dt_params)

print('Decision Tree')
dt_oof_train, dt_oof_test = get_oof(dt,X, y, test) # Random Forest
curr_gini = calcginiindex(dt_oof_test)
if curr_gini > best_gini:
    print('New best gini:', curr_gini)
    best_gini = curr_gini
    best_params = pd.DataFrame(dt_params, index=[1])
else:
    print('Curr gini:', curr_gini)

Decision Tree
('Fold :', 0, 'Train Gini:', 0.064682245742869432, 'Valid Gini:', 0.036497715991040923)
('Fold :', 1, 'Train Gini:', 0.063224794941622642, 'Valid Gini:', 0.032148094070969684)
('Fold :', 2, 'Train Gini:', 0.057848795808981911, 'Valid Gini:', 0.036702423667169942)
('Fold :', 3, 'Train Gini:', 0.061834802632231824, 'Valid Gini:', 0.036345915731020688)
('Fold :', 4, 'Train Gini:', 0.061748761597841699, 'Valid Gini:', 0.039186190963190148)
('New best gini:', 0.21572119122892536)


# Extra Trees

In [261]:
# Extra Trees Parameters
et_params = {
    'n_jobs': 6,
    'n_estimators': 40,
    'warm_start': True, 
    'max_features': 37,
    'max_depth': 17  
}
print('EXTRA TREES')
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)

et_oof_train, et_oof_test = get_oof(et, X, y, test) # Extra Trees

curr_gini = calcginiindex(et_oof_test)
if curr_gini > best_gini:
    print('New best gini:', curr_gini)
    best_gini = curr_gini
    best_params = pd.DataFrame(et_params, index=[1])
else:
    print('Curr gini:', curr_gini)

EXTRA TREES
('Fold :', 0, 'Train Gini:', 0.5276635028812704, 'Valid Gini:', 0.03638232810262243)
('Fold :', 1, 'Train Gini:', 0.51606517905262295, 'Valid Gini:', 0.52713584119592249)
('Fold :', 2, 'Train Gini:', 0.51482675750572648, 'Valid Gini:', 0.53187893658813168)
('Fold :', 3, 'Train Gini:', 0.51677015176940266, 'Valid Gini:', 0.52576297579359732)
('Fold :', 4, 'Train Gini:', 0.5161111107254438, 'Valid Gini:', 0.52677931774240461)
('New best gini:', 0.34544273823048977)


# Ada Boost

In [262]:
# AdaBoost parameters
ada_params = {
    'n_estimators': 300,
    'learning_rate' : 0.025,   
}

print('ADABOOST')
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)

ada_oof_train, ada_oof_test = get_oof(ada, X, y, test) # AdaBoost 

curr_gini = calcginiindex(ada_oof_test)
if curr_gini > best_gini:
    print('New best gini:', curr_gini)
    best_gini = curr_gini
    best_params = pd.DataFrame(ada_params, index=[1])
else:
    print('Curr gini:', curr_gini)

ADABOOST
('Fold :', 0, 'Train Gini:', 0.02937513804857288, 'Valid Gini:', 0.029498637856646977)
('Fold :', 1, 'Train Gini:', 0.027900992771846579, 'Valid Gini:', 0.032083904283688854)
('Fold :', 2, 'Train Gini:', 0.028262741880190803, 'Valid Gini:', 0.027240561320036463)
('Fold :', 3, 'Train Gini:', 0.029047119671950509, 'Valid Gini:', 0.02360796730539369)
('Fold :', 4, 'Train Gini:', 0.029163440434755383, 'Valid Gini:', 0.029680080751846933)
('Curr gini:', 0.01285387891056696)


# Gradient Boosting

In [None]:
# Gradient Boosting parameters
gb_params = {
    'n_estimators': 200,
    'max_features': 37,
    'max_depth': 17,
    'subsample': 0.8,
}

gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)

print('GRADIENT BOOSTING')
gb_oof_train, gb_oof_test = get_oof(gb,X, y, test) # Gradient Boost
curr_gini = calcginiindex(gb_oof_test)
if curr_gini > best_gini:
    print('New best gini:', curr_gini)
    best_gini = curr_gini
    best_params = pd.DataFrame(gb_params, index=[1])
else:
    print('Curr gini:', curr_gini)

# LightGBM

In [264]:
lgb_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'min_sum_hessian_in_leaf': 10,
    'min_gain_to_split': 0.65,
    'poisson_max_delta_step': 1.8,
    'num_leaves': 31,
    #'learning_rate': 0.025,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_depth': 7,
    'verbose': 2
}

lgb_oof_train, lgb_oof_test = get_oof_lgb(X, y, test) # LightGBM

Training until validation scores don't improve for 130 rounds.
[500]	valid_0's gini: 0.275826
[1000]	valid_0's gini: 0.280338
Early stopping, best iteration is:
[1359]	valid_0's gini: 0.280923
Training until validation scores don't improve for 130 rounds.
[500]	valid_0's gini: 0.279104
[1000]	valid_0's gini: 0.282039
Early stopping, best iteration is:
[1058]	valid_0's gini: 0.282115
Training until validation scores don't improve for 130 rounds.
[500]	valid_0's gini: 0.276888
[1000]	valid_0's gini: 0.279012
Early stopping, best iteration is:
[1189]	valid_0's gini: 0.279377
Training until validation scores don't improve for 130 rounds.
[500]	valid_0's gini: 0.283432
[1000]	valid_0's gini: 0.286575
Early stopping, best iteration is:
[911]	valid_0's gini: 0.286634
Training until validation scores don't improve for 130 rounds.
[500]	valid_0's gini: 0.269399
[1000]	valid_0's gini: 0.273026
Early stopping, best iteration is:
[1137]	valid_0's gini: 0.273242


# XGBoost

In [265]:
xgb_params = {
    'min_child_weight': 10.0,
    'objective': 'binary:logistic',
    'max_depth': 7,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.025,
    'gamma': 0.65,
    'num_boost_round' : 700
}

xgb_oof_train, xgb_oof_test = get_oof_xgb(X, y, test) # XGBoost

[0]	train-gini:0.023568	valid-gini:0.030105
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 130 rounds.
[500]	train-gini:0.40022	valid-gini:0.278484
[1000]	train-gini:0.431121	valid-gini:0.27982
Stopping. Best iteration:
[1033]	train-gini:0.432084	valid-gini:0.279847

[0]	train-gini:0.023179	valid-gini:0.036347
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 130 rounds.
[500]	train-gini:0.397777	valid-gini:0.285831
[1000]	train-gini:0.429074	valid-gini:0.287582
Stopping. Best iteration:
[1322]	train-gini:0.436728	valid-gini:0.288112

[0]	train-gini:0.023073	valid-gini:0.02586
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 130 rounds.
[500]	train-gini:0.400694	valid-gini:0.282228
Stopping. Best iteration:
[690]	train-gini:0.417463	v

In [266]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'DecisionTree': dt_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
  #   'AdaBoost': ada_oof_train.ravel(),
   #  'GradientBoost': gb_oof_train.ravel(),
     'LightGBM': lgb_oof_train.ravel(),
     'XGBoost': xgb_oof_train.ravel()
    })
base_predictions_train.head()
base_predictions_train.to_csv('base_predictions')

In [270]:
base_predictions_test = pd.DataFrame( {'RandomForest': rf_oof_test.ravel(),
     'DecisionTree': dt_oof_test.ravel(),
     'ExtraTrees': et_oof_test.ravel(),
  #   'AdaBoost': ada_oof_train.ravel(),
   #  'GradientBoost': gb_oof_train.ravel(),
     'LightGBM': lgb_oof_test.ravel(),
     'XGBoost': xgb_oof_test.ravel()
    })
base_predictions_test.head()
base_predictions_test.to_csv('base_predictions_test')

In [317]:
#x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train,lgb_oof_train,xgb_oof_train), axis=1)
#x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, lgb_oof_test,xgb_oof_test), axis=1)
x_train = np.concatenate(( et_oof_train,lgb_oof_train,xgb_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, lgb_oof_test,xgb_oof_test), axis=1)

In [297]:
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=42)

params = {
   # 'min_child_weight': 5.0,
    'objective': 'binary:logistic',
    'max_depth': 5,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.5,
    'subsample': 0.5,
    'eta': 0.0005,
#    'gamma': 100.85,
    'num_boost_round' : 700
}

sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = np.zeros_like(test_id)

sum_valid_gini = 0

for i, (train_index, test_index) in enumerate(skf.split(x_train, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = x_train[train_index], x_train[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    d_test = xgb.DMatrix(x_test)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    mdl = xgb.train(params, d_train, 4000, watchlist, early_stopping_rounds=130, 
                    feval=gini_xgb, maximize=True, verbose_eval=10,
                    learning_rates = lambda iter, ed: 0.0005 * (0.997 ** iter)
                   )

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on our test data
    p_test = mdl.predict(d_test)
    p_valid = mdl.predict(d_valid)
    sub['target'] += p_test/kfold
    sum_valid_gini += gini_normalized(y_valid, p_valid)

[Fold 1/5]
[0]	train-gini:0.070798	valid-gini:0.066703
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 130 rounds.
[10]	train-gini:0.780813	valid-gini:0.148646
[20]	train-gini:0.772707	valid-gini:0.173647
[30]	train-gini:0.770504	valid-gini:0.17463
[40]	train-gini:0.77126	valid-gini:0.175379
[50]	train-gini:0.773022	valid-gini:0.176712
[60]	train-gini:0.777408	valid-gini:0.189046
[70]	train-gini:0.775344	valid-gini:0.188741
[80]	train-gini:0.775689	valid-gini:0.188221
[90]	train-gini:0.781383	valid-gini:0.188756
[100]	train-gini:0.784398	valid-gini:0.207066
[110]	train-gini:0.78708	valid-gini:0.207107
[120]	train-gini:0.788555	valid-gini:0.220791
[130]	train-gini:0.794844	valid-gini:0.222259
[140]	train-gini:0.798594	valid-gini:0.22249
[150]	train-gini:0.802365	valid-gini:0.222664
[160]	train-gini:0.806315	valid-gini:0.222613
[170]	train-gini:0.812845	valid-gini:0.225163
[180]	train-gini:0.817426	vali

KeyboardInterrupt: 

In [None]:
print('EstimatedGini: ', calcginiindex(sub['target'].values))
#sum_valid_gini = 0.279847 + 0.288112 + 0.282968 + 0.291635 + 0.27727
print('Valid gini:', sum_valid_gini/kfold)

In [None]:
sub.to_csv('ensemble'+ str(sum_valid_gini/kfold) +'.csv', index = False)

In [319]:
best_gini = 0
rf_params = {
    'n_jobs': 6,
    'n_estimators': 40,
    'warm_start': True, 
    'max_depth': 3
}
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)

print('RANDOM FOREST')
rf_train, rf_test = get_oof(rf,x_train, y, x_test) # Random Forest

curr_gini = calcginiindex(rf_test)
if curr_gini > best_gini:
    print('New best gini:', curr_gini)
    best_gini = curr_gini
    best_params = pd.DataFrame(rf_params, index=[1])
else:
    print('Curr gini:', curr_gini)

RANDOM FOREST
('Fold :', 0, 'Train Gini:', 0.87716051078130197, 'Valid Gini:', 0.042669514555137913)
('Fold :', 1, 'Train Gini:', 0.80026221117762508, 'Valid Gini:', 0.88101619215005911)
('Fold :', 2, 'Train Gini:', 0.79998561362061993, 'Valid Gini:', 0.88215553019869741)
('Fold :', 3, 'Train Gini:', 0.80218687544262712, 'Valid Gini:', 0.87396872870139297)
('Fold :', 4, 'Train Gini:', 0.8010473675368841, 'Valid Gini:', 0.87868734811001514)
('New best gini:', 0.7589042875002795)


In [324]:
sub = pd.DataFrame({
    'id': test_id,
    'target': (et_oof_test.ravel() + lgb_oof_test.ravel() + xgb_oof_test.ravel())/3
})
sub.to_csv('ensemble_medium_v1.csv', index = False)

In [325]:
calcginiindex(sub['target'].values)

0.34668668191145868