This is a simpler modelling stack: models that were not useful have been pulled from the stack.  Strangely, the regularization on the top level of the stack has to be a light L1 penalty.  It scores 0.28966 on the local test and just over 0.285 on the LB.  Position: 387 (386 was the bottom of bronze).

In [1]:
import numpy as np, pandas as pd,matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import StratifiedKFold
import copy



In [3]:
def rank(array):
    """Return the normalised rank of every element of a 1-D numpy array.

    The smallest element maps to 0.0 and the largest to (n - 1) / n, where
    n is ``len(array)``.  Tied values are ordered however
    ``array.argsort()`` happens to order them.
    """
    n = len(array)
    order = array.argsort()
    # order[k] is the position of the k-th smallest value, so scattering
    # 0..n-1 through it gives each original position its rank.
    positions = np.empty(n, float)
    positions[order] = np.arange(n)
    return positions / float(n)

In [4]:
class Layer(object):
    """One stacking layer: out-of-fold scores from several base models.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds used to produce out-of-fold predictions.
    base_models : iterable of (pipeline, classifier) pairs
        ``pipeline(df_a, df_b)`` must return ``(X_a, y_a, X_b, y_b)``;
        ``classifier`` must provide ``fit(X, y)`` and ``predict_proba(X)``.
    """

    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.pipelines = []
        self.base_models = []
        for pipeline, model in base_models:
            self.pipelines.append(pipeline)
            self.base_models.append(model)

    @staticmethod
    def _log_odds(probs, eps=1e-15):
        """Return log(p / (1 - p)) with p clipped to [eps, 1 - eps].

        Clipping keeps a prediction of exactly 0 or 1 from producing
        -inf/+inf, which would turn into NaN once folds are averaged.  The
        input is promoted to float64 so the upper bound does not round to 1.
        """
        p = np.clip(np.asarray(probs, dtype=np.float64), eps, 1.0 - eps)
        return np.log(p) - np.log(1.0 - p)

    def fit_predict(self, df_train, y, df_test):
        """Fit every base model fold by fold and return stacked features.

        Returns a pair of DataFrames ``(train_scores, test_scores)`` with one
        column ``x_<i>`` per base model.  Training scores are out-of-fold
        probabilities; test scores are the per-fold probabilities averaged in
        log-odds space and mapped back through the logistic function.
        """
        y = np.array(y)

        # Legacy sklearn.cross_validation API: the object is iterable and can
        # be re-iterated once per base model.
        folds = StratifiedKFold(y, n_folds=self.n_splits, shuffle=True, random_state=2016)

        n_models = len(self.base_models)
        S_train = np.zeros((len(df_train.index), n_models))
        S_test = np.zeros((len(df_test.index), n_models))
        for i, clf in enumerate(self.base_models):
            PIPELINE = self.pipelines[i]
            S_test_i = np.zeros((len(df_test.index), self.n_splits))
            for j, (train_idx, val_idx) in enumerate(folds):
                # Fold indices are positions, so select with .iloc; .loc only
                # worked when the frame carried a default 0..n-1 index.
                # Each pipeline call gets its own copies so it may mutate them.
                X_train, y_train, X_holdout, y_holdout = PIPELINE(
                    df_train.iloc[train_idx, :].copy(),
                    df_train.iloc[val_idx, :].copy()
                )
                _, _, X_test, y_test = PIPELINE(
                    df_train.iloc[train_idx, :].copy(),
                    df_test.copy()
                )
                #
                print ("Fit %s --> %s fold %d" % (str(PIPELINE).split()[1],str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)

                # Out-of-fold probabilities become this model's training column.
                S_train[val_idx, i] = clf.predict_proba(X_holdout)[:, 1]

                # Test-set scores are stored as log-odds for averaging.
                S_test_i[:, j] = self._log_odds(clf.predict_proba(X_test)[:, 1])
            agg_lor = S_test_i.mean(axis=1)
            S_test[:, i] = 1.0 / (1.0 + np.exp(-agg_lor))

        columns = ['x_' + str(k) for k in range(n_models)]
        return pd.DataFrame(S_train, columns=columns), pd.DataFrame(S_test, columns=columns)

In [5]:
# Add output of layer results for quick iteration
class Stack(object):
    """A stack of hidden ``Layer`` objects feeding one top-level classifier.

    Parameters
    ----------
    k_folds : int
        Fold count handed to every hidden ``Layer``.
    hidden_layers : list of lists of (pipeline, classifier) pairs
        One inner list per hidden layer.
    top_layer : classifier
        Must provide ``fit(X, y)`` and ``predict_proba(X)``.
    saveInternalVectors : bool
        When true, each layer's train/test outputs are written to
        ``STACK_internal_{train,test}_layer_<i>.bin`` with ``np.save``.
    """

    def __init__(self,k_folds,hidden_layers,top_layer,saveInternalVectors=False):
        self.saveInternalVectors = saveInternalVectors
        self.layers = [Layer(k_folds, h) for h in hidden_layers]
        self.top_layer = top_layer

    @staticmethod
    def _dump(fname, frame):
        """Write ``frame`` to ``fname`` with np.save, closing the file afterwards."""
        with open(fname, 'wb') as fh:
            np.save(fh, frame)

    def fit_predict(self,df_train,y,df_test,external_base_scores= None):
        """Run all hidden layers, fit the top layer, and score the test set.

        ``external_base_scores``, when given, is a ``(train_scores,
        test_scores)`` pair appended as column ``ext`` to the output of the
        first hidden layer.  Returns column 1 of the top layer's
        ``predict_proba`` on the test features.
        """
        Xt_train = df_train
        Xt_test = df_test
        for i, layer in enumerate(self.layers, 1):
            print('Fitting stack layer '+str(i))
            Xt_train, Xt_test = layer.fit_predict(Xt_train, y, Xt_test)
            # The target column is re-attached after every layer and removed
            # again before the top layer is fitted.
            Xt_train.loc[:, 'target'] = y
            if external_base_scores is not None and i == 1:
                Xt_train.loc[:, 'ext'] = external_base_scores[0]
                Xt_test.loc[:, 'ext'] = external_base_scores[1]
            if self.saveInternalVectors:
                self._dump('STACK_internal_train_layer_'+str(i)+'.bin', Xt_train)
                self._dump('STACK_internal_test_layer_'+str(i)+'.bin', Xt_test)

        # Drop on a new frame: with no hidden layers Xt_train IS the caller's
        # df_train, and an in-place drop would remove its target column.
        top_features = Xt_train.drop('target', axis=1)
        # .values replaces the deprecated DataFrame.as_matrix().
        self.top_layer.fit(top_features.values, y)
        return self.top_layer.predict_proba(Xt_test.values)[:, 1]

Now specify the stack

## Now score the actual data

In [6]:
# # Read data

# # train
# X_train= np.load(open('../WP014/dummy_train_matrix.bin','rb'))
# y_train= np.load(open('../WP014/dummy_train_labels.bin','rb'))

# # test
# X_test= np.load(open('../WP014/dummy_test_matrix.bin','rb'))
# y_test= np.load(open('../WP014/dummy_test_labels.bin','rb'))

In [7]:
import sklearn.model_selection, pipe0_median as pipe0

# Load the raw train/test tables; paths are relative to the notebook directory.
df_train= pd.read_csv('../data/train.csv')
df_test= pd.read_csv('../data/test.csv')

# Run the pipe0 pipeline on copies of both frames (the originals are reused by
# later cells).  These four names are notebook-level state that later cells
# rebind and read.
X_train, y_train, X_test, y_test= pipe0.run_pipe0( df_train.copy(),df_test.copy() )

# index_train, index_test= sklearn.model_selection.train_test_split( range(len(df_train.index)) , 
#                                                                     test_size=0.3,random_state=1)
# 
# df_test= df_train.loc[index_test,:].reset_index(drop=True)
# y_test= df_test.target.values
# df_train= df_train.loc[index_train,:].reset_index(drop=True)
# y_train= df_train.target.values

In [8]:
# read rgf pipeline train and test scores
# X_rgf_train= pd.read_csv('../wp017/rgf_scores_train.csv').target.values
# X_rgf_test= pd.read_csv('../wp017/rgf_blind_scores.csv').target.values
# Externally produced RGF scores: the `target` column of each CSV as an array.
# They become the 'rgf' column of X_TEST / X_OOB in the ensembling cells.
# NOTE(review): the validation file name contains a space
# ("rgf_validation_ scores.csv") - confirm it matches the file on disk.
X_rgf_test= pd.read_csv('../wp018/output_0.01_300leaf/rgf_blind_scores.csv').target.values
X_rgf_train= pd.read_csv('../wp018/output_0.01_300leaf/rgf_validation_ scores.csv').target.values
# X_rgf_test= X_rgf_train.target.values[index_test]
# X_rgf_train= X_rgf_train.target.values[index_train]

In [9]:
# Only the `id` column of the test set; in the cells shown it is referenced
# solely by the commented-out submission writer at the end of the notebook.
test_ids= pd.read_csv('../data/test.csv',usecols=['id'])

In [10]:
import lightgbm.sklearn
import xgboost.sklearn
import catboost
import sklearn.linear_model, sklearn.ensemble
import sklearn.neural_network

# Keyword arguments for the LightGBM classifier bound to `lgbm1`.
lgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 1300,
    'max_bin': 10,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 500,
    'num_leaves': 25,
    'n_jobs': 8,
}


# lgb_params_3 = {
#     'learning_rate': 0.02,
#     'n_estimators': 800,
#     'max_depth': 4,
#     'n_jobs':8
# }
# Keyword arguments for the LightGBM classifier bound to `lgbm3`.
lgb_params_3 = dict(
    learning_rate=0.02,
    n_estimators=800,  # alternative value left by the author: 150
    max_depth=4,
    n_jobs=8,
    # min_child_samples=100 (disabled)
)

# Keyword arguments for the LightGBM classifier bound to `lgbm4`.
lgb_params_4 = dict(
    learning_rate=0.05,
    n_estimators=600,
    num_leaves=35,
    min_child_samples=500,
    n_jobs=8,
)

# Keyword arguments for the XGBoost classifier.
xgb_params = dict(
    learning_rate=0.07,
    n_estimators=525,
    max_depth=4,
    nthread=8,
    subsample=0.8,
    min_child_weight=6.0,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='auc',
    seed=99,
    silent=True,
    scale_pos_weight=1.6,
    reg_alpha=8,
    reg_lambda=1.3,
    gamma=10,
)

# Keyword arguments for the CatBoost classifier.
cb_params = dict(
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=14,
    iterations=650,
    verbose=False,
    loss_function='Logloss',
)

# lgbmn_params= {'num_leaves': 81, 'verbose': 1, 'learning_rate': 0.005, 
#                'min_data': 650, 'categorical_column': [], 'bagging_fraction': 0.9, 
#                'metric': ['auc'], 'boosting_type': 'gbdt', 'lambda_l1': 30,
#                'bagging_freq': 3, 'lambda_l2': 0, 'is_unbalance': True, 
#                'max_bin': 255, 'objective': ['binary'], 'max_depth': 6, 
#                'feature_fraction': 0.7,'n_estimators':1600
#               }

# Layer 1
# (pipeline, classifier) pairs in the shape Layer.__init__ unpacks: element 0
# is the pipeline callable, element 1 the estimator.  The Stack that would
# consume them is commented out below, and the later per-model cells rebind
# lgbm1/lgbm3/lgbm4/cb to bare classifiers.
lgbm1 = (pipe0.run_pipe0,  lightgbm.sklearn.LGBMClassifier(**lgb_params))
xgbm1= (pipe0.run_pipe0,   xgboost.sklearn.XGBClassifier(**xgb_params))
lgbm3 = (pipe0.run_pipe0,  lightgbm.sklearn.LGBMClassifier(**lgb_params_3))
lgbm4 = (pipe0.run_pipe0,  lightgbm.sklearn.LGBMClassifier(**lgb_params_4))
cb= (pipe0.run_pipe0,      catboost.CatBoostClassifier(**cb_params))
# lgbmn = (pipe3.run_pipe3,  lightgbm.sklearn.LGBMClassifier(**lgbmn_params))

# Top layer
# stacker= sklearn.linear_model.LogisticRegression(C=500.0,class_weight='balanced',penalty='l1')

# Define the stack
# stack = Stack(3,[ [cb,xgbm1,lgbm1,lgbm3,lgbm4] ], stacker, saveInternalVectors=True)   

### LGBM3

In [11]:
from datetime import datetime,timedelta
# Time one full-data fit of lgbm3 (no folds, no bagging).
# Rebinds the notebook-level X_train/y_train/X_test/y_test and `lgbm3`;
# `ypred` is not read by any later cell shown in this notebook.
tc= datetime.now()
X_train, y_train, X_test, y_test= pipe0.run_pipe0( df_train.copy(),df_test.copy() )
lgbm3= lightgbm.sklearn.LGBMClassifier(**lgb_params_3)
lgbm3.fit(X_train,y_train)
ypred= lgbm3.predict_proba(X_test)[:,1]
print 'Training took: ',(datetime.now() - tc).total_seconds()
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,ypred)-1.

Training took:  57.309064


In [20]:
from datetime import datetime,timedelta
import bagging_me
def bag_classifier(base,PIPELINE,df_train,df_test):
    """Bag ``base`` over the pipeline's features; return OOB and test scores.

    Parameters
    ----------
    base : classifier
        Estimator handed to ``bagging_me.BaggingClassifier`` as the base model.
    PIPELINE : callable
        ``PIPELINE(df_train, df_test)`` must return
        ``(X_train, y_train, X_test, y_test)``; it receives copies of both
        frames.
    df_train, df_test : DataFrame
        Raw train and test tables.

    Returns
    -------
    (oob, y_pred)
        ``oob`` is column 1 of the ensemble's ``oob_decision_function_`` (one
        value per training row); ``y_pred`` is column 1 of ``predict_proba``
        on the test features.

    Prints the elapsed wall-clock time in seconds.
    """
    tc = datetime.now()
    # NOTE(review): bagging_me is a project module; it is presumably a variant
    # of sklearn's BaggingClassifier - confirm the meaning of these arguments
    # against its source.
    clf = bagging_me.BaggingClassifier(base,
                                       n_estimators=32,
                                       oob_score=True,
                                       max_features=0.8,
                                       max_samples=0.6,
                                       random_state=1,
                                       n_jobs=1)
    X_train, y_train, X_test, y_test = PIPELINE(df_train.copy(), df_test.copy())
    clf.fit(X_train, y_train, sample_weight=None)
    oob = clf.oob_decision_function_[:, 1]
    y_pred = clf.predict_proba(X_test)[:, 1]
    # Single-string form prints the same text under Python 2 and also parses
    # under Python 3 (matching the print style used in Layer.fit_predict).
    print('Training took:  %s' % (datetime.now() - tc).total_seconds())
    return oob, y_pred

In [13]:
# Bag a fresh lgbm3, report its out-of-bag Gini (2*AUC - 1) and write the OOB
# and test scores to internals_test/ for the ensembling cells.
# `y_train` is the notebook-level array left by an earlier run_pipe0 call;
# bag_classifier does not return it.
# NOTE(review): no visible cell imports sklearn.metrics by name; presumably it
# is loaded as a side effect of the other sklearn imports - confirm.
lgbm3= lightgbm.sklearn.LGBMClassifier(**lgb_params_3)
oob, ptest= bag_classifier(lgbm3,pipe0.run_pipe0,df_train,df_test)
print 'OOB Gini score= ',2.*sklearn.metrics.roc_auc_score(y_train,oob)-1.
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,ptest)-1.
pd.DataFrame({'id':df_train['id'].values, 'target':oob},columns=['id','target']).to_csv('internals_test/lgbm3_oob.csv',
                                                                                        index=False)
pd.DataFrame({'id':df_test['id'].values, 'target':ptest},columns=['id','target']).to_csv('internals_test/lgbm3_test.csv',
                                                                                        index=False)

Training took:  830.969689
OOB Gini score=  0.286489148083


### LGBM 1

In [14]:
# Time one full-data fit of lgbm1 (no folds, no bagging).
# Relies on `datetime` imported by an earlier cell; rebinds the notebook-level
# X_train/y_train/X_test/y_test and `lgbm1`.  `ypred` is not read by any later
# cell shown in this notebook.
tc= datetime.now()
X_train, y_train, X_test, y_test= pipe0.run_pipe0( df_train.copy(),df_test.copy() )
lgbm1= lightgbm.sklearn.LGBMClassifier(**lgb_params)
lgbm1.fit(X_train,y_train)
ypred= lgbm1.predict_proba(X_test)[:,1]
print 'Training took: ',(datetime.now() - tc).total_seconds()
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,ypred)-1.

Training took:  96.548018


In [15]:
# Bag a fresh lgbm1, report its out-of-bag Gini (2*AUC - 1) and write the OOB
# and test scores to internals_test/ for the ensembling cells.
# `y_train` is the notebook-level array left by an earlier run_pipe0 call.
lgbm1= lightgbm.sklearn.LGBMClassifier(**lgb_params)
oob, ptest= bag_classifier(lgbm1,pipe0.run_pipe0,df_train,df_test)
print 'OOB Gini score= ',2.*sklearn.metrics.roc_auc_score(y_train,oob)-1.
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,ptest)-1.
pd.DataFrame({'id':df_train['id'].values, 'target':oob},columns=['id','target']).to_csv('internals_test/lgbm1_oob.csv',
                                                                                        index=False)
pd.DataFrame({'id':df_test['id'].values, 'target':ptest},columns=['id','target']).to_csv('internals_test/lgbm1_test.csv',
                                                                                        index=False)

Training took:  1678.619106
OOB Gini score=  0.285851819675


### LGBM4

In [16]:
# Time one full-data fit of lgbm4 (no folds, no bagging).
# Relies on `datetime` imported by an earlier cell; rebinds the notebook-level
# X_train/y_train/X_test/y_test and `lgbm4`.  `ypred` is not read by any later
# cell shown in this notebook.
tc= datetime.now()
X_train, y_train, X_test, y_test= pipe0.run_pipe0( df_train.copy(),df_test.copy() )
lgbm4= lightgbm.sklearn.LGBMClassifier(**lgb_params_4)
lgbm4.fit(X_train,y_train)
ypred= lgbm4.predict_proba(X_test)[:,1]
print 'Training took: ',(datetime.now() - tc).total_seconds()
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,ypred)-1.

Training took:  53.311398


In [17]:
# Bag a fresh lgbm4, report its out-of-bag Gini (2*AUC - 1) and write the OOB
# and test scores to internals_test/ for the ensembling cells.
# `y_train` is the notebook-level array left by an earlier run_pipe0 call.
lgbm4= lightgbm.sklearn.LGBMClassifier(**lgb_params_4)
oob, ptest= bag_classifier(lgbm4,pipe0.run_pipe0,df_train,df_test)
print 'OOB Gini score= ',2.*sklearn.metrics.roc_auc_score(y_train,oob)-1.
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,ptest)-1.
pd.DataFrame({'id':df_train['id'].values, 'target':oob},columns=['id','target']).to_csv('internals_test/lgbm4_oob.csv',
                                                                                        index=False)
pd.DataFrame({'id':df_test['id'].values, 'target':ptest},columns=['id','target']).to_csv('internals_test/lgbm4_test.csv',
                                                                                        index=False)

Training took:  790.394124
OOB Gini score=  0.285332756327


### XGBOOST

In [18]:
# Time one full-data fit of the XGBoost model (no folds, no bagging).
# Relies on `datetime` imported by an earlier cell; rebinds the notebook-level
# X_train/y_train/X_test/y_test and `xgb`.  `ypred` is not read by any later
# cell shown in this notebook.
tc= datetime.now()
X_train, y_train, X_test, y_test= pipe0.run_pipe0( df_train.copy(),df_test.copy() )
xgb= xgboost.sklearn.XGBClassifier(**xgb_params)
xgb.fit(X_train,y_train)
ypred= xgb.predict_proba(X_test)[:,1]
print 'Training took: ',(datetime.now() - tc).total_seconds()
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,ypred)-1.

Training took:  153.069223


In [19]:
# Bag a fresh XGBoost model, report its out-of-bag Gini (2*AUC - 1) and write
# the OOB and test scores to internals_test/ for the ensembling cells.
# `y_train` is the notebook-level array left by an earlier run_pipe0 call.
xgb= xgboost.sklearn.XGBClassifier(**xgb_params)
oob, ptest= bag_classifier(xgb,pipe0.run_pipe0,df_train,df_test)
print 'OOB Gini score= ',2.*sklearn.metrics.roc_auc_score(y_train,oob)-1.
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,ptest)-1.
pd.DataFrame({'id':df_train['id'].values, 'target':oob},columns=['id','target']).to_csv('internals_test/xgb_oob.csv',
                                                                                        index=False)
pd.DataFrame({'id':df_test['id'].values, 'target':ptest},columns=['id','target']).to_csv('internals_test/xgb_test.csv',
                                                                                        index=False)

Training took:  2536.753526
OOB Gini score=  0.288650029066


### CATBOOST

In [20]:
# Time one full-data fit of the CatBoost model (no folds, no bagging).
# Relies on `datetime` imported by an earlier cell; rebinds the notebook-level
# X_train/y_train/X_test/y_test and `cb`.  `ypred` is not read by any later
# cell shown in this notebook.
tc= datetime.now()
X_train, y_train, X_test, y_test= pipe0.run_pipe0( df_train.copy(),df_test.copy() )
cb= catboost.CatBoostClassifier(**cb_params)
cb.fit(X_train,y_train)
ypred= cb.predict_proba(X_test)[:,1]
print 'Training took: ',(datetime.now() - tc).total_seconds()
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,ypred)-1.

Training took:  349.613966


In [21]:
# Bag a fresh CatBoost model, report its out-of-bag Gini (2*AUC - 1) and write
# the OOB and test scores to internals_test/ for the ensembling cells.
# `y_train` is the notebook-level array left by an earlier run_pipe0 call.
cb= catboost.CatBoostClassifier(**cb_params)
oob, ptest= bag_classifier(cb,pipe0.run_pipe0,df_train,df_test)
print 'OOB Gini score= ',2.*sklearn.metrics.roc_auc_score(y_train,oob)-1.
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,ptest)-1.
pd.DataFrame({'id':df_train['id'].values, 'target':oob},columns=['id','target']).to_csv('internals_test/cb_oob.csv',
                                                                                        index=False)
pd.DataFrame({'id':df_test['id'].values, 'target':ptest},columns=['id','target']).to_csv('internals_test/cb_test.csv',
                                                                                        index=False)

Training took:  5751.556413
OOB Gini score=  0.286864230361


### Now grab the data and ensemble it

In [20]:
# Assemble the stacker's training matrix: one column of out-of-bag scores per
# base model, plus the external RGF and "harless" scores.
# NOTE: this rebinds `cb` and `xgb` (classifiers in earlier cells) to score
# tables.
cb= pd.read_csv('internals_test/cb_oob.csv')
xgb= pd.read_csv('internals_test/xgb_oob.csv')
lgb1= pd.read_csv('internals_test/lgbm1_oob.csv')
lgb3= pd.read_csv('internals_test/lgbm3_oob.csv')
lgb4= pd.read_csv('internals_test/lgbm4_oob.csv')
harl= pd.read_csv('../wp023/internals_test/harless_oob.csv')
# froza= pd.read_csv('../wp023/internals_test/froza_oof.csv')
# keras= pd.read_csv('internals/keras_oob.csv')
# lrbase= pd.read_csv('internals/LRbase_oob.csv')

# The column order is whatever pandas derives from the dict keys (this depends
# on the pandas/Python version); the test matrix is built from the same keys,
# so the two stay aligned.
# .values replaces DataFrame.as_matrix(), which is deprecated and absent from
# current pandas; both return the same ndarray.
X_OOB= pd.DataFrame({'cb':cb.target.values,
                     'xgb':xgb.target.values,
                     'lgb1':lgb1.target.values,
                     'lgb3':lgb3.target.values,
                     'lgb4':lgb4.target.values,
                     'rgf':X_rgf_train,
                     'harl':harl.target.values
#                      'froza':froza.target.values
#                      'keras':keras.target.values[index_train]
#                      'lrbase':lrbase.target.values
                     }).values

In [21]:
# Assemble the stacker's test matrix: one column of test-set scores per base
# model, plus the external RGF and "harless" scores.
# NOTE: this rebinds `cb`, `xgb`, `lgb1`, `lgb3`, `lgb4` and `harl`.
cb= pd.read_csv('internals_test/cb_test.csv')
xgb= pd.read_csv('internals_test/xgb_test.csv')
lgb1= pd.read_csv('internals_test/lgbm1_test.csv')
lgb3= pd.read_csv('internals_test/lgbm3_test.csv')
lgb4= pd.read_csv('internals_test/lgbm4_test.csv')
harl= pd.read_csv('../wp023/internals_test/harless_test.csv')
# froza= pd.read_csv('../wp023/internals_test/froza_test.csv')
# keras= pd.read_csv('internals/keras_test.csv')
# lrbase= pd.read_csv('internals/LRbase_test.csv')

# The dict keys must match those used for the training matrix so that the
# columns line up.
# .values replaces DataFrame.as_matrix(), which is deprecated and absent from
# current pandas; both return the same ndarray.
X_TEST= pd.DataFrame({'cb':cb.target.values,
                     'xgb':xgb.target.values,
                     'lgb1':lgb1.target.values,
                     'lgb3':lgb3.target.values,
                     'lgb4':lgb4.target.values,
                     'rgf':X_rgf_test,
                     'harl':harl.target.values
#                      'froza':froza.target.values
#                      'keras':keras.target.values[index_test]
#                      'lrbase':lrbase.target.values
                     }).values

In [23]:
# Cross-validate the top-level stacker on the OOB matrix, RGF column included.
# `internal` lists the X_OOB columns the stacker may use (all seven here).
internal= [0,1,2,3,4,5,6]
# Legacy sklearn.cross_validation API: labels go to the constructor and the
# object is iterated directly.
folds = StratifiedKFold(y_train,n_folds=40, shuffle=True, random_state=2016)
y_pred= np.zeros(len(y_train))
for i, (itr,ite) in enumerate(folds):
    # A fresh logistic regression per fold, fitted on the fold's training rows
    # and scored on its held-out rows.  After the loop `stacker` holds the
    # model from the final fold only.
    stacker= sklearn.linear_model.LogisticRegression(C=1.0,class_weight='balanced',penalty='l2')
    stacker.fit( X_OOB[np.ix_(itr,internal)] , y_train[itr] )
    y_pred[ite]= stacker.predict_proba(X_OOB[np.ix_(ite,internal)])[:,1]
# Gini = 2*AUC - 1 on the out-of-fold scores.  The trailing numeric literals
# are constants printed next to it (presumably earlier results kept for
# comparison - confirm).
print 'CV Gini score= ',2.*sklearn.metrics.roc_auc_score(y_train,y_pred)-1.,0.286699739102,0.289887071208,0.2870

CV Gini score=  0.291194763656 0.286699739102 0.289887071208 0.287


In [18]:
# The CV loop leaves `stacker` fitted on the last fold's training split only.
# Refit on every OOB row so the test predictions come from a model trained on
# all the data and do not depend on which fold ran last.
stacker.fit(X_OOB[:, internal], y_train)
y_pred= stacker.predict_proba(X_TEST[:,internal])[:,1]
# print 'TEST Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.,0.286699739102,0.287116870117,0.292955968548

Now let's try the bagging approach (which will take longer).  Note: the approach above appears to be better!

In [14]:
import sklearn.ensemble
from datetime import datetime,timedelta
# Bag the stacker left by the CV cell and score the test set with the ensemble.
# This fits on every column of X_OOB / X_TEST rather than the `internal`
# column selection used by the CV cell.
tc= datetime.now()
clf= sklearn.ensemble.BaggingClassifier(stacker,
                                        n_estimators=128,
                                        oob_score=True,
                                        max_features=4,
                                        max_samples=0.8,
                                        random_state=1,
                                        n_jobs=-1
                                       )
clf.fit(X_OOB,y_train)
# Overwrites the notebook-level `y_pred` with the bagged test predictions.
y_pred= clf.predict_proba(X_TEST)[:,1]
print 'Training took: ',(datetime.now() - tc).total_seconds()

Training took:  56.774913


In [15]:
# Column 1 of the bagged stacker's out-of-bag decision function, one value per
# training row.
y_pred2= clf.oob_decision_function_[:,1]
# Indices of rows with a finite OOB score.  np.isfinite is applied to the whole
# array at once; the previous map(np.isfinite, ...) looped in Python and yields
# an iterator on Python 3, which np.where cannot use.
ok= np.where(np.isfinite(y_pred2))[0]
# Prints "<finite rows> <total rows>"; same text as the original print statement.
print('%d %d' % (len(ok), len(y_pred2)))

595212 595212


In [16]:
# Gini (2*AUC - 1) of the bagged stacker on rows with a finite OOB score.
# The trailing literals are constants printed alongside it (presumably earlier
# results kept for comparison - confirm).
print 'OOB Gini score= ',2.*sklearn.metrics.roc_auc_score(y_train[ok],y_pred2[ok])-1.,0.286463220363,'(0.286760096353)'
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.,0.291740847787,'(0.292343541304)'

OOB Gini score=  0.29086944693 0.286463220363 (0.286760096353)


In [19]:
# out_df= pd.DataFrame( {'id': test_ids.id.values, 'target': y_pred},
#                      columns=['id','target']
#                     ).to_csv('submission_wp023b.csv',index=False)