This is a simpler modelling stack: models that did not prove useful have been pulled from the stack.  Strangely, the regularization on the top level of the stack must be a light L1 penalty.  It produces 0.28966 in the local test and just over 0.285 on the leaderboard (LB).  Position: 387 (386 was the bottom of bronze).

In [26]:
import numpy as np, pandas as pd,matplotlib.pyplot as plt
%matplotlib inline

In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import copy

In [28]:
def rank(array):
    """Map each element of a 1-D numpy array to its fractional rank.

    The smallest element receives 0.0 and the largest ``(n - 1) / n`` where
    ``n = len(array)``; the ordering of equal elements is whatever
    ``array.argsort()`` produces.
    """
    order = array.argsort()
    n = len(array)
    positions = np.empty(n, float)
    # Scatter 0..n-1 back to the original location of each sorted element.
    positions[order] = np.arange(n)
    return positions / float(n)

In [29]:
# Load the array stored in xgboost_rfe_keepers.bin. The context manager
# closes the file handle, which the bare open() call used to leak.
with open('xgboost_rfe_keepers.bin', 'rb') as keepers_file:
    keeps = np.load(keepers_file)

class Layer(object):
    """One stacking layer: out-of-fold predictions from several base models.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds used to build the out-of-fold predictions.
    base_models : sequence
        Classifiers exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1
        of ``predict_proba`` is used as the positive-class probability.
    """

    # Test probabilities are clipped into [_EPS, 1 - _EPS] before the logit so
    # that a prediction of exactly 0 or 1 cannot produce +/-inf (or NaN once
    # opposite infinities are averaged across folds).
    _EPS = 1e-15

    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        """Fit every base model fold-by-fold and return stacked features.

        Parameters
        ----------
        X : array-like
            Training feature matrix.
        y : array-like
            Training labels, also used to stratify the folds.
        T : array-like
            Feature matrix to score (the "test" set).

        Returns
        -------
        (S_train, S_test) : tuple of ndarray
            ``S_train[:, i]`` holds model ``i``'s out-of-fold probability for
            each training row.  ``S_test[:, i]`` holds model ``i``'s test
            probability, obtained by averaging the per-fold predictions in
            log-odds space and mapping the mean back through the sigmoid.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        # Fixed seed: every model sees the same folds.
        splitter = StratifiedKFold(n_splits=self.n_splits, shuffle=True,
                                   random_state=2016)
        folds = list(splitter.split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                print("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X[train_idx], y[train_idx])

                # Out-of-fold prediction for the held-out training rows.
                S_train[test_idx, i] = clf.predict_proba(X[test_idx])[:, 1]

                # Cast to float64 first: with float32 output the upper clip
                # bound would round to exactly 1.0 and the clip would not help.
                test_probs = np.asarray(clf.predict_proba(T)[:, 1], dtype=float)
                test_probs = np.clip(test_probs, self._EPS, 1.0 - self._EPS)
                S_test_i[:, j] = np.log(test_probs) - np.log(1.0 - test_probs)

            # Average in log-odds space, then map back to a probability.
            agg_lor = S_test_i.mean(axis=1)
            S_test[:, i] = 1.0 / (1.0 + np.exp(-agg_lor))
        return S_train, S_test

In [30]:
# Add output of layer results for quick iteration
class Stack(object):
    """Stacked ensemble: a chain of hidden layers feeding a top classifier.

    Parameters
    ----------
    k_folds : int
        Number of folds handed to every hidden ``Layer``.
    hidden_layers : sequence of sequences
        One list of base models per hidden layer, in order.
    top_layer : estimator
        Final classifier exposing ``fit(X, y)`` and ``predict_proba(X)``.
    saveInternalVectors : bool, optional
        When true, the train/test matrices produced after each hidden layer
        are written with ``np.save`` to
        ``STACK_internal_{train,test}_layer_<i>.bin``.
    """

    def __init__(self,k_folds,hidden_layers,top_layer,saveInternalVectors=False):
        self.saveInternalVectors = saveInternalVectors
        self.layers = [Layer(k_folds, models) for models in hidden_layers]
        self.top_layer = top_layer

    @staticmethod
    def _append_column(matrix, scores):
        """Return ``matrix`` with ``scores`` appended as one extra column."""
        column = np.reshape(scores, (np.shape(scores)[0], 1))
        return np.concatenate((matrix, column), axis=1)

    @staticmethod
    def _dump(fname, array):
        """Write ``array`` to ``fname`` via ``np.save`` and close the file."""
        with open(fname, 'wb') as handle:
            np.save(handle, array)

    def fit_predict(self,X,y,T,external_base_scores= None):
        """Run all hidden layers, fit the top layer, and score ``T``.

        Parameters
        ----------
        X, y : array-like
            Training features and labels.
        T : array-like
            Features to score.
        external_base_scores : pair (train_scores, test_scores), optional
            Precomputed scores appended as one extra column to the output of
            the first hidden layer only.

        Returns
        -------
        ndarray
            Column 1 of ``top_layer.predict_proba`` on the transformed ``T``.
        """
        # Work on copies so the caller's inputs are never modified.
        Xt_train = copy.deepcopy(X)
        Xt_test = copy.deepcopy(T)
        for i, layer in enumerate(self.layers, 1):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('Fitting stack layer '+str(i))
            Xt_train, Xt_test = layer.fit_predict(Xt_train, y, Xt_test)
            if external_base_scores is not None and i == 1:
                Xt_train = self._append_column(Xt_train, external_base_scores[0])
                Xt_test = self._append_column(Xt_test, external_base_scores[1])
            #
            # Disabled experiment: add entropy score from layer
#             train_entropy= np.array(map(lambda i: np.sum(Xt_train[i] * np.log(Xt_train[i])), 
#                                         range(np.shape(Xt_train)[0] )))
#             Xt_train= np.concatenate( (Xt_train,np.reshape(train_entropy,(len(train_entropy),1))) ,axis=1)
#             test_entropy= np.array(map(lambda i: np.sum(Xt_test[i] * np.log(Xt_test[i])), 
#                                         range(np.shape(Xt_test)[0] )))
#             Xt_test= np.concatenate( (Xt_test,np.reshape(test_entropy,(len(test_entropy),1))) ,axis=1)
            #
#             # Disabled experiment: rank transform
#             for jj in range(np.shape(Xt_train)[1]):
#                 Xt_train[:,jj]= rank(Xt_train[:,jj])
#                 Xt_test[:,jj]= rank(Xt_test[:,jj])
#             #
            if self.saveInternalVectors:
                self._dump('STACK_internal_train_layer_'+str(i)+'.bin', Xt_train)
                self._dump('STACK_internal_test_layer_'+str(i)+'.bin', Xt_test)
        # Disabled experiment: logit-transform the top-layer inputs
#         for i in range(np.shape(Xt_train)[1]): #-1 so we don't apply to entropy!!
#             p= copy.deepcopy(Xt_train[:,i])
#             Xt_train[:,i]= np.log( p ) - np.log(1.0 - p)
        self.top_layer.fit(Xt_train, y)
        return self.top_layer.predict_proba(Xt_test)[:, 1]

Now specify the stack

## Now score the actual data

In [31]:
# Read data
# Each file is opened in a context manager so its handle is closed as soon as
# the array has been read (the bare open() calls used to leak the handles).

# train
with open('../wp013/full_train_matrix.bin', 'rb') as train_matrix_file:
    X_train = np.load(train_matrix_file)
with open('../wp013/full_train_labels.bin', 'rb') as train_labels_file:
    y_train = np.load(train_labels_file)

# test
with open('../wp013/blind_test_matrix.bin', 'rb') as test_matrix_file:
    X_test = np.load(test_matrix_file)
# y_test= np.load(open('../wp013/blind_test_labels.bin','rb'))

In [32]:
# keeps= np.load(open('xgboost_rfe_keepers.bin','rb'))

In [33]:
# import sklearn.linear_model
# lr= sklearn.linear_model.LogisticRegression(C=10000.0,class_weight={0:1.,1.:10/0.034},penalty='l1')
# lr.fit(X_train,y_train)
# y_pred= lr.predict_proba(X_test)[:,1]
# print 'Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.

In [34]:
# Row identifiers of the test set; only the `id` column is read from the CSV.
test_ids = pd.read_csv('../data/test.csv', usecols=['id'])

In [35]:
import lightgbm.sklearn
import xgboost.sklearn
import sklearn.linear_model
import sklearn.neural_network

# Hyper-parameters for the first LightGBM model.
lgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 1300,
    'max_bin': 10,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 500,
    'num_leaves': 25,
    'n_jobs': 8,
}

# Hyper-parameters for the depth-limited LightGBM model.
lgb_params_3 = {
    'learning_rate': 0.02,
    'n_estimators': 800,
    'max_depth': 4,
    'n_jobs': 8,
}

# Hyper-parameters for the LightGBM model with a higher learning rate.
lgb_params_4 = {
    'learning_rate': 0.05,
    'n_estimators': 600,
    'num_leaves': 35,
    'min_child_samples': 500,
    'n_jobs': 8,
}

# Hyper-parameters for the XGBoost model.
xgb_params = {
    'learning_rate': 0.07,
    'n_estimators': 2000,  # 525,
    'max_depth': 4,
    'nthread': 8,
    'subsample': 0.8,
    'min_child_weight': 0.77,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 99,
    'silent': True,
    'scale_pos_weight': 1.6,
    'reg_alpha': 8,
    'reg_lambda': 1.3,
    'gamma': 10,
}

# Layer 1
lgbm1 = lightgbm.sklearn.LGBMClassifier(**lgb_params)
xgbm1 = xgboost.sklearn.XGBClassifier(**xgb_params)
lgbm3 = lightgbm.sklearn.LGBMClassifier(**lgb_params_3)
lgbm4 = lightgbm.sklearn.LGBMClassifier(**lgb_params_4)

# Top layer
# The solver is named explicitly: an l1 penalty needs a solver that supports
# it. 'liblinear' was scikit-learn's default when this was written; the newer
# default ('lbfgs') rejects penalty='l1'.
stacker = sklearn.linear_model.LogisticRegression(C=500.0,
                                                  class_weight='balanced',
                                                  penalty='l1',
                                                  solver='liblinear')

# Define the stack: 10 folds, one hidden layer of four models, then the stacker.
stack = Stack(10, [[xgbm1, lgbm1, lgbm3, lgbm4]], stacker)

In [36]:
# Fit and predict, timing the whole stack run.
from datetime import datetime
tc = datetime.now()
y_pred = stack.fit_predict(X_train, y_train, X_test)  # ,
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print(' Training took ' + str((datetime.now() - tc).total_seconds()))

Fitting stack layer 1
Fit XGBClassifier fold 1
Fit XGBClassifier fold 2
Fit XGBClassifier fold 3
Fit XGBClassifier fold 4
Fit XGBClassifier fold 5
Fit XGBClassifier fold 6
Fit XGBClassifier fold 7
Fit XGBClassifier fold 8
Fit XGBClassifier fold 9
Fit XGBClassifier fold 10
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 5
Fit LGBMClassifier fold 6
Fit LGBMClassifier fold 7
Fit LGBMClassifier fold 8
Fit LGBMClassifier fold 9
Fit LGBMClassifier fold 10
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 5
Fit LGBMClassifier fold 6
Fit LGBMClassifier fold 7
Fit LGBMClassifier fold 8
Fit LGBMClassifier fold 9
Fit LGBMClassifier fold 10
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 5
Fit LGBMClassifier fold 6
Fit LGBMClassifier fold 7
Fit LGBMClassifier fold

In [37]:
# print 'Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.

In [38]:
# Hand-recorded scores kept for reference. Each line is formatted into one
# string so the printed text is the same under Python 2 and Python 3
# (a multi-argument print(...) would print a tuple on Python 2).
print('Best local score=  %s %s' % (0.290737925496, 0.289662696292))
print('Baseline %s' % 0.288711072302)

Best local score=  0.290737925496 0.289662696292
Baseline 0.288711072302


In [39]:
# Build the submission table, then write it out. The frame is bound before
# calling to_csv: chaining the two used to leave `out_df` holding the None
# returned by to_csv instead of the DataFrame.
out_df = pd.DataFrame({'id': test_ids.id.values, 'target': y_pred},
                      columns=['id', 'target'])
out_df.to_csv('submission_wp013d.csv', index=False)

In [8]:
# Stand-alone XGBoost fit with early stopping on a labelled evaluation set.
# NOTE(review): this cell reads `y_test`, whose only assignment visible in
# this notebook is commented out, so it presumably dates from a run with a
# labelled local hold-out. Confirm before re-running.
import sklearn.metrics  # explicit: roc_auc_score is used below

xgb_params = {
    'learning_rate': 0.07,
    'n_estimators': 1000,  # 525, #,354
    'max_depth': 4,
    'nthread': 8,
    'subsample': 0.8,
    'min_child_weight': 0.77,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 99,
    'silent': True,
    'scale_pos_weight': 1.6,
    'reg_alpha': 8,
    'reg_lambda': 1.3,
    'gamma': 10,
}
xgbm1 = xgboost.sklearn.XGBClassifier(**xgb_params)
xgbm1.fit(X_train, y_train, eval_metric='auc', early_stopping_rounds=50,
          eval_set=[(X_test, y_test)])
y_pred = xgbm1.predict_proba(X_test)[:, 1]
# Gini = 2 * AUC - 1. Formatted into a single string so the output is the
# same under Python 2 and Python 3.
print('Gini score=  %s' % (2. * sklearn.metrics.roc_auc_score(y_test, y_pred) - 1.))

[0]	validation_0-auc:0.587442
Will train until validation_0-auc hasn't improved in 50 rounds.
[1]	validation_0-auc:0.604111
[2]	validation_0-auc:0.604609
[3]	validation_0-auc:0.609857
[4]	validation_0-auc:0.610012
[5]	validation_0-auc:0.613922
[6]	validation_0-auc:0.614897
[7]	validation_0-auc:0.615186
[8]	validation_0-auc:0.615893
[9]	validation_0-auc:0.615365
[10]	validation_0-auc:0.615606
[11]	validation_0-auc:0.616957
[12]	validation_0-auc:0.616898
[13]	validation_0-auc:0.61743
[14]	validation_0-auc:0.617757
[15]	validation_0-auc:0.618349
[16]	validation_0-auc:0.618636
[17]	validation_0-auc:0.619403
[18]	validation_0-auc:0.619486
[19]	validation_0-auc:0.619779
[20]	validation_0-auc:0.62052
[21]	validation_0-auc:0.621023
[22]	validation_0-auc:0.621524
[23]	validation_0-auc:0.62188
[24]	validation_0-auc:0.621653
[25]	validation_0-auc:0.621614
[26]	validation_0-auc:0.622377
[27]	validation_0-auc:0.622772
[28]	validation_0-auc:0.622941
[29]	validation_0-auc:0.623317
[30]	validation_0-a

In [57]:
# keep= np.where(xgbm1.feature_importances_ > 0.)[0]
# keep2= np.where(xgbm1.feature_importances_ > 0.)[0]
# NOTE(review): `keep` and `keep2` are only assigned in the commented-out
# lines above, so this statement relies on values left over from earlier
# interactive runs. Presumably `keep2` indexes into the feature subset that
# `keep` selected, so `keep[keep2]` maps back to original column indices.
# Confirm before reuse.
keeps= keep[keep2]

In [58]:
# np.save(open('xgboost_rfe_keepers.bin','wb'),keeps)

In [53]:
0.288190290095

0.288190290095

In [54]:
xgbm1.feature_importances_ 

array([ 0.05974843,  0.0932914 ,  0.00052411,  0.05398323,  0.04874214,
        0.04192872,  0.0927673 ,  0.01415094,  0.01572327,  0.11268344,
        0.04507338,  0.02568134,  0.02935011,  0.03249476,  0.00314465,
        0.01572327,  0.00681342,  0.00052411,  0.00052411,  0.00524109,
        0.00628931,  0.00733753,  0.00262054,  0.03563941,  0.00104822,
        0.01100629,  0.00471698,  0.00157233,  0.01048218,  0.00366876,
        0.00209644,  0.01100629,  0.00052411,  0.00890985,  0.00524109,
        0.00681342,  0.0115304 ,  0.00157233,  0.00157233,  0.00262054,
        0.00209644,  0.00471698,  0.00209644,  0.00995807,  0.01415094,
        0.00314465,  0.        ,  0.00733753,  0.01572327,  0.00262054,
        0.00209644,  0.00104822,  0.        ,  0.01257862,  0.01519916,
        0.00943396,  0.01310273,  0.0115304 ,  0.03459119,  0.00419287], dtype=float32)