This is a simpler modelling stack: models that did not prove useful have been pulled from it.  Strangely, the regularization on the top level of the stack must be a light L1 penalty.  It produces 0.28966 in the local test and just over 0.285 on the leaderboard (LB).  Position: 387 (386 was the bottom of bronze).

In [1]:
import numpy as np, pandas as pd,matplotlib.pyplot as plt
%matplotlib inline



In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import copy

In [3]:
def rank(array):
    """Return the fractional rank of every element of a 1-D numpy array.

    The smallest element maps to 0.0 and the largest to (n - 1) / n, where
    n is ``len(array)``.  ``array`` must provide ``argsort`` (i.e. be a
    numpy array); the relative order of tied values is whatever
    ``argsort`` yields.
    """
    count = len(array)
    order = array.argsort()
    positions = np.empty(count, float)
    # Scatter 0..count-1 back onto the original positions of the sorted items.
    positions[order] = np.arange(count)
    return positions / float(count)

In [4]:
class Layer(object):
    """One level of a stacked ensemble.

    Every base model is fitted once per cross-validation fold.  Its
    predictions on the held-out part of each fold form that model's column
    of out-of-fold training features; its predictions on the test matrix
    are averaged across folds in log-odds space and mapped back to a
    probability.

    Parameters
    ----------
    n_splits : int
        Number of ``StratifiedKFold`` splits.
    base_models : sequence
        Estimators exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1
        of ``predict_proba`` is used as the score.  The estimators are
        refitted in place on every fold.
    """

    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models

    @staticmethod
    def _log_odds(probs, eps=1e-15):
        """Return log(p / (1 - p)) with p clipped to [eps, 1 - eps].

        Without clipping, a probability of exactly 0 or 1 gives -inf / +inf,
        and averaging opposite infinities across folds gives NaN.
        """
        clipped = np.clip(probs, eps, 1.0 - eps)
        return np.log(clipped) - np.log(1.0 - clipped)

    def fit_predict(self, X, y, T):
        """Fit all base models fold by fold and build the next-level features.

        Parameters
        ----------
        X, y : array-like
            Training features and labels (converted with ``np.array``).
        T : array-like
            Test features (converted with ``np.array``).

        Returns
        -------
        (S_train, S_test)
            ``S_train`` has shape (n_train, n_models) and holds out-of-fold
            probabilities; ``S_test`` has shape (n_test, n_models) and holds
            the fold-averaged test probabilities.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        # Fixed seed so every base model sees exactly the same folds.
        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            # Class name rather than str(clf): estimators without a
            # parameter-style repr would otherwise log "<... object at 0x...>".
            model_name = clf.__class__.__name__
            fold_log_odds = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, holdout_idx) in enumerate(folds):
                print ("Fit %s fold %d" % (model_name, j + 1))
                clf.fit(X[train_idx], y[train_idx])

                # Out-of-fold probabilities become the training feature.
                S_train[holdout_idx, i] = clf.predict_proba(X[holdout_idx])[:, 1]

                # Test predictions are accumulated per fold as log-odds.
                fold_log_odds[:, j] = self._log_odds(clf.predict_proba(T)[:, 1])

            # Average in log-odds space, then map back through the sigmoid.
            S_test[:, i] = 1.0 / (1.0 + np.exp(-fold_log_odds.mean(axis=1)))
        return S_train, S_test

In [5]:
# Add output of layer results for quick iteration
class Stack:
    """Stacked ensemble: a chain of ``Layer`` objects feeding a top-level model.

    Parameters
    ----------
    k_folds : int
        Number of cross-validation folds handed to every ``Layer``.
    hidden_layers : list of lists
        One list of base models per layer; each becomes a ``Layer``.
    top_layer : estimator
        Final model; must expose ``fit(X, y)`` and ``predict_proba(X)``.
    saveInternalVectors : bool
        When true, the train/test matrices produced by each layer are
        written to ``STACK_internal_{train,test}_layer_<i>.bin``.
    """

    def __init__(self,k_folds,hidden_layers,top_layer,saveInternalVectors=False):
        self.saveInternalVectors = saveInternalVectors
        self.layers = [Layer(k_folds, models) for models in hidden_layers]
        self.top_layer = top_layer

    @staticmethod
    def _as_column(scores):
        """Reshape ``scores`` into an (n, 1) column so it can be concatenated."""
        return np.reshape(scores, (np.shape(scores)[0], 1))

    @staticmethod
    def _dump(fname, matrix):
        """Write ``matrix`` with ``np.save`` to exactly ``fname``.

        An open handle is passed (rather than the name) so numpy does not
        append ``.npy``; the ``with`` block guarantees the handle is closed.
        """
        with open(fname, 'wb') as handle:
            np.save(handle, matrix)

    def fit_predict(self,X,y,T,external_base_scores= None):
        """Fit every layer and the top model, then score ``T``.

        Parameters
        ----------
        X, y : training features and labels.
        T : test features.
        external_base_scores : optional pair ``(train_scores, test_scores)``
            Appended as one extra column to the output of the first layer.

        Returns
        -------
        Column 1 of ``top_layer.predict_proba`` on the transformed test data.
        """
        Xt_train = copy.deepcopy(X)
        Xt_test = copy.deepcopy(T)
        for i, layer in enumerate(self.layers, 1):
            print('Fitting stack layer ' + str(i))
            Xt_train, Xt_test = layer.fit_predict(Xt_train, y, Xt_test)
            if external_base_scores is not None and i == 1:
                Xt_train = np.concatenate(
                    (Xt_train, self._as_column(external_base_scores[0])), axis=1)
                Xt_test = np.concatenate(
                    (Xt_test, self._as_column(external_base_scores[1])), axis=1)
            #
            # Disabled experiment: add entropy score from layer
#             train_entropy= np.array(map(lambda i: np.sum(Xt_train[i] * np.log(Xt_train[i])), 
#                                         range(np.shape(Xt_train)[0] )))
#             Xt_train= np.concatenate( (Xt_train,np.reshape(train_entropy,(len(train_entropy),1))) ,axis=1)
#             test_entropy= np.array(map(lambda i: np.sum(Xt_test[i] * np.log(Xt_test[i])), 
#                                         range(np.shape(Xt_test)[0] )))
#             Xt_test= np.concatenate( (Xt_test,np.reshape(test_entropy,(len(test_entropy),1))) ,axis=1)
            #
#             # Disabled experiment: rank transform
#             for jj in range(np.shape(Xt_train)[1]):
#                 Xt_train[:,jj]= rank(Xt_train[:,jj])
#                 Xt_test[:,jj]= rank(Xt_test[:,jj])
#             #
            if self.saveInternalVectors:
                self._dump('STACK_internal_train_layer_' + str(i) + '.bin', Xt_train)
                self._dump('STACK_internal_test_layer_' + str(i) + '.bin', Xt_test)
        # Disabled experiment: logit-transform the features before the top layer
#         for i in range(np.shape(Xt_train)[1]): #-1 so we don't apply to entropy!!
#             p= copy.deepcopy(Xt_train[:,i])
#             Xt_train[:,i]= np.log( p ) - np.log(1.0 - p)
        self.top_layer.fit(Xt_train, y)
        return self.top_layer.predict_proba(Xt_test)[:, 1]

Now specify the stack

## Now score the actual data

In [6]:
# Read data
#
# The matrices are stored under a '.bin' name but are read with np.load.
# The path is passed directly so that numpy opens and closes the file
# itself; the previous open(...) handles were never closed.

# train
X_train = np.load('full_train_matrix.bin')
y_train = np.load('full_train_labels.bin')

# test
X_test = np.load('blind_test_matrix.bin')
# Labels for the test matrix are not loaded (line kept for reference):
# y_test= np.load(open('blind_test_labels.bin','rb'))

In [7]:
# keeps= np.load(open('xgboost_rfe_keepers.bin','rb'))

In [8]:
# import sklearn.linear_model
# lr= sklearn.linear_model.LogisticRegression(C=10000.0,class_weight={0:1.,1.:10/0.034},penalty='l1')
# lr.fit(X_train,y_train)
# y_pred= lr.predict_proba(X_test)[:,1]
# print 'Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.

In [9]:
# Read only the 'id' column of the test CSV; the ids are used further down
# to label the rows of the submission file.
test_ids= pd.read_csv('../data/test.csv',usecols=['id'])

In [10]:
import lightgbm.sklearn
import xgboost.sklearn
import catboost
import sklearn.linear_model
import sklearn.neural_network

# --- Hyper-parameters -------------------------------------------------------

# LightGBM model 1
lgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 1300,
    'max_bin': 10,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 500,
    'num_leaves': 25,
    'n_jobs': 8,
}

# LightGBM model 3: depth-limited trees
lgb_params_3 = dict(
    learning_rate=0.02,
    n_estimators=800,
    max_depth=4,
    n_jobs=8,
)

# LightGBM model 4: higher learning rate, more leaves
lgb_params_4 = dict(
    learning_rate=0.05,
    n_estimators=600,
    num_leaves=35,
    min_child_samples=500,
    n_jobs=8,
)

# XGBoost
xgb_params = dict(
    learning_rate=0.07,
    n_estimators=525,
    max_depth=4,
    nthread=8,
    subsample=0.8,
    min_child_weight=0.77,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='auc',
    seed=99,
    silent=True,
    scale_pos_weight=1.6,
    reg_alpha=8,
    reg_lambda=1.3,
    gamma=10,
)

# CatBoost
cb_params = dict(
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=14,
    iterations=650,
    verbose=False,
    loss_function='Logloss',
)

# --- Layer 1: base models ---------------------------------------------------
lgbm1 = lightgbm.sklearn.LGBMClassifier(**lgb_params)
xgbm1 = xgboost.sklearn.XGBClassifier(**xgb_params)
lgbm3 = lightgbm.sklearn.LGBMClassifier(**lgb_params_3)
lgbm4 = lightgbm.sklearn.LGBMClassifier(**lgb_params_4)
cb = catboost.CatBoostClassifier(**cb_params)

# --- Top layer: lightly L1-regularised logistic regression -------------------
stacker = sklearn.linear_model.LogisticRegression(
    C=500.0,
    class_weight='balanced',
    penalty='l1',
)

# --- The stack: 10 folds, one hidden layer, internal vectors saved to disk ---
# The order of the base models fixes the column order of the layer output.
stack = Stack(
    10,
    [[cb, xgbm1, lgbm1, lgbm3, lgbm4]],
    stacker,
    saveInternalVectors=True,
)

In [11]:
# Fit the whole stack on the training data, score the test matrix and report
# the elapsed wall-clock time in seconds.
from datetime import datetime
tc = datetime.now()
y_pred = stack.fit_predict(X_train, y_train, X_test)
# Parenthesised single-argument print: same output under Python 2 and 3.
print(' Training took ' + str((datetime.now() - tc).total_seconds()))

Fitting stack layer 1
Fit <catboost.core.CatBoostClassifier object at 0x14509bb90> fold 1
Fit <catboost.core.CatBoostClassifier object at 0x14509bb90> fold 2
Fit <catboost.core.CatBoostClassifier object at 0x14509bb90> fold 3
Fit <catboost.core.CatBoostClassifier object at 0x14509bb90> fold 4
Fit <catboost.core.CatBoostClassifier object at 0x14509bb90> fold 5
Fit <catboost.core.CatBoostClassifier object at 0x14509bb90> fold 6
Fit <catboost.core.CatBoostClassifier object at 0x14509bb90> fold 7
Fit <catboost.core.CatBoostClassifier object at 0x14509bb90> fold 8
Fit <catboost.core.CatBoostClassifier object at 0x14509bb90> fold 9
Fit <catboost.core.CatBoostClassifier object at 0x14509bb90> fold 10
Fit XGBClassifier fold 1
Fit XGBClassifier fold 2
Fit XGBClassifier fold 3
Fit XGBClassifier fold 4
Fit XGBClassifier fold 5
Fit XGBClassifier fold 6
Fit XGBClassifier fold 7
Fit XGBClassifier fold 8
Fit XGBClassifier fold 9
Fit XGBClassifier fold 10
Fit LGBMClassifier fold 1
Fit LGBMClassifier f

In [12]:
# print 'Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.

In [13]:
# Hand-recorded reference scores (literals, not computed here).  The format
# strings reproduce the spacing of the former comma-separated print
# statements while also being valid under Python 3.
print('Best local score=  %s %s' % (0.290737925496, 0.289662696292))
print('Baseline %s' % 0.288711072302)

Best local score=  0.290737925496 0.289662696292
Baseline 0.288711072302


In [14]:
# Build the submission table (id, target) and write it to CSV.
# The frame is bound to out_df *before* writing: DataFrame.to_csv returns
# None when given a path, so chaining it left out_df set to None.
out_df = pd.DataFrame({'id': test_ids.id.values, 'target': y_pred},
                      columns=['id', 'target'])
out_df.to_csv('submission_wp014b.csv', index=False)

In [38]:
# Re-fit only the top-level model on the saved layer-1 outputs, keeping
# columns 0, 2, 3 and 4.  Assuming the files were written by the stack
# defined above ([cb, xgbm1, lgbm1, lgbm3, lgbm4]), this drops column 1,
# i.e. the XGBoost scores.
#
# NOTE(review): this overwrites the global X_train / X_test with layer
# outputs, and y_test is not defined anywhere in the visible code (its load
# is commented out) -- this cell presumably ran in a session with a labelled
# local hold-out; confirm before re-running.
stacker = sklearn.linear_model.LogisticRegression(C=500., class_weight='balanced', penalty='l1')
X_train = np.load('STACK_internal_train_layer_1.bin')[:, [0, 2, 3, 4]]
X_test = np.load('STACK_internal_test_layer_1.bin')[:, [0, 2, 3, 4]]
stacker.fit(X_train, y_train)
y_pred = stacker.predict_proba(X_test)[:, 1]
# Gini = 2 * AUC - 1.  Format string keeps the original two-space output and
# is valid under both Python 2 and 3.
print('Gini score=  %s' % (2. * sklearn.metrics.roc_auc_score(y_test, y_pred) - 1.))

Gini score=  0.290126214944


In [23]:
# Bare literal: evaluating the cell only echoes the value back.
# Presumably a score noted for reference -- its origin is not shown here.
0.289780485339

0.289780485339

In [8]:
# Stand-alone XGBoost run with early stopping: up to 1000 trees, stopping
# once the AUC on (X_test, y_test) has not improved for 50 rounds.
#
# NOTE(review): y_test is not defined in the visible code, and X_train /
# X_test are whatever the session held when this cell ran -- confirm the
# inputs before re-running.  Early stopping is driven by the same set that
# is scored below, so the printed Gini is optimistic.
xgb_params = {
    'learning_rate': 0.07,
    'n_estimators': 1000,  # previously tried: 525, 354
    'max_depth': 4,
    'nthread': 8,
    'subsample': 0.8,
    'min_child_weight': 0.77,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 99,
    'silent': True,
    'scale_pos_weight': 1.6,
    'reg_alpha': 8,
    'reg_lambda': 1.3,
    'gamma': 10,
}
xgbm1 = xgboost.sklearn.XGBClassifier(**xgb_params)
xgbm1.fit(X_train, y_train, eval_metric='auc', early_stopping_rounds=50,
          eval_set=[(X_test, y_test)])
y_pred = xgbm1.predict_proba(X_test)[:, 1]
# Gini = 2 * AUC - 1.  Format string keeps the original two-space output and
# is valid under both Python 2 and 3.
print('Gini score=  %s' % (2. * sklearn.metrics.roc_auc_score(y_test, y_pred) - 1.))

[0]	validation_0-auc:0.587442
Will train until validation_0-auc hasn't improved in 50 rounds.
[1]	validation_0-auc:0.604111
[2]	validation_0-auc:0.604609
[3]	validation_0-auc:0.609857
[4]	validation_0-auc:0.610012
[5]	validation_0-auc:0.613922
[6]	validation_0-auc:0.614897
[7]	validation_0-auc:0.615186
[8]	validation_0-auc:0.615893
[9]	validation_0-auc:0.615365
[10]	validation_0-auc:0.615606
[11]	validation_0-auc:0.616957
[12]	validation_0-auc:0.616898
[13]	validation_0-auc:0.61743
[14]	validation_0-auc:0.617757
[15]	validation_0-auc:0.618349
[16]	validation_0-auc:0.618636
[17]	validation_0-auc:0.619403
[18]	validation_0-auc:0.619486
[19]	validation_0-auc:0.619779
[20]	validation_0-auc:0.62052
[21]	validation_0-auc:0.621023
[22]	validation_0-auc:0.621524
[23]	validation_0-auc:0.62188
[24]	validation_0-auc:0.621653
[25]	validation_0-auc:0.621614
[26]	validation_0-auc:0.622377
[27]	validation_0-auc:0.622772
[28]	validation_0-auc:0.622941
[29]	validation_0-auc:0.623317
[30]	validation_0-a

In [57]:
# Feature selection by XGBoost importance: the two commented lines show how
# `keep` and `keep2` were produced (indices of features with non-zero
# importance).
# NOTE(review): with both lines commented out, `keep` and `keep2` must already
# exist in the session or the assignment below raises NameError.  Presumably
# keep2 came from a refit on the columns selected by keep -- confirm.
# keep= np.where(xgbm1.feature_importances_ > 0.)[0]
# keep2= np.where(xgbm1.feature_importances_ > 0.)[0]
# Compose the two selections: positions in keep2 index into keep.
keeps= keep[keep2]

In [58]:
# np.save(open('xgboost_rfe_keepers.bin','wb'),keeps)

In [7]:
import catboost

In [15]:
# CatBoost configuration for a stand-alone run.
MAX_ROUNDS = 650          # number of boosting iterations
OPTIMIZE_ROUNDS = False   # not read anywhere in the visible code
LEARNING_RATE = 0.05

# The constants above are now referenced instead of repeating the literals
# 0.05 and 650, so the two cannot drift apart.
cb_params = {
    'learning_rate': LEARNING_RATE,
    'depth': 6,
    'l2_leaf_reg': 14,
    'iterations': MAX_ROUNDS,
    'verbose': True,
    'loss_function': 'Logloss',
}

cb = catboost.CatBoostClassifier(**cb_params)

In [10]:
# Fit the CatBoost classifier `cb` on the current X_train / y_train.
# NOTE(review): which `cb` and which X_train depends on cell execution order.
cb.fit(X_train,y_train)

0: learn: 0.6202101	total: 475ms	remaining: 5m 8s
1: learn: 0.5567888	total: 909ms	remaining: 4m 54s
2: learn: 0.5023358	total: 1.35s	remaining: 4m 50s
3: learn: 0.4557553	total: 1.78s	remaining: 4m 47s
4: learn: 0.4152165	total: 2.19s	remaining: 4m 43s
5: learn: 0.3805192	total: 2.63s	remaining: 4m 42s
6: learn: 0.3507592	total: 3.06s	remaining: 4m 40s
7: learn: 0.3251094	total: 3.48s	remaining: 4m 39s
8: learn: 0.3031735	total: 3.89s	remaining: 4m 37s
9: learn: 0.2841162	total: 4.45s	remaining: 4m 44s
10: learn: 0.2678054	total: 4.76s	remaining: 4m 36s
11: learn: 0.253743	total: 5.22s	remaining: 4m 37s
12: learn: 0.2413729	total: 5.65s	remaining: 4m 36s
13: learn: 0.2306661	total: 6.17s	remaining: 4m 40s
14: learn: 0.2213532	total: 6.59s	remaining: 4m 39s
15: learn: 0.2133104	total: 7.23s	remaining: 4m 46s
16: learn: 0.2063741	total: 7.67s	remaining: 4m 45s
17: learn: 0.2003758	total: 7.94s	remaining: 4m 38s
18: learn: 0.1949752	total: 8.44s	remaining: 4m 40s
19: learn: 0.1902638	tot

<catboost.core._CatBoostBase at 0x110bed850>

In [12]:
# Score the fitted CatBoost model and report the Gini coefficient
# (Gini = 2 * AUC - 1).
# NOTE(review): y_test is not defined in the visible code; presumably a
# labelled local hold-out was loaded in the session -- confirm.
import sklearn.metrics
y_pred = cb.predict_proba(X_test)[:, 1]
# Format string keeps the original two-space output and is valid under both
# Python 2 and 3.
print('Gini score=  %s' % (2. * sklearn.metrics.roc_auc_score(y_test, y_pred) - 1.))

 Gini score=  0.285913088474
