This is a simpler modelling stack: models that did not prove useful have been pulled from it.  Strangely, the regularization on the top level of the stack must be a light L1 penalty.  It scores 0.28966 in the local test and just over 0.285 on the leaderboard (LB).  Position: 387 (386 was the bottom of bronze).

In [1]:
import numpy as np, pandas as pd,matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import StratifiedKFold
import copy



In [3]:
def rank(array):
    """Return each element's sort position scaled by the array length.

    The smallest element maps to 0.0 and the largest to (n - 1) / n, where
    n is ``len(array)``.  Equal values receive distinct positions, in
    whatever order ``array.argsort()`` happens to place them.
    """
    n = len(array)
    order = array.argsort()
    positions = np.empty(n, float)
    # Scatter 0..n-1 back to the original slots: the element that sorts
    # k-th ends up holding the value k.
    positions[order] = np.arange(n)
    return positions / float(n)

In [4]:
class Layer(object):
    """One stacking level built from (pipeline, classifier) pairs.

    For every pair, ``fit_predict`` produces one column of out-of-fold
    positive-class probabilities for the training rows and one column of
    fold-averaged probabilities for the test rows.
    """

    def __init__(self, n_splits, base_models):
        """Store the fold count and split the (pipeline, classifier) pairs.

        n_splits    -- number of stratified folds used by ``fit_predict``.
        base_models -- iterable of 2-item sequences: element 0 is the
                       pipeline callable, element 1 the classifier.
        """
        self.n_splits = n_splits
        self.pipelines = []
        self.base_models = []
        for pair in base_models:
            self.pipelines.append(pair[0])
            self.base_models.append(pair[1])

    @staticmethod
    def _log_odds(probs, eps=1e-15):
        """Return log(p / (1 - p)) with p clipped to [eps, 1 - eps].

        Without the clip a probability of exactly 0 or 1 yields -inf/+inf,
        and averaging opposite infinities across folds yields NaN.
        """
        clipped = np.clip(probs, eps, 1.0 - eps)
        return np.log(clipped) - np.log(1.0 - clipped)

    def fit_predict(self, df_train, y, df_test):
        """Fit every base model fold by fold and return stacked features.

        df_train -- training DataFrame handed (in slices) to each pipeline.
        y        -- labels; used here only to build the stratified folds.
        df_test  -- test DataFrame handed to each pipeline.

        Each pipeline is called as ``pipeline(train_part, other_frame)`` and
        must return a 4-tuple ``(X_train, y_train, X_other, y_other)``.

        Returns ``(train_frame, test_frame)``: DataFrames with one column
        ``x_<i>`` per base model.  Train values are out-of-fold
        probabilities; test values are the per-fold probabilities averaged
        in log-odds space and mapped back through the logistic function.
        """
        y = np.array(y)

        folds = StratifiedKFold(y, n_folds=self.n_splits, shuffle=True, random_state=2016)

        n_models = len(self.base_models)
        S_train = np.zeros((len(df_train.index), n_models))
        S_test = np.zeros((len(df_test.index), n_models))
        for i, clf in enumerate(self.base_models):
            PIPELINE = self.pipelines[i]
            # One column of test log-odds per fold for this model.
            S_test_i = np.zeros((len(df_test.index), self.n_splits))
            for j, (train_idx, val_idx) in enumerate(folds):
                # The fold indices are positions (they also index S_train
                # below), so select rows with iloc rather than by label.
                # Every pipeline call gets fresh copies so it may mutate
                # its inputs freely.
                X_train, y_train, X_holdout, _ = PIPELINE(
                    df_train.iloc[train_idx].copy(),
                    df_train.iloc[val_idx].copy()
                )
                # Second call: same training part, but paired with the
                # test frame to obtain the transformed test features.
                _, _, X_test, _ = PIPELINE(
                    df_train.iloc[train_idx].copy(),
                    df_test.copy()
                )
                print ("Fit %s --> %s fold %d" % (
                    getattr(PIPELINE, '__name__', str(PIPELINE)),
                    type(clf).__name__,
                    j + 1))
                # The same classifier object is refitted on every fold.
                clf.fit(X_train, y_train)
                S_train[val_idx, i] = clf.predict_proba(X_holdout)[:, 1]

                test_probs = clf.predict_proba(X_test)[:, 1]
                S_test_i[:, j] = self._log_odds(test_probs)
            # Average the folds in log-odds space, then back to probability.
            agg_lor = S_test_i.mean(axis=1)
            S_test[:, i] = 1.0 / (1.0 + np.exp(-agg_lor))

        columns = ['x_' + str(k) for k in range(n_models)]
        return pd.DataFrame(S_train, columns=columns), pd.DataFrame(S_test, columns=columns)

In [5]:
# Add output of layer results for quick iteration
class Stack:
    """A sequence of hidden ``Layer`` objects topped by a final classifier."""

    def __init__(self,k_folds,hidden_layers,top_layer,saveInternalVectors=False):
        """Build one ``Layer`` per entry of ``hidden_layers``.

        k_folds             -- fold count passed to every ``Layer``.
        hidden_layers       -- iterable; each entry is the ``base_models``
                               argument for one ``Layer``.
        top_layer           -- estimator with ``fit`` and ``predict_proba``.
        saveInternalVectors -- when true, ``fit_predict`` writes each
                               layer's train/test output to disk.
        """
        self.saveInternalVectors = saveInternalVectors
        self.layers = [Layer(k_folds, h) for h in hidden_layers]
        self.top_layer = top_layer

    def fit_predict(self,df_train,y,df_test,external_base_scores= None):
        """Run every hidden layer, fit the top layer, and score the test set.

        df_train, df_test    -- input frames for the first layer.
        y                    -- training labels.
        external_base_scores -- optional ``(train_scores, test_scores)``
                                pair added as column ``'ext'`` to the output
                                of the first layer only.

        Returns column 1 of ``top_layer.predict_proba`` on the final test
        features.  The caller's ``df_train`` is not modified.
        """
        Xt_train = df_train
        Xt_test = df_test
        for i, layer in enumerate(self.layers, 1):
            print('Fitting stack layer ' + str(i))
            Xt_train, Xt_test = layer.fit_predict(Xt_train, y, Xt_test)
            # The label column rides along with the train output so it is
            # present for the next layer and in the saved train vectors.
            Xt_train.loc[:, 'target'] = y
            if external_base_scores is not None and i == 1:
                Xt_train.loc[:, 'ext'] = external_base_scores[0]
                Xt_test.loc[:, 'ext'] = external_base_scores[1]
            if self.saveInternalVectors:
                # np.save is given an open file object (not a path) so the
                # files keep the exact '.bin' names; 'with' closes them.
                fname = 'STACK_internal_train_layer_' + str(i) + '.bin'
                with open(fname, 'wb') as handle:
                    np.save(handle, Xt_train)
                fname = 'STACK_internal_test_layer_' + str(i) + '.bin'
                with open(fname, 'wb') as handle:
                    np.save(handle, Xt_test)
        # Non-inplace drop: with no hidden layers Xt_train is still the
        # caller's frame and must not be mutated.
        Xt_train = Xt_train.drop('target', axis=1)
        self.top_layer.fit(Xt_train.values, y)
        return self.top_layer.predict_proba(Xt_test.values)[:, 1]

Now specify the stack

## Now score the actual data

In [6]:
# # Read data

# # train
# X_train= np.load(open('../WP014/dummy_train_matrix.bin','rb'))
# y_train= np.load(open('../WP014/dummy_train_labels.bin','rb'))

# # test
# X_test= np.load(open('../WP014/dummy_test_matrix.bin','rb'))
# y_test= np.load(open('../WP014/dummy_test_labels.bin','rb'))

In [7]:
import sklearn.model_selection, pipe0_median as pipe0

# Load the raw train and test tables; both are passed to stack.fit_predict
# in the "Fit and predict" cell.
df_train, df_test = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))

# index_train, index_test= sklearn.model_selection.train_test_split( range(len(df_train.index)) , 
#                                                                     test_size=0.3,random_state=1)

# df_test= df_train.loc[index_test,:].reset_index(drop=True)
# y_test= df_test.target.values
# df_train= df_train.loc[index_train,:].reset_index(drop=True)
# y_train= df_train.target.values

In [8]:
# read rgf pipeline train and test scores
# X_rgf_train= pd.read_csv('../wp017/rgf_scores_train.csv').target.values
# X_rgf_test= pd.read_csv('../wp017/rgf_blind_scores.csv').target.values
# 'target' columns of two externally produced score files; they are later
# passed to stack.fit_predict as external_base_scores=(train, test).
# NOTE(review): the validation file name contains a space
# ('rgf_validation_ scores.csv') -- confirm it matches the file on disk.
X_rgf_train = pd.read_csv('../wp018/output_0.01_300leaf/rgf_validation_ scores.csv')['target'].values
X_rgf_test = pd.read_csv('../wp018/output_0.01_300leaf/rgf_blind_scores.csv')['target'].values
# X_rgf_test= X_rgf_train.target.values[index_test]
# X_rgf_train= X_rgf_train.target.values[index_train]

In [9]:
# Read only the 'id' column of the test file; the (currently commented-out)
# submission cell at the end of the notebook pairs these ids with y_pred.
test_ids= pd.read_csv('../data/test.csv',usecols=['id'])

In [10]:
import lightgbm.sklearn
import xgboost.sklearn
import catboost
import sklearn.linear_model, sklearn.ensemble
import sklearn.neural_network

# Keyword arguments for the first LightGBM base model (lgbm1).
lgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 1300,
    'max_bin': 10,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 500,
    'num_leaves': 25,
    'n_jobs': 8,
}


# lgb_params_3 = {
#     'learning_rate': 0.02,
#     'n_estimators': 800,
#     'max_depth': 4,
#     'n_jobs':8
# }
# Keyword arguments for the depth-limited LightGBM base model (lgbm3).
# Author's notes kept from the original cell: n_estimators carried the
# marker "#150", and 'min_child_samples': 100 was left disabled.
lgb_params_3 = dict(
    learning_rate=0.02,
    n_estimators=800,
    max_depth=4,
    n_jobs=8,
)

# Keyword arguments for the leaf-limited LightGBM base model (lgbm4).
lgb_params_4 = dict(
    learning_rate=0.05,
    n_estimators=600,
    num_leaves=35,
    min_child_samples=500,
    n_jobs=8,
)


# Keyword arguments for the XGBoost base model (xgbm1).
xgb_params = dict(
    learning_rate=0.07,
    n_estimators=525,
    max_depth=4,
    nthread=8,
    subsample=0.8,
    min_child_weight=6.0,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='auc',
    seed=99,
    silent=True,
    scale_pos_weight=1.6,
    reg_alpha=8,
    reg_lambda=1.3,
    gamma=10,
)

# Keyword arguments for the CatBoost base model (cb).
cb_params = dict(
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=14,
    iterations=650,
    verbose=False,
    loss_function='Logloss',
)

# lgbmn_params= {'num_leaves': 81, 'verbose': 1, 'learning_rate': 0.005, 
#                'min_data': 650, 'categorical_column': [], 'bagging_fraction': 0.9, 
#                'metric': ['auc'], 'boosting_type': 'gbdt', 'lambda_l1': 30,
#                'bagging_freq': 3, 'lambda_l2': 0, 'is_unbalance': True, 
#                'max_bin': 255, 'objective': ['binary'], 'max_depth': 6, 
#                'feature_fraction': 0.7,'n_estimators':1600
#               }

# Layer 1
# Each base model is a (pipeline, classifier) pair -- the shape that
# Layer.__init__ unpacks.  All five share the pipe0.run_pipe0 pipeline.
lgbm1 = (pipe0.run_pipe0, lightgbm.sklearn.LGBMClassifier(**lgb_params))
xgbm1 = (pipe0.run_pipe0, xgboost.sklearn.XGBClassifier(**xgb_params))
lgbm3 = (pipe0.run_pipe0, lightgbm.sklearn.LGBMClassifier(**lgb_params_3))
lgbm4 = (pipe0.run_pipe0, lightgbm.sklearn.LGBMClassifier(**lgb_params_4))
cb = (pipe0.run_pipe0, catboost.CatBoostClassifier(**cb_params))
# lgbmn = (pipe3.run_pipe3,  lightgbm.sklearn.LGBMClassifier(**lgbmn_params))

# Top layer: lightly L1-regularised logistic regression.  The solver is
# named explicitly because the l1 penalty is only valid with some solvers
# (liblinear among them) and the library default has changed across
# scikit-learn releases.
stacker = sklearn.linear_model.LogisticRegression(C=500.0,
                                                  class_weight='balanced',
                                                  penalty='l1',
                                                  solver='liblinear')

# Define the stack: 10 folds, one hidden layer of five base models, and
# each layer's output vectors saved to disk.
stack = Stack(10, [[cb, xgbm1, lgbm1, lgbm3, lgbm4]], stacker, saveInternalVectors=True)

In [11]:
# Fit and predict
from datetime import datetime
# Fit the whole stack on the training frame and score the test frame; the
# RGF scores are supplied as the extra 'ext' column of the first layer.
tc = datetime.now()
y_pred = stack.fit_predict(df_train, df_train.target.values, df_test,
                           external_base_scores=(X_rgf_train, X_rgf_test))
# Call-form print with a single string prints the same text on Python 2
# and 3, matching the print call used inside Layer.fit_predict.
print(' Training took ' + str((datetime.now() - tc).total_seconds()))

Fitting stack layer 1
Fit run_pipe0 --> <catboost.core.CatBoostClassifier object at 0x1130a7d50> fold 1
Fit run_pipe0 --> <catboost.core.CatBoostClassifier object at 0x1130a7d50> fold 2
Fit run_pipe0 --> <catboost.core.CatBoostClassifier object at 0x1130a7d50> fold 3
Fit run_pipe0 --> <catboost.core.CatBoostClassifier object at 0x1130a7d50> fold 4
Fit run_pipe0 --> <catboost.core.CatBoostClassifier object at 0x1130a7d50> fold 5
Fit run_pipe0 --> <catboost.core.CatBoostClassifier object at 0x1130a7d50> fold 6
Fit run_pipe0 --> <catboost.core.CatBoostClassifier object at 0x1130a7d50> fold 7
Fit run_pipe0 --> <catboost.core.CatBoostClassifier object at 0x1130a7d50> fold 8
Fit run_pipe0 --> <catboost.core.CatBoostClassifier object at 0x1130a7d50> fold 9
Fit run_pipe0 --> <catboost.core.CatBoostClassifier object at 0x1130a7d50> fold 10
Fit run_pipe0 --> XGBClassifier fold 1
Fit run_pipe0 --> XGBClassifier fold 2
Fit run_pipe0 --> XGBClassifier fold 3
Fit run_pipe0 --> XGBClassifier fold 4
F

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.,0.290426135503

In [11]:
import sklearn.linear_model
# Top-level model for the experiments below: same settings as before but
# with an l2 penalty.
stacker = sklearn.linear_model.LogisticRegression(penalty='l2',
                                                  C=500.,
                                                  class_weight='balanced')

# Reload layer-1 vectors saved by an earlier run.
# NOTE(review): the indices below presumably follow the layout written by
# Stack.fit_predict for a five-model layer (train: x_0..x_4, target at 5,
# ext at 6; test: x_0..x_4, ext at 5) -- confirm against the saved run.
X_train = np.load('internals_withrgf_median_test_run4/STACK_internal_train_layer_1.bin')
X_test = np.load('internals_withrgf_median_test_run4/STACK_internal_test_layer_1.bin')

# Overwrite the stored external-score columns with the current RGF scores.
X_test[:, 5] = X_rgf_test
X_train[:, 6] = X_rgf_train

# Pull column 5 out as the label vector and remove it from the features.
y_train, X_train = X_train[:, 5], np.delete(X_train, 5, axis=1)

In [12]:
# OOF: 'target' columns of two more external score files, appended to the
# training matrix as its last two columns (harless first, then froza).
X_harl = pd.read_csv('../wp023/internals_test/harless_oob.csv')['target'].values
X_froza = pd.read_csv('../wp023/internals_test/froza_oof.csv')['target'].values
X_train = np.column_stack((X_train, X_harl, X_froza))

# Test: the matching test-side files, appended in the same order so the
# train and test column layouts stay aligned.
X_harl = pd.read_csv('../wp023/internals_test/harless_test.csv')['target'].values
X_froza = pd.read_csv('../wp023/internals_test/froza_test.csv')['target'].values
X_test = np.column_stack((X_test, X_harl, X_froza))

In [None]:
# import sklearn.preprocessing
# fvg= sklearn.preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
# X_train_pol= fvg.fit_transform(X_train)
# X_test_pol= fvg.fit_transform(X_test)

In [40]:
#  This for testing ensemble incl RGF
# roc_auc_score lives in sklearn.metrics, which no cell imports explicitly;
# import it here rather than rely on another sklearn import loading it.
import sklearn.metrics

# Columns of X_train fed to the top-level model in this experiment.
internal = [0, 1, 2, 3, 4, 5, 6, 7]
folds = StratifiedKFold(y_train, n_folds=10, shuffle=True, random_state=2016)

# Out-of-fold predictions: every row is scored by a model that was fitted
# without it.  Note this rebinds the notebook-level y_pred and stacker.
y_pred = np.zeros(len(y_train))
for itr, ite in folds:
    stacker = sklearn.linear_model.LogisticRegression(C=500., class_weight='balanced', penalty='l2')
    stacker.fit(X_train[np.ix_(itr, internal)], y_train[itr])
    y_pred[ite] = stacker.predict_proba(X_train[np.ix_(ite, internal)])[:, 1]

# Gini = 2 * AUC - 1.  The trailing constants are scores the author kept
# for side-by-side comparison.  Joining str() values with single spaces
# reproduces the Python 2 print-statement output and also runs on Python 3.
print(' '.join(str(v) for v in (
    'CV Gini score= ',
    2. * sklearn.metrics.roc_auc_score(y_train, y_pred) - 1.,
    0.286699739102, 0.282201168818, 0.289731866932)))

CV Gini score=  0.290674519781 0.286699739102 0.282201168818 0.289731866932


Now let's try the bagging approach (which will take longer)

In [19]:
import sklearn.ensemble
from datetime import datetime,timedelta
tc = datetime.now()
# lgb_params_3 = {
#     'learning_rate': 0.1,
#     'n_estimators': 115,
#     'max_depth': 1,
#     'n_jobs':1
# #     'eval_metric':'auc'
# }
# altStacker= xgboost.sklearn.XGBClassifier(**lgb_params_3)

# Bag 128 copies of the current stacker, each restricted to 4 features;
# oob_score=True makes the out-of-bag decision function available after
# fitting (it is read in the next cell).
clf = sklearn.ensemble.BaggingClassifier(stacker,
                                         n_estimators=128,
                                         oob_score=True,
                                         max_features=4,
                                         max_samples=1.0,
                                         random_state=3,
                                         n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:, 1]
# Joining str() values with a space reproduces the Python 2
# print-statement output and also runs on Python 3.
print(' '.join(str(v) for v in ('Training took: ', (datetime.now() - tc).total_seconds())))

Training took:  103.627528


In [20]:
# Out-of-bag positive-class scores from the bagged stacker.
y_pred2 = clf.oob_decision_function_[:, 1]
# Indices of rows with a finite OOB score.  np.isfinite is applied to the
# whole array at once instead of element by element through map(), which
# is faster and does not depend on map() returning a list.
ok = np.where(np.isfinite(y_pred2))[0]
# Prints "<finite count> <total count>".
print(' '.join(str(count) for count in (len(ok), len(y_pred2))))

595212 595212


In [21]:
# roc_auc_score lives in sklearn.metrics, which no cell imports explicitly;
# import it here rather than rely on another sklearn import loading it.
import sklearn.metrics

# Gini = 2 * AUC - 1 on the rows with a finite out-of-bag score.  The
# trailing values are scores the author kept for comparison.  Joining
# str() values with single spaces reproduces the Python 2 print-statement
# output and also runs on Python 3.
print(' '.join(str(v) for v in (
    'OOB Gini score= ',
    2. * sklearn.metrics.roc_auc_score(y_train[ok], y_pred2[ok]) - 1.,
    0.286463220363, '(0.286760096353)', 0.291032405649)))
# print 'Test Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.,0.291740847787,'(0.292343541304)'

OOB Gini score=  0.290849329423 0.286463220363 (0.286760096353) 0.291032405649


...and now output predictions...

In [76]:
# out_df= pd.DataFrame( {'id': test_ids.id.values, 'target': y_pred},
#                      columns=['id','target']
#                     ).to_csv('submission_wp024c.csv',index=False)