This is a simpler modelling stack: models that were not useful have been pulled from it.  Strangely, the regularization on the top level of the stack must be a light L1 penalty.  It scores 0.28966 in the local test and just over 0.285 on the leaderboard (LB).  Position: 387 (386 was the bottom of bronze).

In [1]:
import numpy as np, pandas as pd,matplotlib.pyplot as plt
%matplotlib inline



In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import StratifiedKFold
import copy



In [3]:
def rank(array):
    """Return each element's sort position divided by the array length.

    The smallest element maps to 0.0 and the largest to ``(n - 1) / n`` with
    ``n = len(array)``.  The order among equal values is whatever
    ``array.argsort()`` produces.
    """
    n_items = len(array)
    order = array.argsort()
    positions = np.empty(n_items, float)
    # order[k] is the index of the k-th smallest value, so scatter k back there.
    positions[order] = np.arange(n_items)
    return positions / float(n_items)

In [4]:
class Layer(object):
    """One stacking layer: out-of-fold predictions from several base models.

    ``base_models`` is a sequence of ``(pipeline, classifier)`` pairs.  Each
    pipeline is called as ``pipeline(fit_frame, apply_frame)`` and must return
    ``(X_fit, y_fit, X_apply, y_apply)``; each classifier must provide ``fit``
    and ``predict_proba``.
    """

    # Probabilities are clipped into [eps, 1 - eps] before the logit so that a
    # fold predicting exactly 0 or 1 cannot produce +/-inf (or NaN when mixed).
    _PROB_EPS = 1e-15

    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.pipelines = [pair[0] for pair in base_models]
        self.base_models = [pair[1] for pair in base_models]

    @staticmethod
    def _average_in_logit_space(fold_probs):
        """Average per-fold probabilities (one column per fold) as log-odds.

        Returns a 1-D array with one blended probability per row.
        """
        clipped = np.clip(fold_probs, Layer._PROB_EPS, 1.0 - Layer._PROB_EPS)
        log_odds = np.log(clipped) - np.log(1.0 - clipped)
        return 1.0 / (1.0 + np.exp(-log_odds.mean(axis=1)))

    def fit_predict(self, df_train, y, df_test):
        """Fit every base model fold by fold and return stacked features.

        Parameters
        ----------
        df_train : DataFrame handed (in fold slices) to each pipeline.
        y : labels used to stratify the folds.
        df_test : DataFrame scored by every fold model.

        Returns
        -------
        (train_frame, test_frame) : two DataFrames with one column ``x_<i>``
            per base model.  The train frame holds out-of-fold probabilities;
            the test frame holds the per-fold test probabilities averaged in
            log-odds space.
        """
        y = np.array(y)

        # NOTE(review): this is the legacy splitter signature (labels passed to
        # the constructor, ``n_folds`` keyword) matching the
        # sklearn.cross_validation import at the top of the notebook.  The
        # object is iterated once per base model, so every model sees the
        # same folds.
        folds = StratifiedKFold(y, n_folds=self.n_splits, shuffle=True, random_state=2016)

        n_models = len(self.base_models)
        S_train = np.zeros((len(df_train.index), n_models))
        S_test = np.zeros((len(df_test.index), n_models))
        for i, clf in enumerate(self.base_models):
            PIPELINE = self.pipelines[i]
            # Readable names for the progress line, independent of repr().
            pipe_name = getattr(PIPELINE, '__name__', type(PIPELINE).__name__)
            clf_name = type(clf).__name__
            S_test_i = np.zeros((len(df_test.index), self.n_splits))
            for j, (train_idx, val_idx) in enumerate(folds):
                # The fold indices are row positions (they also index S_train
                # below), so select by position rather than by index label.
                fold_train = df_train.iloc[train_idx]
                # Each pipeline call gets its own copies, so in-place edits
                # made by one call cannot leak into the next.
                X_train, y_train, X_holdout, _ = PIPELINE(
                    fold_train.copy(),
                    df_train.iloc[val_idx].copy()
                )
                # The test set is re-encoded against this fold's training rows.
                _, _, X_test, _ = PIPELINE(fold_train.copy(), df_test.copy())

                print("Fit %s --> %s fold %d" % (pipe_name, clf_name, j + 1))
                clf.fit(X_train, y_train)

                S_train[val_idx, i] = clf.predict_proba(X_holdout)[:, 1]
                S_test_i[:, j] = clf.predict_proba(X_test)[:, 1]
            S_test[:, i] = self._average_in_logit_space(S_test_i)

        columns = ['x_' + str(k) for k in range(n_models)]
        return pd.DataFrame(S_train, columns=columns), pd.DataFrame(S_test, columns=columns)

In [5]:
# Adds optional output of per-layer results for quick iteration
class Stack:
    """A sequence of stacking layers followed by a top-level classifier.

    Parameters
    ----------
    k_folds : number of folds given to every hidden ``Layer``.
    hidden_layers : list of layers; each layer is the list of
        ``(pipeline, classifier)`` pairs passed on to ``Layer``.
    top_layer : classifier with ``fit`` and ``predict_proba`` trained on the
        last layer's outputs.
    saveInternalVectors : when true, each layer's train/test outputs are
        written to ``STACK_internal_{train,test}_layer_<i>.bin`` via ``np.save``.
    """

    def __init__(self,k_folds,hidden_layers,top_layer,saveInternalVectors=False):
        self.saveInternalVectors = saveInternalVectors
        self.layers = [Layer(k_folds, h) for h in hidden_layers]
        self.top_layer = top_layer

    @staticmethod
    def _save_matrix(fname, frame):
        """Write ``frame`` with ``np.save`` and close the file afterwards."""
        with open(fname, 'wb') as handle:
            np.save(handle, frame)

    def fit_predict(self,df_train,y,df_test,external_base_scores= None):
        """Run all layers, fit the top layer and return test probabilities.

        ``external_base_scores``, when given, is a ``(train_scores,
        test_scores)`` pair appended as one extra feature column to the
        outputs of the first layer.  Returns column 1 of the top layer's
        ``predict_proba`` on the final test features.
        """
        Xt_train = df_train
        Xt_test = df_test
        for i, layer in enumerate(self.layers, 1):
            print('Fitting stack layer ' + str(i))
            Xt_train, Xt_test = layer.fit_predict(Xt_train, y, Xt_test)
            # The label column travels with the train frame so the next
            # layer's pipelines receive it alongside the features.
            Xt_train.loc[:, 'target'] = y
            if external_base_scores is not None and i == 1:
                # Keep both sides as DataFrames: converting to ndarrays here
                # would break the 'target' drop below and any following layer.
                Xt_train['x_external'] = np.reshape(external_base_scores[0], -1)
                Xt_test['x_external'] = np.reshape(external_base_scores[1], -1)
            #
            if self.saveInternalVectors:
                self._save_matrix('STACK_internal_train_layer_' + str(i) + '.bin', Xt_train)
                self._save_matrix('STACK_internal_test_layer_' + str(i) + '.bin', Xt_test)
        # Not in place: with no hidden layers Xt_train is the caller's frame.
        top_features = Xt_train.drop('target', axis=1)
        self.top_layer.fit(top_features.values, y)
        return self.top_layer.predict_proba(Xt_test.values)[:, 1]

Now specify the stack

## Now score the actual data

In [6]:
# # Read data

# # train
# X_train= np.load(open('../WP014/dummy_train_matrix.bin','rb'))
# y_train= np.load(open('../WP014/dummy_train_labels.bin','rb'))

# # test
# X_test= np.load(open('../WP014/dummy_test_matrix.bin','rb'))
# y_test= np.load(open('../WP014/dummy_test_labels.bin','rb'))

In [7]:
import sklearn.model_selection, pipe1,pipe2,pipe3

# Load the train and test tables from CSV
df_train= pd.read_csv('../data/train.csv')
df_test= pd.read_csv('../data/test.csv')

# index_train, index_test= sklearn.model_selection.train_test_split( range(len(df_train.index)) , 
#                                                                     test_size=0.3,random_state=1)

# df_test= df_train.loc[index_test,:].reset_index(drop=True)
# y_test= df_test.target.values
# df_train= df_train.loc[index_train,:].reset_index(drop=True)
# y_train= df_train.target.values

In [8]:
# Read only the 'id' column of the test file; it labels the rows of the submission written later
test_ids= pd.read_csv('../data/test.csv',usecols=['id'])

In [9]:
import lightgbm.sklearn
import xgboost.sklearn
import catboost
import sklearn.linear_model, sklearn.ensemble
import sklearn.neural_network

# Hyper-parameters for the LightGBM model `lgbm1`
lgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 1300,
    'max_bin': 10,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 500,
    'num_leaves': 25,
    'n_jobs': 8,
}


# lgb_params_3 = {
#     'learning_rate': 0.02,
#     'n_estimators': 800,
#     'max_depth': 4,
#     'n_jobs':8
# }
# Hyper-parameters for the LightGBM model `lgbm3` (depth-limited trees)
lgb_params_3 = dict(
    learning_rate=0.02,
    n_estimators=800,  # 150 is noted as an alternative
    max_depth=4,
    n_jobs=8,
    # min_child_samples=100
)

# Hyper-parameters for the LightGBM model `lgbm4` (leaf-limited trees)
lgb_params_4 = dict(
    learning_rate=0.05,
    n_estimators=600,
    num_leaves=35,
    min_child_samples=500,
    n_jobs=8,
)


# Hyper-parameters for the XGBoost model `xgbm1`
xgb_params = dict(
    learning_rate=0.07,
    n_estimators=525,
    max_depth=4,
    nthread=8,
    subsample=0.8,
    min_child_weight=6.0,  # kernel now changed this to 0.77
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='auc',
    seed=99,
    silent=True,
    scale_pos_weight=1.6,
    reg_alpha=8,
    reg_lambda=1.3,
    gamma=10,
)

# Hyper-parameters for the CatBoost model `cb`
cb_params = dict(
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=14,
    iterations=650,
    verbose=False,
    loss_function='Logloss',
)

# lgbmn_params= {'num_leaves': 81, 'verbose': 1, 'learning_rate': 0.005, 
#                'min_data': 650, 'categorical_column': [], 'bagging_fraction': 0.9, 
#                'metric': ['auc'], 'boosting_type': 'gbdt', 'lambda_l1': 30,
#                'bagging_freq': 3, 'lambda_l2': 0, 'is_unbalance': True, 
#                'max_bin': 255, 'objective': ['binary'], 'max_depth': 6, 
#                'feature_fraction': 0.7,'n_estimators':1600
#               }

# Layer 1: each base model is a (pipeline, classifier) pair, all using pipe1.run_pipe1
lgbm1 = (pipe1.run_pipe1,  lightgbm.sklearn.LGBMClassifier(**lgb_params))
xgbm1= (pipe1.run_pipe1,   xgboost.sklearn.XGBClassifier(**xgb_params))
lgbm3 = (pipe1.run_pipe1,  lightgbm.sklearn.LGBMClassifier(**lgb_params_3))
lgbm4 = (pipe1.run_pipe1,  lightgbm.sklearn.LGBMClassifier(**lgb_params_4))
cb= (pipe1.run_pipe1,      catboost.CatBoostClassifier(**cb_params))
# lgbmn = (pipe3.run_pipe3,  lightgbm.sklearn.LGBMClassifier(**lgbmn_params))

# Top layer: lightly regularised L1 logistic regression.  The solver is named
# explicitly because the L1 penalty is only accepted by some solvers, so the
# stacker must not depend on whichever solver is the library default.
stacker= sklearn.linear_model.LogisticRegression(C=500.0,class_weight='balanced',penalty='l1',
                                                 solver='liblinear')

# Define the stack: 10 folds, a single hidden layer of five base models
stack = Stack(10,[ [cb,xgbm1,lgbm1,lgbm3,lgbm4] ], stacker) #, saveInternalVectors=True)   

In [10]:
# Fit the stack on the full training frame, predict the test frame and time the run
from datetime import datetime
tc= datetime.now()
y_pred = stack.fit_predict(df_train, df_train.target.values, df_test) #,
# Single-argument print in call form: same output, and it parses on Python 2 and 3
print(' Training took '+str( (datetime.now() - tc).total_seconds()))

Fitting stack layer 1
Fit run_pipe1 --> <catboost.core.CatBoostClassifier object at 0x1151a2790> fold 1
Fit run_pipe1 --> <catboost.core.CatBoostClassifier object at 0x1151a2790> fold 2
Fit run_pipe1 --> <catboost.core.CatBoostClassifier object at 0x1151a2790> fold 3
Fit run_pipe1 --> <catboost.core.CatBoostClassifier object at 0x1151a2790> fold 4
Fit run_pipe1 --> <catboost.core.CatBoostClassifier object at 0x1151a2790> fold 5
Fit run_pipe1 --> <catboost.core.CatBoostClassifier object at 0x1151a2790> fold 6
Fit run_pipe1 --> <catboost.core.CatBoostClassifier object at 0x1151a2790> fold 7
Fit run_pipe1 --> <catboost.core.CatBoostClassifier object at 0x1151a2790> fold 8
Fit run_pipe1 --> <catboost.core.CatBoostClassifier object at 0x1151a2790> fold 9
Fit run_pipe1 --> <catboost.core.CatBoostClassifier object at 0x1151a2790> fold 10
Fit run_pipe1 --> XGBClassifier fold 1
Fit run_pipe1 --> XGBClassifier fold 2
Fit run_pipe1 --> XGBClassifier fold 3
Fit run_pipe1 --> XGBClassifier fold 4
F

In [11]:
# print 'Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.,0.290094358949

Added ps_car_11 prevalence encoding, ps_calc_14, and restored separate encodings of folds in stacking CV.

In [12]:
0.289139167662, 0.289187617155

(0.289139167662, 0.289187617155)

In [13]:
# Hand-recorded score pairs kept for reference (literals, nothing is computed here).
# Formatted through %s so the printed text is unchanged and the cell parses on Python 2 and 3.
print('Best local score=  %s %s' % (0.290737925496, 0.289662696292))
print('Baseline %s %s' % (0.288711072302, 0.290006985327))

Best local score=  0.290737925496 0.289662696292
Baseline 0.288711072302 0.290006985327


In [14]:
# Build the submission frame first and write it in a separate step: to_csv
# returns None when given a path, so chaining it would leave out_df as None.
out_df= pd.DataFrame( {'id': test_ids.id.values, 'target': y_pred},
                     columns=['id','target']
                    )
out_df.to_csv('submission_wp015d.csv',index=False)

In [15]:
import sklearn.linear_model
# NOTE(review): this l2 stacker is reassigned (to an l1 one) in the next cell before it is fitted.
stacker= sklearn.linear_model.LogisticRegression(C=500.,class_weight='balanced',penalty='l2')
# Reload layer-1 vectors; the file names match what Stack.fit_predict writes when
# saveInternalVectors is set.
# NOTE(review): columns 1-5 are selected by position; which features those are depends on
# the run that wrote the files -- confirm the saved 'target' column is not among them.
X_train= np.load('STACK_internal_train_layer_1.bin')[:,[1,2,3,4,5]]
X_test= np.load('STACK_internal_test_layer_1.bin')[:,[1,2,3,4,5]]

# for i in range(np.shape(X_train)[1]):
#     X_train[:,i]= rank(X_train[:,i])
# for i in range(np.shape(X_test)[1]):
#     X_test[:,i]= rank(X_test[:,i])
# Xcross_train= np.zeros([np.shape(X_train)[0], np.shape(X_train)[1]**2 - np.shape(X_train)[1]])
# for iv in range(np.shape(X_train)[0]):
#     for i in range(np.shape(X_train)[1]-1):
#         for j in range(i+1,np.shape(X_train)[1]):
#             Xcross_train[iv,i*j]= X_train[iv,i] * X_train[iv,j]
# Xcross_test= np.zeros([np.shape(X_test)[0], np.shape(X_test)[1]**2 - np.shape(X_test)[1]])
# for iv in range(np.shape(X_test)[0]):
#     for i in range(np.shape(X_test)[1]-1):
#         for j in range(i+1,np.shape(X_test)[1]):
#             Xcross_test[iv,i*j]= X_test[iv,i] * X_test[iv,j]
# for i in range(np.shape(Xcross_test)[1]):
#     Xcross_test[:,i]= rank(Xcross_test[:,i])
# for i in range(np.shape(Xcross_train)[1]):
#     Xcross_train[:,i]= rank(Xcross_train[:,i])

In [16]:
# roc_auc_score lives in sklearn.metrics; import it here rather than relying on
# another import having loaded the submodule as a side effect.
import sklearn.metrics

# Re-fit only the top-level stacker on the reloaded layer outputs.  The solver
# is named explicitly because the L1 penalty is only accepted by some solvers.
stacker= sklearn.linear_model.LogisticRegression(C=500.,class_weight='balanced',penalty='l1',
                                                 solver='liblinear')
stacker.fit(X_train,y_train)
y_pred= stacker.predict_proba(X_test)[:,1]
# Gini = 2 * AUC - 1; %s formatting keeps the printed text and parses on Python 2 and 3
print('Gini score=  %s' % (2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.))

Gini score=  0.290443587595


In [17]:
0.29044380853

0.29044380853

In [18]:
# XGBoost settings for a stand-alone fit with early stopping: up to 1000 trees
# and min_child_weight 0.77 (the layer-1 settings above use 525 and 6.0).
xgb_params= {'learning_rate': 0.07,
             'n_estimators':1000, #525, #,354
             'max_depth': 4, 
             'nthread':8,
             'subsample': 0.8,
             'min_child_weight':0.77,
             'colsample_bytree': 0.8, 
             'objective': 'binary:logistic', 
             'eval_metric': 'auc', 
             'seed': 99, 
             'silent': True,
             'scale_pos_weight': 1.6,
             'reg_alpha':8,
             'reg_lambda':1.3,
             'gamma':10
            }
# Rebinds xgbm1 from the (pipeline, classifier) tuple used for the stack to a bare classifier.
xgbm1= xgboost.sklearn.XGBClassifier(**xgb_params)
# NOTE(review): early stopping watches (X_test, y_test), the same data the Gini is
# computed on below, so the printed score is likely optimistic.
xgbm1.fit(X_train,y_train,eval_metric='auc',early_stopping_rounds=50,eval_set=[(X_test,y_test)])
y_pred= xgbm1.predict_proba(X_test)[:,1]
# Gini = 2 * AUC - 1
print 'Gini score= ',2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.

[0]	validation_0-auc:0.628198
Will train until validation_0-auc hasn't improved in 50 rounds.
[1]	validation_0-auc:0.628893
[2]	validation_0-auc:0.636688
[3]	validation_0-auc:0.636717
[4]	validation_0-auc:0.63682
[5]	validation_0-auc:0.637537
[6]	validation_0-auc:0.639109
[7]	validation_0-auc:0.641839
[8]	validation_0-auc:0.642356
[9]	validation_0-auc:0.643074
[10]	validation_0-auc:0.64321
[11]	validation_0-auc:0.643156
[12]	validation_0-auc:0.64316
[13]	validation_0-auc:0.643393
[14]	validation_0-auc:0.643363
[15]	validation_0-auc:0.643354
[16]	validation_0-auc:0.643406
[17]	validation_0-auc:0.643378
[18]	validation_0-auc:0.643479
[19]	validation_0-auc:0.64346
[20]	validation_0-auc:0.643446
[21]	validation_0-auc:0.643894
[22]	validation_0-auc:0.643898
[23]	validation_0-auc:0.643877
[24]	validation_0-auc:0.643965
[25]	validation_0-auc:0.643996
[26]	validation_0-auc:0.644043
[27]	validation_0-auc:0.643943
[28]	validation_0-auc:0.643999
[29]	validation_0-auc:0.644016
[30]	validation_0-au

In [19]:
# NOTE(review): `keep` and `keep2` are only assigned in the two commented-out lines below,
# so running this cell as written raises NameError (see the recorded output).
# keep= np.where(xgbm1.feature_importances_ > 0.)[0]
# keep2= np.where(xgbm1.feature_importances_ > 0.)[0]
keeps= keep[keep2]

NameError: name 'keep' is not defined

In [None]:
# np.save(open('xgboost_rfe_keepers.bin','wb'),keeps)

In [None]:
import catboost

In [None]:
# CatBoost settings for a stand-alone fit.  The dict reads the constants below
# instead of repeating their values, so the two cannot drift apart.
MAX_ROUNDS = 650
OPTIMIZE_ROUNDS = False  # NOTE(review): not read anywhere in this cell
LEARNING_RATE = 0.05
cb_params= {
    'learning_rate': LEARNING_RATE,
    'depth':6,
    'l2_leaf_reg': 14,
    'iterations': MAX_ROUNDS,
    'verbose': True,
    'loss_function':'Logloss'
    }

cb= catboost.CatBoostClassifier(**cb_params)

In [None]:
# Fit the CatBoost model on the X_train / y_train currently defined in the session
cb.fit(X_train,y_train)

In [None]:
import sklearn.metrics
# Score the CatBoost model on X_test; Gini = 2 * AUC - 1
y_pred= cb.predict_proba(X_test)[:,1]
# %s formatting keeps the printed text and parses on Python 2 and 3
print('Gini score=  %s' % (2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.))

In [None]:
import sklearn.model_selection, pipe3

# Reload the data and carve a 30% hold-out out of the training file for local scoring
df_train= pd.read_csv('../data/train.csv')
df_test= pd.read_csv('../data/test.csv')

# Split row positions (not the rows themselves); both halves are selected from df_train below
index_train, index_test= sklearn.model_selection.train_test_split( range(len(df_train.index)) , 
                                                                    test_size=0.3,random_state=1)

# The df_test read from CSV is replaced by the hold-out slice; both frames get a fresh 0..n-1 index
df_test= df_train.loc[index_test,:].reset_index(drop=True)
y_test= df_test.target.values
df_train= df_train.loc[index_train,:].reset_index(drop=True)
y_train= df_train.target.values

# The pipeline's returned labels overwrite the y_train / y_test assigned just above
X_train,y_train,X_test,y_test= pipe3.run_pipe3(df_train.copy(),df_test.copy())

In [None]:
import lightgbm
# LightGBM settings for the bagged model (rebinds lgb_params_3)
lgb_params_3 = dict(
    learning_rate=0.02,
    n_estimators=150,
    max_depth=4,
    n_jobs=8,
    # early_stopping_round=50,
    min_child_samples=100,
)

# Ten LightGBM models, each fitted on a 30% sample, combined by BaggingClassifier
lgbmn = lightgbm.sklearn.LGBMClassifier(**lgb_params_3)
bagger = sklearn.ensemble.BaggingClassifier(
    lgbmn,
    n_estimators=10,
    max_samples=0.3,
    random_state=1234,
)
bagger.fit(X_train, y_train)
# lgbmn.fit(X_train,y_train,eval_metric='auc',eval_set=(X_test,y_test))

In [None]:
import sklearn.metrics
# Score the bagged model on X_test; Gini = 2 * AUC - 1
y_pred= bagger.predict_proba(X_test)[:,1]
# %s formatting keeps the printed text and parses on Python 2 and 3
print('Gini score=  %s' % (2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.))

In [None]:
0.289135174661

In [3]:
# Load the stack's submission file and a second prediction file to blend with it
a= pd.read_csv('submission_wp015d.csv')
b= pd.read_csv('gpx.csv')

In [4]:
# Equal-weight blend of the two target columns.  The average is taken row by
# row, so it is only meaningful when both files list the same ids in the same
# order; fail loudly instead of silently blending mismatched rows.
if not a.id.equals(b.id):
    raise ValueError('submission ids differ between the two files; cannot blend row-wise')
c= 0.5*a.target.values + 0.5*b.target.values

In [5]:
# Pair the blended predictions with the ids taken from the first file
d= pd.DataFrame({'id':a.id.values, 'target':c},columns=['id','target'])

In [6]:
# Write the blended predictions to their own submission file (no index column)
d.to_csv('submission_wp015d_blend_gpx_0.5.csv',index=False)