In [1]:
import numpy as np, pandas as pd,matplotlib.pyplot as plt
%matplotlib inline



In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import copy

In [3]:
def rank(array):
    """Return the fractional rank of every element of a 1-D array.

    The smallest element maps to 0 and the largest to (n - 1) / n, where
    n is ``len(array)``.  ``array`` must expose ``argsort`` (for example a
    NumPy array).  The result is a float array of length n.
    """
    n = len(array)
    order = array.argsort()
    # Scatter 0..n-1 into the slots named by the sort order, i.e. invert the
    # sorting permutation so that each position holds its own rank.
    positions = np.empty(n, float)
    positions[order] = np.arange(n)
    return positions / float(n)

In [4]:
class Layer(object):
    """One stacking layer: a set of base classifiers scored out-of-fold.

    Each base model is fitted once per stratified fold.  Its positive-class
    probabilities on the held-out rows become a training feature for the next
    layer, and its probabilities on the test matrix are averaged across folds
    in log-odds space.
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking the logit so
    # that a model emitting exactly 0 or 1 cannot inject +/-inf (or NaN, when
    # folds disagree) into the fold average.
    _EPS = 1e-15

    def __init__(self, n_splits, base_models):
        """
        n_splits    -- number of stratified folds used in ``fit_predict``.
        base_models -- sequence of classifiers exposing ``fit`` and
                       ``predict_proba``.
        """
        self.n_splits = n_splits
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        """Fit every base model fold by fold and build the stacked features.

        X -- training matrix, y -- training labels, T -- test matrix (each is
        converted with ``np.array``).

        Returns ``(S_train, S_test)``:
          S_train -- shape (len(X), n_models); out-of-fold positive-class
                     probabilities.
          S_test  -- shape (len(T), n_models); sigmoid of the mean per-fold
                     log-odds of the positive-class probability on T.

        The classifier objects in ``base_models`` are refitted in place on
        each fold, so after the call each one holds its last-fold fit.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        splitter = StratifiedKFold(n_splits=self.n_splits, shuffle=True,
                                   random_state=2016)
        # Materialise the folds once so every base model sees the same splits.
        folds = list(splitter.split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                print("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X[train_idx], y[train_idx])

                # Out-of-fold predictions fill only the held-out rows.
                S_train[test_idx, i] = clf.predict_proba(X[test_idx])[:, 1]

                # Cast to float64 before clipping: in float32 the upper bound
                # 1 - _EPS would round back to exactly 1.0.
                test_probs = np.asarray(clf.predict_proba(T)[:, 1],
                                        dtype=np.float64)
                test_probs = np.clip(test_probs, self._EPS, 1.0 - self._EPS)
                S_test_i[:, j] = np.log(test_probs) - np.log(1.0 - test_probs)

            # Average the folds in log-odds space, then map back to (0, 1).
            agg_lor = S_test_i.mean(axis=1)
            S_test[:, i] = 1.0 / (1.0 + np.exp(-agg_lor))
        return S_train, S_test

In [5]:
# Add output of layer results for quick iteration
class Stack(object):
    """A stack of hidden ``Layer`` objects topped by a single classifier.

    Each hidden layer turns its input features into one column of scores per
    base model; the last layer's scores are used to fit ``top_layer``.
    """

    def __init__(self,k_folds,hidden_layers,top_layer,saveInternalVectors=False):
        """
        k_folds             -- number of folds handed to every hidden Layer.
        hidden_layers       -- list of lists of base models, one inner list
                               per hidden layer, in order.
        top_layer           -- classifier with ``fit`` / ``predict_proba``
                               fitted on the last hidden layer's output.
        saveInternalVectors -- when true, each layer's train/test outputs are
                               written to 'STACK_internal_*_layer_<i>.bin'.
        """
        self.saveInternalVectors = saveInternalVectors
        self.layers = [Layer(k_folds, models) for models in hidden_layers]
        self.top_layer = top_layer

    @staticmethod
    def _append_column(matrix, scores):
        """Return ``matrix`` with ``scores`` appended as one extra column."""
        column = np.reshape(scores, (np.shape(scores)[0], 1))
        return np.concatenate((matrix, column), axis=1)

    @staticmethod
    def _save_array(fname, array):
        """Write ``array`` with np.save to exactly ``fname``, closing the file."""
        # A file object (not a path) is passed so np.save does not append '.npy'.
        with open(fname, 'wb') as handle:
            np.save(handle, array)

    def fit_predict(self,X,y,T,external_base_scores= None):
        """Fit all layers on (X, y) and return positive-class scores for T.

        external_base_scores -- optional pair ``(train_scores, test_scores)``;
            each is appended as one extra column to the output of the first
            hidden layer only.

        Returns column 1 of ``top_layer.predict_proba`` on the transformed T.
        """
        # Work on copies so the caller's X and T are not touched.
        Xt_train = copy.deepcopy(X)
        Xt_test = copy.deepcopy(T)
        for i, layer in enumerate(self.layers, 1):
            print('Fitting stack layer '+str(i))
            Xt_train, Xt_test = layer.fit_predict(Xt_train, y, Xt_test)
            if external_base_scores is not None and i == 1:
                Xt_train = self._append_column(Xt_train, external_base_scores[0])
                Xt_test = self._append_column(Xt_test, external_base_scores[1])
            #
            # Add entropy score from layer
#             train_entropy= np.array(map(lambda i: np.sum(Xt_train[i] * np.log(Xt_train[i])), 
#                                         range(np.shape(Xt_train)[0] )))
#             Xt_train= np.concatenate( (Xt_train,np.reshape(train_entropy,(len(train_entropy),1))) ,axis=1)
#             test_entropy= np.array(map(lambda i: np.sum(Xt_test[i] * np.log(Xt_test[i])), 
#                                         range(np.shape(Xt_test)[0] )))
#             Xt_test= np.concatenate( (Xt_test,np.reshape(test_entropy,(len(test_entropy),1))) ,axis=1)
            #
#             # Rank transform
#             for jj in range(np.shape(Xt_train)[1]):
#                 Xt_train[:,jj]= rank(Xt_train[:,jj])
#                 Xt_test[:,jj]= rank(Xt_test[:,jj])
#             #
            if self.saveInternalVectors:
                self._save_array('STACK_internal_train_layer_'+str(i)+'.bin', Xt_train)
                self._save_array('STACK_internal_test_layer_'+str(i)+'.bin', Xt_test)
#         for i in range(np.shape(Xt_train)[1]): #-1 so we don't apply to entropy!!
#             p= copy.deepcopy(Xt_train[:,i])
#             Xt_train[:,i]= np.log( p ) - np.log(1.0 - p)
        self.top_layer.fit(Xt_train, y)
        return self.top_layer.predict_proba(Xt_test)[:,1]

Now specify the stack

## Now score the actual data

In [6]:
# Read data
# Paths are handed straight to np.load so that NumPy opens and closes the
# files itself instead of leaving an unclosed open(...) handle behind.

# train
X_train = np.load('../wp013/full_train_matrix.bin')
y_train = np.load('../wp013/full_train_labels.bin')

# test
X_test = np.load('../wp013/blind_test_matrix.bin')
y_test = np.load('../wp013/blind_test_labels.bin')

In [7]:
# Only the 'id' column of the test CSV is read.
test_ids = pd.read_csv(
    '../data/test.csv',
    usecols=['id'],
)

In [8]:
# This is the aggregation over all GP features (train side).  The path is
# passed directly so np.load manages the file handle itself.
zz_train = np.load('train.bin')
# Test-side values come from the 'target' column of gpari.csv.
zz_test = pd.read_csv('gpari.csv').loc[:, 'target'].values

In [9]:
# import sklearn.model_selection
# X_tr,X_te, zz_tr, zz_te, y_tr, y_te= sklearn.model_selection.train_test_split( X_train, zz_train, y_train, 
#                                                                               test_size=0.3,random_state=1)

In [10]:
# X_tr2= np.zeros( (np.shape(X_tr)[0],np.shape(X_tr)[1]+1), dtype= float )
# X_tr2[:,0:-1]= X_tr
# X_tr2[:,-1]= zz_tr
# 
# X_te2= np.zeros( (np.shape(X_te)[0],np.shape(X_te)[1]+1), dtype= float )
# X_te2[:,0:-1]= X_te
# X_te2[:,-1]= zz_te

In [25]:
import lightgbm.sklearn
import xgboost.sklearn
import sklearn.linear_model
import sklearn.neural_network

# Keyword arguments for the first layer-1 LightGBM classifier (lgbm1).
lgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 1300,
    'max_bin': 10,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 500,
    'num_leaves': 25,
    'n_jobs': 8,
}

# Keyword arguments for the remaining layer-1 LightGBM classifiers
# (lgbm2, lgbm3 and lgbm4 respectively).
lgb_params_2 = dict(
    learning_rate=0.005,
    n_estimators=3700,
    subsample=0.7,
    subsample_freq=2,
    colsample_bytree=0.3,
    num_leaves=16,
    n_jobs=8,
)

lgb_params_3 = dict(
    learning_rate=0.02,
    n_estimators=800,
    max_depth=4,
    n_jobs=8,
)

lgb_params_4 = dict(
    learning_rate=0.05,
    n_estimators=600,
    num_leaves=35,
    min_child_samples=500,
    n_jobs=8,
)


# Keyword arguments for the XGBoost classifiers.  xgb_params feeds xgbm1;
# xgb2_params is only referenced from commented-out code in this notebook.
xgb_params = dict(
    learning_rate=0.07,
    n_estimators=525,
    max_depth=4,
    nthread=8,
    subsample=0.8,
    min_child_weight=0.77,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='auc',
    seed=99,
    silent=True,
    scale_pos_weight=1.6,
    reg_alpha=8,
    reg_lambda=1.3,
    gamma=10,
)

xgb2_params = dict(
    learning_rate=0.05,
    n_estimators=900,
    max_depth=2,
    nthread=8,
    subsample=0.6,
    min_child_weight=0.77,
    colsample_bytree=0.6,
    objective='binary:logistic',
    eval_metric='auc',
    seed=99,
    silent=True,
    scale_pos_weight=1.0,
    reg_alpha=0.0,
    reg_lambda=3.0,
    gamma=10,
)

# lgb_stumps_params = {
#     'learning_rate': 0.02,
#     'n_estimators': 12800,
#     'max_depth': 1,
#     'n_jobs':8
# }

# Layer 1: four LightGBM classifiers (one per parameter set) and one XGBoost
# classifier.  A generator is unpacked so no loop variable is left behind in
# the notebook namespace.
lgbm1, lgbm2, lgbm3, lgbm4 = (
    lightgbm.sklearn.LGBMClassifier(**params)
    for params in (lgb_params, lgb_params_2, lgb_params_3, lgb_params_4)
)
xgbm1 = xgboost.sklearn.XGBClassifier(**xgb_params)
# xgbm2= xgboost.sklearn.XGBClassifier(**xgb2_params)

# lgbs= lightgbm.sklearn.LGBMClassifier(**lgb_stumps_params)


# import sklearn.neural_network
# mlp_params= {
#     'hidden_layer_sizes':(20,8), 
#     'activation':'relu', 
#     'solver':'adam', 
#     'alpha':0.1,
#     'batch_size':'auto', 
#     'learning_rate_init':0.02, 
#     'shuffle':True, 
#     'tol':0.0001, 
#     'early_stopping': True,
#     'validation_fraction':0.1, 
#     'beta_1':0.9, 
#     'beta_2':0.999, 
#     'epsilon':1e-08
# }

# Keyword arguments for the layer-2 LightGBM classifier (depth-1 trees).
lgb_params_layer2 = dict(
    learning_rate=0.02,
    n_estimators=400,
    max_depth=1,
    n_jobs=8,
)
# lgb_params_layer2 = {
#     'learning_rate': 0.002,
#     'n_estimators': 3300,
#     'max_depth': 1,
#     'n_jobs':8
# }

# Layer 2
# mlp= sklearn.neural_network.MLPClassifier(**mlp_params)
lgb = lightgbm.sklearn.LGBMClassifier(**lgb_params_layer2)
lr = sklearn.linear_model.LogisticRegression(
    penalty='l1',
    C=500.0,
    class_weight='balanced',
)

# Top layer
stacker = sklearn.linear_model.LogisticRegression(
    penalty='l2',
    C=0.0013,
    class_weight='balanced',
)

# Define the stack: 3 folds, two hidden layers, logistic-regression top layer.
stack = Stack(
    k_folds=3,
    hidden_layers=[
        [lgbm1, lgbm2, lgbm3, lgbm4, xgbm1],
        [lgb, lr],
    ],
    top_layer=stacker,
)
# stack = Stack(3,[ [lgbm1,lgbm2,lgbm3,lgbm4,xgbm1, lgbs], [lgb, lr] ], stacker, saveInternalVectors=True)
# stack = Stack(3,[ [lgbm3,lgbm4] ], stacker)
# stack = Stack(10,[ [lgbm1,lgbm2,lgbm3,lgbm4,xgbm1], [lgb,lr] ], stacker) #, saveInternalVectors=False)      

In [26]:
# Fit and predict, timing the whole stack with wall-clock seconds.
from datetime import datetime
tc = datetime.now()
# y_pred = stack.fit_predict(X_tr, y_tr, X_te)#,
#                            external_base_scores= (zz_tr, zz_te)
#                           ) 
y_pred = stack.fit_predict(X_train, y_train, X_test) #,
#                            external_base_scores= (zz_train, zz_test)
#                           ) 
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(' Training took '+str( (datetime.now() - tc).total_seconds()))

Fitting stack layer 1
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit XGBClassifier fold 1
Fit XGBClassifier fold 2
Fit XGBClassifier fold 3
Fit XGBClassifier fold 1
Fit XGBClassifier fold 2
Fit XGBClassifier fold 3
Fitting stack layer 2
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LogisticRegression fold 1
Fit LogisticRegression fold 2
Fit LogisticRegression fold 3
 Training took 761.510992


In [27]:
# sklearn.metrics is imported explicitly rather than relying on another
# sklearn submodule having loaded it as a side effect.
import sklearn.metrics

# The reported score is 2 * ROC-AUC - 1.  The message is built as one string
# so the print call produces the same text on Python 2 and 3.
print('Gini score=  ' + str(2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.))

Gini score=  0.288450117185


In [14]:
# Hand-recorded reference scores; %-formatting keeps the printed text the
# same under both Python 2 and Python 3 print semantics.
print('Best local score=  %s %s' % (0.289081111974, 0.289504671874))
print('Baseline %s' % 0.288711072302)

Best local score=  0.289081111974 0.289504671874
Baseline 0.288711072302


In [15]:
# out_df= pd.DataFrame( {'id': test_ids.id.values, 'target': y_pred},
#                      columns=['id','target']
#                     ).to_csv('submission_wp013b.csv',index=False)

In [73]:
# Experiment: fit a logistic regression directly on saved layer-1 outputs.
# The STACK_internal_*_layer_1.bin files are the ones Stack.fit_predict writes
# when saveInternalVectors=True, so such a run must have happened beforehand.
import sklearn.metrics

lgb_params_layer2 = {
    'learning_rate': 0.02,
    'n_estimators': 50,
    'max_depth': 3,
    'n_jobs':8
}
lgb = lightgbm.sklearn.LGBMClassifier(**lgb_params_layer2)
lr = sklearn.linear_model.LogisticRegression(C=500.0,class_weight='balanced',penalty='l1')
# stacker= sklearn.linear_model.LogisticRegression(C=0.0013,class_weight='balanced',penalty='l2')
# stack = Stack(3,[ [ lr] ], stacker)

# NOTE(review): this rebinds X_train / X_test from the raw feature matrices to
# the selected layer-1 score columns; re-running earlier cells afterwards will
# see these arrays unless the data-loading cell is executed again.
# Column 1 is left out; which base model that is depends on the stack that
# produced the saved files -- confirm before reuse.
# Paths go straight to np.load so NumPy opens and closes the files itself.
X_train = np.load('STACK_internal_train_layer_1.bin')[:,[0,2,3,4]]
X_test = np.load('STACK_internal_test_layer_1.bin')[:,[0,2,3,4]]
# y_pred = stack.fit_predict(X_train, y_train, X_test)
lr.fit(X_train,y_train)
y_pred = lr.predict_proba(X_test)[:,1]
# Single-string print gives identical output on Python 2 and 3.
print('Gini score=  ' + str(2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.))

Gini score=  0.289662705273


In [65]:
# NOTE(review): bare literal echoed as the cell output; presumably a score
# noted down from an earlier run -- confirm which configuration produced it.
0.289612152404

0.289612152404

In [49]:
# Hand-recorded value; parenthesised print works on both Python 2 and 3.
print(0.289576957232)

0.289576957232


Hmmmm

In [16]:
# Xtr= np.load('STACK_internal_train_layer_3.bin')
# Xte= np.load('STACK_internal_test_layer_3.bin')

In [17]:
# lrc= sklearn.linear_model.LogisticRegression(C=0.0013,class_weight={0:1.0,1:15.0})
# lrc.fit(Xtr,y_tr)
# y_pred= lrc.predict_proba(Xte)[:,1]
# print 'Gini score= ',2.*sklearn.metrics.roc_auc_score(y_te,y_pred)-1.

In [18]:
# np.shape(X_te)

In [19]:
# upscore= np.load(open('../wp013/upscore_ids.bin','rb'))
# out_df= pd.read_csv('submission_wp013b.csv')
# out_df.loc[ map(lambda x: x in upscore,out_df.loc[:,'id']), 'target'] += 0.1
# out_df.to_csv('submission_wp013b_upscore.csv',index=False)

In [20]:
# xgb2_params= {'learning_rate': 0.05,
#              'n_estimators':2000,
#              'max_depth': 2, 
#              'nthread':8,
#              'subsample': 0.6,
#              'min_child_weight': 0.77,
#              'colsample_bytree': 0.6, 
#              'objective': 'binary:logistic', 
#              'eval_metric': 'auc', 
#              'seed': 99, 
#              'silent': True,
#              'scale_pos_weight': 1.,
#              'reg_alpha':0.,
#              'reg_lambda':3.,
#              'gamma':10
#             }
# xgbm2= xgboost.sklearn.XGBClassifier(**xgb2_params)
# xgbm2.fit(X_train,y_train,eval_metric='logloss',early_stopping_rounds=50,eval_set=[(X_test,y_test)])

In [21]:
# print 0.151308,581,0.151191,902