In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
class blending_layer1(object):
    
    def __init__(self, train_x, train_y, val_x, val_y, test_x):
        self.train_x = train_x
        self.train_y = train_y
        self.val_x = val_x
        self.val_y = val_y
        self.test_x = test_x
        
    def shuffle_data(self, model=None):
        ss = ShuffleSplit(n_splits=4, random_state=0, test_size=.25)
        for n_fold, (trn_idx, val_idx) in enumerate(ss.split(self.train_x)):
            if n_fold == 0:
                rf_train_x = self.train_x[val_idx]
                rf_train_y = self.train_y[val_idx]
            elif n_fold == 1:
                et_train_x = self.train_x[val_idx]
                et_train_y = self.train_y[val_idx]
            elif n_fold == 2:
                gbdt_train_x = self.train_x[val_idx]
                gbdt_train_y = self.train_y[val_idx]
            else:
                xgb_train_x = self.train_x[val_idx]
                xgb_train_y = self.train_y[val_idx]
        if model == 'rf':
            return rf_train_x, rf_train_y
        elif model == 'et':
            return et_train_x, et_train_y
        elif model == 'gbdt':
            return gbdt_train_x, gbdt_train_y
        elif model == 'xgb':
            return xgb_train_x, xgb_train_y
        else:
            return ss
        
    def rf_model(self, parameters=None):
        rf_train_x, rf_train_y = self.shuffle_data('rf')
        kf = KFold(n_splits=5, shuffle=True, random_state=0)
        rf_val_pred = np.zeros((rf_train_x.shape[0],))
        rf_off_val_pred = np.zeros((self.val_x.shape[0],))
        rf_test_pred = np.zeros((self.test_x.shape[0],))
        for n_fold, (trn_idx, val_idx) in enumerate(kf.split(rf_train_x)):
            rf = RandomForestClassifier(**parameters)
            rf.fit(rf_train_x[trn_idx], rf_train_y[trn_idx])
            rf_val_pred[val_idx] = rf.predict_proba(rf_train_x[val_idx])[:, 1]
            rf_off_val_pred += rf.predict_proba(self.val_x)[:, 1] / kf.n_splits
            rf_test_pred += rf.predict_proba(self.test_x)[:, 1] / kf.n_splits
            print('Fold %d auc score: %.6f'%(n_fold+1, roc_auc_score(rf_train_y[val_idx], rf_val_pred[val_idx])))
        print('Validate auc score:', roc_auc_score(rf_train_y, rf_val_pred))
        print('Off auc score:', roc_auc_score(self.val_y, rf_off_val_pred))
        return rf_val_pred, rf_off_val_pred, rf_test_pred
    
    def et_model(self, parameters=None):
        et_train_x, et_train_y = self.shuffle_data('et')
        kf = KFold(n_splits=5, shuffle=True, random_state=0)
        et_val_pred = np.zeros((et_train_x.shape[0],))
        et_off_val_pred = np.zeros((self.val_x.shape[0],))
        et_test_pred = np.zeros((self.test_x.shape[0],))
        for n_fold, (trn_idx, val_idx) in enumerate(kf.split(et_train_x)):
            et = ExtraTreesClassifier(**parameters)
            et.fit(et_train_x[trn_idx], et_train_y[trn_idx])
            et_val_pred[val_idx] = et.predict_proba(et_train_x[val_idx])[:, 1]
            et_off_val_pred += et.predict_proba(self.val_x)[:, 1] / kf.n_splits
            et_test_pred += et.predict_proba(self.test_x)[:, 1] / kf.n_splits
            print('Fold %d auc score: %.6f'%(n_fold+1, roc_auc_score(et_train_y[val_idx], et_val_pred[val_idx])))
        print('Validate auc score:', roc_auc_score(et_train_y, et_val_pred))
        print('Off auc score:', roc_auc_score(self.val_y, et_off_val_pred))
        return et_val_pred, et_off_val_pred, et_test_pred
    
    def gbdt_model(self, parameters=None):
        gbdt_train_x, gbdt_train_y = self.shuffle_data('gbdt')
        kf = KFold(n_splits=5, shuffle=True, random_state=0)
        gbdt_val_pred = np.zeros((gbdt_train_x.shape[0],))
        gbdt_off_val_pred = np.zeros((self.val_x.shape[0],))
        gbdt_test_pred = np.zeros((self.test_x.shape[0],))
        for n_fold, (trn_idx, val_idx) in enumerate(kf.split(gbdt_train_x)):
            gbdt = GradientBoostingClassifier(**parameters)
            gbdt.fit(gbdt_train_x[trn_idx], gbdt_train_y[trn_idx])
            gbdt_val_pred[val_idx] = gbdt.predict_proba(gbdt_train_x[val_idx])[:, 1]
            gbdt_off_val_pred += gbdt.predict_proba(self.val_x)[:, 1] / kf.n_splits
            gbdt_test_pred += gbdt.predict_proba(self.test_x)[:, 1] / kf.n_splits
            print('Fold %d auc score: %.6f'%(n_fold+1, roc_auc_score(gbdt_train_y[val_idx], gbdt_val_pred[val_idx])))
        print('Validate auc score:', roc_auc_score(gbdt_train_y, gbdt_val_pred))
        print('Off auc score:', roc_auc_score(self.val_y, gbdt_off_val_pred))
        return gbdt_val_pred, gbdt_off_val_pred, gbdt_test_pred
    
    def xgb_model(self, parameters=None):
        xgb_train_x, xgb_train_y = self.shuffle_data('xgb')
        kf = KFold(n_splits=5, shuffle=True, random_state=0)
        xgb_val_pred = np.zeros((xgb_train_x.shape[0],))
        xgb_off_val_pred = np.zeros((self.val_x.shape[0],))
        xgb_test_pred = np.zeros((self.test_x.shape[0],))
        for n_fold, (trn_idx, val_idx) in enumerate(kf.split(xgb_train_x)):
            xgb = XGBClassifier(**parameters)
            xgb.fit(xgb_train_x[trn_idx], xgb_train_y[trn_idx])
            xgb_val_pred[val_idx] = xgb.predict_proba(xgb_train_x[val_idx])[:, 1]
            xgb_off_val_pred += xgb.predict_proba(self.val_x)[:, 1] / kf.n_splits
            xgb_test_pred += xgb.predict_proba(self.test_x)[:, 1] / kf.n_splits
            print('Fold %d auc score: %.6f'%(n_fold+1, roc_auc_score(xgb_train_y[val_idx], xgb_val_pred[val_idx])))
        print('Validate auc score:', roc_auc_score(xgb_train_y, xgb_val_pred))
        print('Off auc score:', roc_auc_score(self.val_y, xgb_off_val_pred))
        return xgb_val_pred, xgb_off_val_pred, xgb_test_pred
    
    def merge_data(self):
        rf_val_pred, rf_off_val_pred, rf_test_pred = self.rf_model(parameters={"n_jobs": -1})
        et_val_pred, et_off_val_pred, et_test_pred = self.et_model(parameters={"n_jobs": -1})
        gbdt_val_pred, gbdt_off_val_pred, gbdt_test_pred = self.gbdt_model(parameters={"learning_rate": 0.3})
        xgb_val_pred, xgb_off_val_pred, xgb_test_pred = self.xgb_model(parameters={"n_jobs": -1})
        val_pred = np.zeros((self.train_x.shape[0],))
        ss = self.shuffle_data()
        for n_fold, (trn_idx, val_idx) in enumerate(ss.split(self.train_x)):
            if n_fold == 0:
                val_pred[val_idx] = rf_val_pred
            elif n_fold == 1:
                val_pred[val_idx] = et_val_pred
            elif n_fold == 2:
                val_pred[val_idx] = gbdt_val_pred
            else:
                val_pred[val_idx] = xgb_val_pred
        off_val_pred = (rf_off_val_pred + et_off_val_pred + gbdt_off_val_pred + xgb_off_val_pred) / 4
        test_pred = (rf_test_pred + et_test_pred + gbdt_test_pred + xgb_test_pred) / 4
        return val_pred, off_val_pred, test_pred
                
train_x, train_y, val_x, val_y, test_x = np.random.random((7000,10)), np.random.randint(0,2,size=(7000,)), np.random.random((3000,10)), np.random.randint(0,2,size=(3000,)), np.random.random((3000,10))
bld_layer = blending_layer1(train_x, train_y, val_x, val_y, test_x)
val_pred, off_val_pred, test_pred = bld_layer.merge_data()

Fold 1 auc score: 0.555786
Fold 2 auc score: 0.496216
Fold 3 auc score: 0.484502
Fold 4 auc score: 0.531258
Fold 5 auc score: 0.499029
Validate auc score: 0.5123199514860659
Off auc score: 0.5079298643992298
Fold 1 auc score: 0.471877
Fold 2 auc score: 0.514054
Fold 3 auc score: 0.509929
Fold 4 auc score: 0.514874
Fold 5 auc score: 0.491361
Validate auc score: 0.4993188535842146
Off auc score: 0.49754564517613253
Fold 1 auc score: 0.496682
Fold 2 auc score: 0.508614
Fold 3 auc score: 0.533783
Fold 4 auc score: 0.522591
Fold 5 auc score: 0.485253
Validate auc score: 0.5081690126127256
Off auc score: 0.4956298614207518
Fold 1 auc score: 0.511242
Fold 2 auc score: 0.477768
Fold 3 auc score: 0.496770
Fold 4 auc score: 0.573182
Fold 5 auc score: 0.443396
Validate auc score: 0.4975828183515095
Off auc score: 0.5073019389892113


In [35]:
print(val_pred.shape, off_val_pred.shape, test_pred.shape)

(7000,) (3000,) (3000,)


In [36]:
train_x = pd.read_csv('../data/train_.csv')
test_x = pd.read_csv('../data/test_.csv')
train_y = pd.read_csv('../data/y_.csv')
val_x = pd.read_csv('../data/val_x_.csv')
val_y = pd.read_csv('../data/val_y_.csv')

In [37]:
val_x = val_x[train_x.columns]
test_x = test_x[train_x.columns]

In [38]:
train_x, train_y, val_x, val_y, test_x = train_x.values, train_y.values, val_x.values, val_y.values, test_x.values

In [39]:
bld_layer = blending_layer1(train_x, train_y, val_x, val_y, test_x)
val_pred, off_val_pred, test_pred = bld_layer.merge_data()



Fold 1 auc score: 0.652681




Fold 2 auc score: 0.628044




Fold 3 auc score: 0.652528




Fold 4 auc score: 0.634571




Fold 5 auc score: 0.642686
Validate auc score: 0.6422080656789375
Off auc score: 0.7042383136520015




Fold 1 auc score: 0.626869




Fold 2 auc score: 0.614348




Fold 3 auc score: 0.626784




Fold 4 auc score: 0.639442




Fold 5 auc score: 0.625295
Validate auc score: 0.6262574263034375
Off auc score: 0.6935119859877666


  y = column_or_1d(y, warn=True)


Fold 1 auc score: 0.746407
Fold 2 auc score: 0.763997
Fold 3 auc score: 0.759598
Fold 4 auc score: 0.755206
Fold 5 auc score: 0.770662
Validate auc score: 0.7590887182386873
Off auc score: 0.7547450717623077


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fold 1 auc score: 0.769340


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fold 2 auc score: 0.764185


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fold 3 auc score: 0.762763


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fold 4 auc score: 0.748868


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fold 5 auc score: 0.761270
Validate auc score: 0.761046324383174
Off auc score: 0.7526743359143175


In [40]:
print(val_pred.shape, off_val_pred.shape, test_pred.shape)

(276759,) (30752,) (48744,)


In [41]:
val_pred, off_val_pred, test_pred = pd.DataFrame(val_pred), pd.DataFrame(off_val_pred), pd.DataFrame(test_pred)

In [42]:
val_pred.head()

Unnamed: 0,0
0,0.0
1,0.025713
2,0.0
3,0.246714
4,0.07504


In [43]:
val_pred.to_csv('../data/val_pred.csv', index=False)
off_val_pred.to_csv('../data/off_val_pred.csv', index=False)
test_pred.to_csv('../data/test_pred.csv',index=False)

In [26]:
#print(train_x.shape, train_y.shape, val_x.shape, val_y.shape,test_x.shape)

(508761, 388) (508761, 1) (30752, 388) (30752, 1) (48744, 388)


In [18]:
#train_x.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,inst_AMT_PAYMENT,EXT_SOURCE_1over2_NAminus1_Add0.1,EXT_SOURCE_2over1_NAminus1_Add0.1,EXT_SOURCE_1over3_NAminus1_Add0.1,EXT_SOURCE_3over1_NAminus1_Add0.1,EXT_SOURCE_2over3_NAminus1_Add0.1,EXT_SOURCE_3over2_NAminus1_Add0.1,EXT_SOURCE_1_log,EXT_SOURCE_2_log,EXT_SOURCE_3_log
0,0.0,1.0,0.0,0.0,0.0,112500.0,495000.0,21933.0,495000.0,0.0,...,37261.11,1.194,0.837,1.369,0.7305,1.146,0.8726,0.5225,0.4434,0.388
1,0.0,1.0,1.0,0.0,0.0,213750.0,3060000.0,77485.5,3060000.0,0.0,...,4135.241,0.736,1.358,1.291,0.7744,1.755,0.57,0.4165,0.5527,0.3203
2,0.0,1.0,0.0,0.0,0.0,450000.0,1417495.5,136687.5,1363500.0,0.0,...,33406.465,1.487,0.6724,1.409,0.71,0.9473,1.056,0.6045,0.4224,0.4448
3,0.0,0.0,0.0,0.0,0.0,112500.0,467257.5,21910.5,328500.0,0.0,...,5104.3423,-1.522,-0.6567,-1.567,-0.6377,1.029,0.971,-1.0,0.3994,0.388
4,0.0,0.0,0.0,1.0,0.0,157500.0,848745.0,36090.0,675000.0,0.0,...,9121.096,-1.067,-0.9365,-1.82,-0.5493,1.705,0.5864,-1.0,0.5557,0.3325


In [19]:
#test_x.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,inst_AMT_PAYMENT,EXT_SOURCE_1over2_NAminus1_Add0.1,EXT_SOURCE_2over1_NAminus1_Add0.1,EXT_SOURCE_1over3_NAminus1_Add0.1,EXT_SOURCE_3over1_NAminus1_Add0.1,EXT_SOURCE_2over3_NAminus1_Add0.1,EXT_SOURCE_3over2_NAminus1_Add0.1,EXT_SOURCE_1_log,EXT_SOURCE_2_log,EXT_SOURCE_3_log
0,0,1,0,0,0,135000.0,568800.0,20560.5,450000.0,0,...,5885.1323,0.9585,1.043,3.285,0.3044,3.428,0.2917,0.561,0.582,0.148
1,0,0,0,0,0,99000.0,222768.0,17370.0,180000.0,0,...,6240.205,1.698,0.589,1.248,0.8013,0.735,1.36,0.448,0.2559,0.3599
2,0,0,1,0,0,202500.0,663264.0,69777.0,630000.0,-1,...,9740.235,-1.125,-0.8887,-1.266,-0.79,1.125,0.889,-1.0,0.5303,0.4768
3,0,1,0,0,2,315000.0,1575000.0,49018.5,1575000.0,0,...,4356.7314,1.026,0.974,0.878,1.139,0.8555,1.169,0.4224,0.4119,0.478
4,0,0,1,1,1,180000.0,625500.0,32067.0,625500.0,0,...,11100.338,0.5747,1.74,-0.3357,-2.979,-0.584,-1.712,0.1841,0.3547,-1.0


In [20]:
#val_x.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,inst_AMT_PAYMENT,EXT_SOURCE_1over2_NAminus1_Add0.1,EXT_SOURCE_2over1_NAminus1_Add0.1,EXT_SOURCE_1over3_NAminus1_Add0.1,EXT_SOURCE_3over1_NAminus1_Add0.1,EXT_SOURCE_2over3_NAminus1_Add0.1,EXT_SOURCE_3over2_NAminus1_Add0.1,EXT_SOURCE_1_log,EXT_SOURCE_2_log,EXT_SOURCE_3_log
0,0,1,0,0,0,135000.0,568800.0,20560.5,450000.0,0,...,5885.1323,0.9585,1.043,3.285,0.3044,3.428,0.2917,0.561,0.582,0.148
1,0,0,0,0,0,99000.0,222768.0,17370.0,180000.0,0,...,6240.205,1.698,0.589,1.248,0.8013,0.735,1.36,0.448,0.2559,0.3599
2,0,0,1,0,0,202500.0,663264.0,69777.0,630000.0,-1,...,9740.235,-1.125,-0.8887,-1.266,-0.79,1.125,0.889,-1.0,0.5303,0.4768
3,0,1,0,0,2,315000.0,1575000.0,49018.5,1575000.0,0,...,4356.7314,1.026,0.974,0.878,1.139,0.8555,1.169,0.4224,0.4119,0.478
4,0,0,1,1,1,180000.0,625500.0,32067.0,625500.0,0,...,11100.338,0.5747,1.74,-0.3357,-2.979,-0.584,-1.712,0.1841,0.3547,-1.0
