In [1]:
import time

import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import xgboost

In [2]:
# Level-1 prediction table; the TARGET column holds the labels.
trainDataFrameLevel1 = pd.read_csv('./data/trainLevel1Preds.csv')
# Everything except TARGET is used as a feature.
trainFeaturesLevel1 = trainDataFrameLevel1.drop(['TARGET'], axis=1)
trainLabels = trainDataFrameLevel1['TARGET']

In [4]:
# Sanity check: splitting off the labels must keep every row, and labels and
# features must still share the same index.
for frame in (trainDataFrameLevel1, trainLabels, trainFeaturesLevel1):
    print(frame.shape)
np.array_equal(trainLabels.index, trainFeaturesLevel1.index)

(76020, 189)
(76020,)
(76020, 188)


True

## To include original features:

In [5]:
# Load the original feature table.
trainDataFrameOrig = pd.read_csv('./data/train.csv')

# Drop constant columns: a zero standard deviation means every row holds the
# same value.
colsToRemove1 = [col for col in trainDataFrameOrig.columns
                 if trainDataFrameOrig[col].std() == 0]
trainDataFrameOrig.drop(colsToRemove1, axis=1, inplace=True)

# Drop duplicate columns: any column whose values exactly repeat an earlier
# column is flagged (a column may be flagged more than once).
colsToRemove2 = []
columns = trainDataFrameOrig.columns
for i, earlierCol in enumerate(columns[:-1]):
    v = trainDataFrameOrig[earlierCol].values
    for laterCol in columns[i + 1:]:
        if np.array_equal(v, trainDataFrameOrig[laterCol].values):
            colsToRemove2.append(laterCol)
trainDataFrameOrig.drop(colsToRemove2, axis=1, inplace=True)

# ID is dropped as well; it is not used as a feature.
trainDataFrameOrig.drop(['ID'], axis=1, inplace=True)

# Feature matrix = cleaned frame without the label column.
trainFeaturesOriginal = trainDataFrameOrig.drop(['TARGET'], axis=1)

In [6]:
# The feature frame should be exactly one column (TARGET) narrower.
for frame in (trainDataFrameOrig, trainFeaturesOriginal):
    print(frame.shape)

(76020, 307)
(76020, 306)


## Helper to get train/validation data:

In [10]:
# Place level-1 predictions, original features and the label side by side;
# a column-wise concat aligns rows on the index.
framesToJoin = [trainFeaturesLevel1, trainFeaturesOriginal, trainLabels]
trainDataFrame = pd.concat(framesToJoin, axis=1)

In [11]:
# Expect 188 level-1 + 306 original feature columns + TARGET.
print(trainDataFrame.shape)


(76020, 495)


In [13]:
# Preview the first five rows of the combined frame.
trainDataFrame.head(n=5)

Unnamed: 0,gbc_0,gbc_1,gbc_2,gbc_3,gbc_4,gbc_5,gbc_6,gbc_7,gbc_8,gbc_9,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,0.342172,0.414435,0.323783,0.44965,0.31551,0.371713,0.344857,0.363904,0.328568,0.3931,...,0,0,0,0,0,0,0,0,39205.17,0
1,0.212977,0.179875,0.16836,0.277437,0.185459,0.194718,0.210133,0.172217,0.229657,0.183873,...,0,0,0,0,0,0,0,0,49278.03,0
2,0.060077,0.056295,0.058152,0.060049,0.052826,0.065247,0.058802,0.062107,0.05423,0.049104,...,0,0,0,0,0,0,0,0,67333.77,0
3,0.35101,0.46445,0.381325,0.449268,0.410513,0.451058,0.449003,0.489866,0.399165,0.416468,...,0,0,0,0,0,0,0,0,64007.97,0
4,0.279508,0.122038,0.199128,0.21148,0.158174,0.204654,0.17854,0.141302,0.146963,0.243928,...,0,0,0,0,0,0,0,0,117310.979016,0


In [12]:
# Split the combined frame by class so each class can be sampled separately.
targetColumn = trainDataFrame['TARGET']
dataTarget0 = trainDataFrame[targetColumn == 0]
dataTarget1 = trainDataFrame[targetColumn == 1]
# Fix the global NumPy seed so the shuffles below are repeatable.
np.random.seed(1)

In [14]:
def getBalancedTrainAndValidationSets(nTrain0=36500, nTrain1=1500, data0=None, data1=None):
    """Draw a fresh random train/validation split, sampled separately per class.

    Each class frame is shuffled with ``np.random.permutation``, so the result
    depends on (and advances) the global NumPy random state. The first
    ``nTrain0`` / ``nTrain1`` shuffled rows of each class form the training
    set; all remaining rows form the validation set.

    Note: despite the name, the default sizes (36500 vs 1500) do not give
    equal class counts; they fix how many rows of each class go to training.

    Parameters
    ----------
    nTrain0 : int
        Number of class-0 rows placed in the training set (default 36500).
    nTrain1 : int
        Number of class-1 rows placed in the training set (default 1500).
    data0, data1 : pandas.DataFrame or None
        Per-class frames that include a 'TARGET' column. When None, the
        notebook-level ``dataTarget0`` / ``dataTarget1`` frames are used.

    Returns
    -------
    X_train, y_train, X_val, y_val
        Feature frames (without 'TARGET') and the matching 'TARGET' series.
    """
    if data0 is None:
        data0 = dataTarget0
    if data1 is None:
        data1 = dataTarget1

    # Shuffle each class independently (class 0 first, so the sequence of
    # random draws matches the original implementation).
    dt0 = data0.reindex(np.random.permutation(data0.index))
    dt1 = data1.reindex(np.random.permutation(data1.index))

    # Positional slicing: the head of each shuffled class goes to training,
    # the tail to validation.
    trn = pd.concat([dt0.iloc[:nTrain0], dt1.iloc[:nTrain1]])
    val = pd.concat([dt0.iloc[nTrain0:], dt1.iloc[nTrain1:]])

    y_train = trn['TARGET']
    X_train = trn.drop(['TARGET'], axis=1)
    y_val = val['TARGET']
    X_val = val.drop(['TARGET'], axis=1)

    return X_train, y_train, X_val, y_val

In [None]:
# Repeated random-split validation: every iteration draws a fresh shuffled
# train/validation split via getBalancedTrainAndValidationSets(), refits the
# classifier and records train/validation AUC.
n_folds = 20
clf = xgboost.XGBClassifier(learning_rate=0.05, n_estimators=125)
trn_scores = []
val_scores = []
start = time.time()
for i in range(n_folds):
    X_train, y_train, X_val, y_val = getBalancedTrainAndValidationSets()
    clf.fit(X_train, y_train)
    # AUC is computed on the positive-class probabilities from
    # .predict_proba(), not on the hard 0/1 labels from .predict().
    trn_score = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])
    val_score = roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1])
    # Report a 1-based counter so the last line reads (n_folds/n_folds).
    print("(%d/%d) train auc: %.4f" % (i + 1, n_folds, trn_score))
    print("(%d/%d) validation auc: %.4f" % (i + 1, n_folds, val_score))
    trn_scores.append(trn_score)
    val_scores.append(val_score)

print("-----------------------")
# An estimator *instance* has no __name__ attribute; the class does.
print("'%s' performance on %d fold CV:" % (clf.__class__.__name__, n_folds))
print("Train Mean: %.4f" % np.mean(trn_scores))
print("Train Std: %.4f" % np.std(trn_scores))
print("Validation Mean: %.4f" % np.mean(val_scores))
print("Validation Std: %.4f" % np.std(val_scores))
print("Total Time (mins): %.1f" % ((time.time() - start) / 60.))

### To use `eval_metric='auc'`, train through the native Booster API (`xgboost.train`)

In [None]:
# Original-features-only variant of the cleaning step.
# NOTE: this rebinds trainDataFrame, which an earlier cell set to the combined
# level-1 + original frame.
trainDataFrame = pd.read_csv('./data/train.csv')

# Constant columns (zero standard deviation) are removed first.
colsToRemove1 = [col for col in trainDataFrame.columns
                 if trainDataFrame[col].std() == 0]
trainDataFrame.drop(colsToRemove1, axis=1, inplace=True)

# Then every column that exactly repeats an earlier column is removed.
columns = trainDataFrame.columns
colsToRemove2 = []
for i in range(len(columns) - 1):
    v = trainDataFrame[columns[i]].values
    colsToRemove2.extend(
        columns[j]
        for j in range(i + 1, len(columns))
        if np.array_equal(v, trainDataFrame[columns[j]].values)
    )
trainDataFrame.drop(colsToRemove2, axis=1, inplace=True)

# Finally the ID column is dropped.
trainDataFrame.drop(['ID'], axis=1, inplace=True)

In [21]:
# Training split size: rows x feature columns (TARGET excluded).
print(X_train.shape)

(38000, 494)


In [15]:
# Booster parameters for the native xgboost.train API.
# The tree parameters use their documented names ('max_depth', 'eta'); the
# 'bst:'-prefixed spellings from old tutorials are not documented parameter
# names in current XGBoost, which would leave depth and learning rate at their
# defaults.
param = {
    'max_depth': 5,
    'eta': 0.0202048,
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    # Report AUC for every entry of the evaluation list after each round.
    'eval_metric': 'auc',
}
num_round = 500

# One fresh train/validation split, wrapped in DMatrix objects with labels.
X_train, y_train, X_val, y_val = getBalancedTrainAndValidationSets()
dtrain = xgboost.DMatrix(X_train, label=y_train)
dtest = xgboost.DMatrix(X_val, label=y_val)
# Datasets scored after every boosting round, with the names shown in the log.
evallist = [(dtest, 'eval'), (dtrain, 'train')]
clf = xgboost.train(param, dtrain, num_round, evallist)


[0]	eval-auc:0.841237	train-auc:0.852656
[1]	eval-auc:0.842967	train-auc:0.855461
[2]	eval-auc:0.842924	train-auc:0.855375
[3]	eval-auc:0.851160	train-auc:0.864808
[4]	eval-auc:0.851122	train-auc:0.865404
[5]	eval-auc:0.852709	train-auc:0.867875
[6]	eval-auc:0.853114	train-auc:0.869246
[7]	eval-auc:0.853211	train-auc:0.869591
[8]	eval-auc:0.853147	train-auc:0.869583
[9]	eval-auc:0.853430	train-auc:0.870455
[10]	eval-auc:0.853481	train-auc:0.870554
[11]	eval-auc:0.853551	train-auc:0.870302
[12]	eval-auc:0.854345	train-auc:0.870961
[13]	eval-auc:0.854545	train-auc:0.871167
[14]	eval-auc:0.854479	train-auc:0.871028
[15]	eval-auc:0.854493	train-auc:0.871293
[16]	eval-auc:0.854562	train-auc:0.871275
[17]	eval-auc:0.854543	train-auc:0.871206
[18]	eval-auc:0.854457	train-auc:0.871239
[19]	eval-auc:0.854530	train-auc:0.871333
[20]	eval-auc:0.854510	train-auc:0.871421
[21]	eval-auc:0.854698	train-auc:0.871963
[22]	eval-auc:0.854687	train-auc:0.871985
[23]	eval-auc:0.854998	train-auc:0.872452
[2

In [16]:
# Score the validation features with the trained booster; with the
# 'binary:logistic' objective the outputs are probabilities.
dval = xgboost.DMatrix(X_val)
preds = clf.predict(dval)

In [17]:
# Validation AUC of the booster's predicted scores.
roc_auc_score(y_true=y_val, y_score=preds)

0.86075685374758526

In [None]:
# Number of validation predictions.
preds.shape[0]

In [18]:
# How many validation rows would be labelled positive at a 0.5 threshold.
int((preds > .5).sum())

49