In [1]:
import pandas as pd
import numpy as  np
from sklearn.model_selection import train_test_split,StratifiedKFold
from xgboost import XGBClassifier
from sklearn import metrics
from hyperopt import fmin,tpe,hp,STATUS_OK,Trials,space_eval

In [2]:
# Load the pre-processed tables from ../input.
# dfTrain carries the 'isFraud' target column; dfTest is the frame that is
# scored and written out by evaluateModel further down.
dfTrain = pd.read_csv('../input/preprocessedtrainv3/preprocessedTrainV3.csv')
dfTest = pd.read_csv('../input/preprocessedtestv3/preprocessedTestV3.csv')

In [3]:
def _minMaxScale(df, lo=None, hi=None):
    """Scale every column of ``df`` with ``(x - lo) / (hi - lo)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame.
    lo, hi : pandas.Series, optional
        Per-column minimum / maximum to scale with. When omitted they are
        computed from ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of ``df`` with the same columns in the same order.
    """
    # Align externally supplied statistics to df's column order so the
    # arithmetic below never reorders the feature columns.
    lo = df.min() if lo is None else lo.reindex(df.columns)
    hi = df.max() if hi is None else hi.reindex(df.columns)

    # A constant column has hi == lo; divide by 1 there instead of by 0.
    span = (hi - lo).replace(0, 1)

    return (df - lo) / span


def evaluateModel(df,model):
    """Predict 'isFraud' for ``df`` with ``model`` and save it to ./result_xgb.csv.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to score (no target column).
    model : fitted estimator exposing ``predict``
        If it carries ``trainMin_`` / ``trainMax_`` (attached by
        ``xgbClassifier``), those training statistics are used for scaling so
        the features go through the same transform as at fit time. Otherwise
        the frame's own min/max are used.
    """
    # The previous expression `df - df.min()/df.max()-df.min()` was missing
    # parentheses and only shifted each column instead of min-max scaling it.
    scaled = _minMaxScale(
        df,
        getattr(model, 'trainMin_', None),
        getattr(model, 'trainMax_', None),
    )

    y_pred = model.predict(scaled)

    result = pd.DataFrame(y_pred)
    result.columns = ['isFraud']
    result.to_csv("./result_xgb.csv")

    print("Result file saved")


# xgbClassifier originally sat in the following notebook cell (In [4]); it is kept
# next to evaluateModel because both rely on _minMaxScale and on the training
# statistics stored on the returned model.
def xgbClassifier(df,params=None):
    """Fit an XGBClassifier while reporting 7-fold stratified CV scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled data containing the target column 'isFraud'; every other
        column is used as a feature.
    params : dict, optional
        Keyword arguments for ``XGBClassifier``. When None a built-in default
        set is used.

    Returns
    -------
    XGBClassifier
        The estimator as left by the fit on the last fold. The per-column
        training minimum / maximum are attached as ``trainMin_`` /
        ``trainMax_`` so that ``evaluateModel`` can apply the same scaling.
    """
    # Fixed: the key used to be 'n_estimator', which is not an XGBClassifier
    # argument, so the intended 550 boosting rounds were never applied.
    defaultParams = {'n_estimators':550,'eval_metric':'auc','verbosity':1,'n_jobs':-1,'reg_alpha':8.0,'colsample_bytree':0.8,'learning_rate':0.17500000000000002,'gamma':0.6000000000000001,'reg_lambda':1.20000000000000001,'max_depth':10,'min_child_weight':1.0,'scale_pos_weight':60.0,'subsample':0.700000000000001,'objective':'binary:logistic'}

    tmpPara = defaultParams if params is None else params

    X_train = df.drop(['isFraud'],axis=1)
    Y_train = df['isFraud']

    # Remember the training statistics; they are reused at prediction time.
    trainMin = X_train.min()
    trainMax = X_train.max()
    X_train = _minMaxScale(X_train, trainMin, trainMax)

    skf = StratifiedKFold(n_splits=7)
    xgbc = XGBClassifier(**tmpPara)

    # The same estimator object is refit on every fold; after the loop it
    # holds the fit from the final fold.
    for trainIndx,testIndx in skf.split(X_train,Y_train):

        Xtrain = X_train.iloc[trainIndx]
        Ytrain = Y_train.iloc[trainIndx]

        Xtest = X_train.iloc[testIndx]
        Ytest = Y_train.iloc[testIndx]

        xgbc.fit(Xtrain,Ytrain)

        print("Accuracy : ",xgbc.score(Xtest,Ytest))

        # Probability of the positive class (column 1) drives the AUC.
        prob = xgbc.predict_proba(Xtest)[:,1]

        auc = metrics.roc_auc_score(Ytest,prob)
        print("validation-AUC : ",auc,"\n===============================")

    xgbc.trainMin_ = trainMin
    xgbc.trainMax_ = trainMax

    return xgbc

In [5]:
# Hold out a stratified 25% of the labelled data for scoring hyper-parameter
# candidates. The fixed seed keeps the split repeatable between runs.
train,test = train_test_split(
    dfTrain,
    test_size=0.25,
    stratify=dfTrain['isFraud'],
    random_state=42,
)

XTrain = train.drop(['isFraud'],axis=1)
YTrain = train['isFraud']

XTest = test.drop(['isFraud'],axis=1)
YTest = test['isFraud']

# Min-max scale both splits with statistics taken from the training split only,
# so they go through the same transform.
# The previous `X - X.min()/X.max()-X.min()` lacked parentheses (it only shifted
# each column) and scaled the hold-out split with its own min/max.
splitMin = XTrain.min()
splitRange = (XTrain.max() - splitMin).replace(0, 1)  # constant column: divide by 1, not 0

XTest = (XTest - splitMin) / splitRange
XTrain = (XTrain - splitMin) / splitRange


def objectiveFn(paramSpace):
    """Hyperopt objective: train on XTrain/YTrain and score on XTest/YTest.

    ``paramSpace`` is one sampled set of XGBClassifier keyword arguments.
    Returns the dict hyperopt expects, with the negated hold-out AUC as the
    loss (hyperopt minimises, so a lower loss means a higher AUC).
    """
    candidate = XGBClassifier(**paramSpace)
    candidate.fit(XTrain, YTrain)

    # Column 1 of predict_proba is the positive-class probability.
    fraudProb = candidate.predict_proba(XTest)[:, 1]
    auc = metrics.roc_auc_score(YTest, fraudProb)
    print("AUC : ", auc)

    return dict(loss=-auc, status=STATUS_OK)


In [6]:
# Search space handed to hyperopt's fmin: plain values are passed to
# XGBClassifier unchanged, hp.* entries are sampled on every trial.
# NOTE(review): both 'n_jobs' and 'nthread' are set below with different
# values - confirm which one is meant to apply.
paramSpace = dict(
    n_estimators=hp.randint('n_estimators', 300, 800),
    max_depth=hp.randint('max_depth', 9, 18),
    # grow_policy=hp.randint('grow_policy', 0, 1),
    learning_rate=hp.quniform('learning_rate', 0.01, 0.2, 0.015),
    objective='binary:logistic',
    booster='gbtree',
    tree_method='gpu_hist',
    n_jobs=-1,
    gamma=hp.quniform('gamma', 0.4, 1, 0.045),
    min_child_weight=hp.quniform('min_child_weight', 1, 12, 1),
    subsample=hp.quniform('subsample', 0.55, 1, 0.045),
    colsample_bytree=hp.quniform('colsample_bytree', 0.65, 1, 0.048),
    reg_alpha=hp.quniform('reg_alpha', 0, 10, 1),
    reg_lambda=hp.quniform('reg_lambda', 1, 2, 0.095),
    # helps in convergence for high imbalance
    scale_pos_weight=hp.quniform('scale_pos_weight', 45, 200, 9),
    base_score=hp.quniform('base_score', 0.67, 0.72, 0.045),
    eval_metric='auc',
    nthread=4,
)

In [7]:
def tuneParams():
    """Run 10 TPE trials of objectiveFn over paramSpace.

    Prints the best assignment reported by fmin and returns
    ``[best assignment, Trials object]``.
    """
    history = Trials()

    best = fmin(
        fn=objectiveFn,
        space=paramSpace,
        algo=tpe.suggest,
        max_evals=10,
        trials=history,
    )

    print(best)
    return [best, history]

In [8]:
# param_models = tuneParams()

In [9]:
# Hyper-parameters used for the final model, plus n_jobs / verbosity settings.
bestParams = dict(
    base_score=0.6749999999999999,
    colsample_bytree=0.672,
    gamma=0.44999999999999996,
    learning_rate=0.105,
    max_depth=9,
    min_child_weight=8.0,
    n_estimators=671,
    reg_alpha=4.0,
    reg_lambda=1.1400000000000001,
    scale_pos_weight=45.0,
    subsample=0.8099999999999999,
    n_jobs=-1,
    verbosity=1,
)

# Train with cross-validation reporting, then score the test frame and save the result.
model = xgbClassifier(dfTrain, bestParams)
evaluateModel(dfTest, model)

Accuracy :  0.9807500829737803
validation-AUC :  0.9616624896384297 
Accuracy :  0.9496143633834871
validation-AUC :  0.9360516844556065 
Accuracy :  0.9457579972183588
validation-AUC :  0.9433116750424605 
Accuracy :  0.9671260589202175
validation-AUC :  0.9553846694198651 
Accuracy :  0.9697022379567581
validation-AUC :  0.9549636420157972 
Accuracy :  0.9549721835883171
validation-AUC :  0.9461414255576382 
Accuracy :  0.7420818055379947
validation-AUC :  0.9556520155891286 
Result file saved


In [10]:
# n_estimator=550,verbosity=1,n_jobs=-1,reg_alpha=8.0,colsample_bytree=0.8,
# learning_rate=0.17500000000000002,gamma=0.6000000000000001,
# reg_lambda=1.20000000000000001,max_depth=10,min_child_weight=1.0,scale_pos_weight=60.0,
# subsample=0.700000000000001,objective='binary:logistic'


# model = xgbClassifier(dfTrain)
# evaluateModel(dfTest,model)