In [2]:
import pandas as pd
import numpy as  np
from sklearn.model_selection import train_test_split,StratifiedKFold
from xgboost import XGBClassifier
from sklearn import metrics
from hyperopt import fmin,tpe,hp,STATUS_OK,Trials,space_eval

In [3]:
def changeDType(df,flag=False):
    
    if(flag):
        numericDtype = ['int32','int64','float64','float32']
    
    for i in df.columns:
        if (df[i].dtype == 'int64' or df[i].dtype == 'int32'):
            df[i] = pd.to_numeric(df[i],downcast='integer')
        
        if (df[i].dtype == 'float64' or df[i].dtype == 'float32'):
            df[i] = pd.to_numeric(df[i],downcast='float')


In [4]:
dfTest = pd.read_csv('../input/preprocessedtestv3/preprocessedTestV3.csv')
dfTrain = pd.read_csv('../input/preprocessedtrainv3/preprocessedTrainV3.csv')

In [5]:
changeDType(dfTest)
changeDType(dfTrain)

In [6]:
def evaluateModel(df,model):
    
    df = df - df.min()/df.max()-df.min()
    
    y_pred = model.predict(df)
    
    result = pd.DataFrame(y_pred)
    
    result.columns = ['isFraud']
    result.to_csv("./result_xgb_final.csv")
    
    print("Result file saved")
    

In [7]:
def xgbClassifier(df,params=None,splits=5):
    
    tmpPara = {'n_estimator':550,'eval_metric':'auc','verbosity':1,'n_jobs':-1,'reg_alpha':8.0,'colsample_bytree':0.8,'learning_rate':0.17500000000000002,'gamma':0.6000000000000001,'reg_lambda':1.20000000000000001,'max_depth':10,'min_child_weight':1.0,'scale_pos_weight':60.0,'subsample':0.700000000000001,'objective':'binary:logistic'}
    
    if(params!=None):
        tmpPara = params
    
    X_train = df.drop(['isFraud'],axis=1)
    Y_train = df['isFraud']
    
    X_train = X_train - X_train.min()/X_train.max() - X_train.min()
    
    skf = StratifiedKFold(n_splits=splits)
#     if()
    xgbc = XGBClassifier(**tmpPara)
    
    for trainIndx,testIndx in skf.split(X_train,Y_train):
        
        Xtrain = X_train.iloc[trainIndx]
        Ytrain = Y_train.iloc[trainIndx]
        
        Xtest = X_train.iloc[testIndx]
        Ytest = Y_train.iloc[testIndx]
    
        xgbc.fit(Xtrain,Ytrain)
        
        print("Accuracy : ",xgbc.score(Xtest,Ytest))
        
        prob = xgbc.predict_proba(Xtest)[:,1]
        
        auc = metrics.roc_auc_score(Ytest,prob)
        print("validation-AUC : ",auc,"\n===============================")
        
        
    return xgbc

In [8]:
train,test = train_test_split(dfTrain,test_size=0.25,stratify=dfTrain['isFraud'])

XTrain = train.drop(['isFraud'],axis=1)
YTrain = train['isFraud']

XTest = test.drop(['isFraud'],axis=1)
YTest = test['isFraud']


XTrain = XTrain - XTrain.min()/XTrain.max()-XTrain.min()
XTest = XTest - XTest.min()/XTest.max()-XTest.min()


def objectiveFn(paramSpace):
    
    xgbc = XGBClassifier(**paramSpace)    
    xgbc.fit(XTrain,YTrain)
    
    prob = xgbc.predict_proba(XTest)[:,1]
    auc = metrics.roc_auc_score(YTest,prob)
    print("AUC : ",auc)
    return {'loss':-auc,'status':STATUS_OK} # -ve auc minimization ==> +auc maximization


In [9]:
paramSpace = {
    'n_estimators' : hp.randint('n_estimators',300,800),
    'max_depth' : hp.randint('max_depth',9,18),
#     'grow_policy' : hp.randint('grow_policy',0,1),
    'learning_rate' : hp.quniform('learning_rate',0.01,0.2,0.015),
    'objective' : 'binary:logistic',
    'booster':'gbtree',
    'tree_method' : 'gpu_hist',
    'n_jobs': -1,
    'gamma' : hp.quniform('gamma',0.4,1,0.06),
    'min_child_weight' : hp.quniform('min_child_weight',1,12,1),
    'subsample':hp.quniform('subsample',0.55,1,0.055),
    'colsample_bytree':hp.quniform('colsample_bytree', 0.65, 1, 0.055),
    'reg_alpha' : hp.quniform('reg_alpha', 0, 10, 1),
    'reg_lambda': hp.quniform('reg_lambda', 1, 2, 0.12),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 45, 200, 7), # helps in convergence for high imbalance
    'base_score' : hp.quniform('base_score',0.67,0.72,0.055),
    'eval_metric'  : 'auc',
}

In [10]:
def tuneParams():
    trialModels = Trials()
    
    bestParam = fmin(fn=objectiveFn,space=paramSpace,algo=tpe.suggest,max_evals=99,trials=trialModels)
    
    print(bestParam)
    return [bestParam,trialModels]

In [16]:
# param_models = tuneParams()

AUC :                                                 
0.9487654493908302                                    
AUC :                                                                            
0.9474298110913414                                                               
AUC :                                                                            
0.9399103253713272                                                               
AUC :                                                                            
0.9495908008097683                                                               
AUC :                                                                            
0.9463726409923019                                                               
AUC :                                                                            
0.9473695813799765                                                               
AUC :                                                                 

In [10]:
# print(param_models[0])



In [12]:
# {'base_score': 0.66, 'colsample_bytree': 0.935, 'gamma': 0.6, 'learning_rate': 0.03, 'max_depth': 15, 'min_child_weight': 10.0,
# 'n_estimators': 773, 'reg_alpha': 1.0, 'reg_lambda': 2.04, 'scale_pos_weight': 42.0, 'subsample': 0.935}
# AUC 0.8463
best_params = {'objective':'binary:logistic','n_jobs':-1,'base_score': 0.66, 'colsample_bytree': 0.935, 'gamma': 0.6, 'learning_rate': 0.03, 'max_depth': 15, 'min_child_weight': 10.0, 'n_estimators': 773, 'reg_alpha': 1.0, 'reg_lambda': 2.04, 'scale_pos_weight': 42.0, 'subsample': 0.935}
model = xgbClassifier(dfTrain,best_params)
evaluateModel(dfTest,model)

Accuracy :  0.982829391205397
validation-AUC :  0.9639806864407422 
Accuracy :  0.9786840607981793
validation-AUC :  0.9570895847258265 
Accuracy :  0.9775258067138096
validation-AUC :  0.9600860710574438 
Accuracy :  0.9651304559863448
validation-AUC :  0.9464900734649435 
Accuracy :  0.9774445257254328
validation-AUC :  0.9604679046068505 
Accuracy :  0.9801674388360563
validation-AUC :  0.9657926823154163 
Accuracy :  0.9763670724025116
validation-AUC :  0.9551439863972097 
Accuracy :  0.979801263945053
validation-AUC :  0.95609537084803 
Accuracy :  0.9755136046818801
validation-AUC :  0.9599264188058563 
Result file saved


In [21]:
# bestParams = {'base_score': 0.6749999999999999, 'colsample_bytree': 0.672, 'gamma': 0.44999999999999996, 'learning_rate': 0.105,
#               'max_depth': 9, 'min_child_weight': 8.0, 'n_estimators': 671, 'reg_alpha': 4.0, 'reg_lambda': 1.1400000000000001,
#               'scale_pos_weight': 45.0, 'subsample': 0.8099999999999999,'n_jobs':-1,'verbosity':1}

# model = xgbClassifier(dfTrain,bestParams)
# evaluateModel(dfTest,model)

In [32]:
# n_estimator=550,verbosity=1,n_jobs=-1,reg_alpha=8.0,colsample_bytree=0.8,
# learning_rate=0.17500000000000002,gamma=0.6000000000000001,
# reg_lambda=1.20000000000000001,max_depth=10,min_child_weight=1.0,scale_pos_weight=60.0,
# subsample=0.700000000000001,objective='binary:logistic'


# model = xgbClassifier(dfTrain)
# evaluateModel(dfTest,model)

In [14]:
best_params = {'objective':'binary:logistic','n_jobs':-1,'base_score': 0.66, 'colsample_bytree': 0.935, 'gamma': 0.6, 'learning_rate': 0.03, 'max_depth': 15, 'min_child_weight': 10.0, 'n_estimators': 773, 'reg_alpha': 1.0, 'reg_lambda': 2.04, 'scale_pos_weight': 42.0, 'subsample': 0.935}
model = xgbClassifier(dfTrain,best_params)
evaluateModel(dfTest,model)

Accuracy :  0.9771238140824464
validation-AUC :  0.9428550125332225 
Accuracy :  0.425175658833517
validation-AUC :  0.9131053232629306 
Result file saved


In [11]:
# n_estimator=550,verbosity=1,n_jobs=-1,reg_alpha=8.0,colsample_bytree=0.8,
# learning_rate=0.17500000000000002,gamma=0.6000000000000001,
# reg_lambda=1.20000000000000001,max_depth=10,min_child_weight=1.0,scale_pos_weight=60.0,
# subsample=0.700000000000001,objective='binary:logistic'

# v1
params = {'n_estimator':550,'verbosity':1,'n_jobs':-1,'reg_alpha':8.0,'colsample_bytree':0.8,'learning_rate':0.17500000000000002,'gamma':0.6000000000000001,'reg_lambda':1.20000000000000001,'max_depth':10,'min_child_weight':1.0,'scale_pos_weight':60.0,'subsample':0.700000000000001,'objective':'binary:logistic'}
model = xgbClassifier(dfTrain,params,7)
evaluateModel(dfTest,model)

# 0.86854

Parameters: { "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Accuracy :  0.979027389249759
validation-AUC :  0.9512300126495153 
Parameters: { "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Accuracy :  0.9154918447338475
validation-AUC :  0.9423773685607336 
Parameters: { "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being u

In [13]:
#v2
params = {'n_estimator':500,'verbosity':1,'n_jobs':-1,'reg_alpha':6.0,'colsample_bytree':0.7,'learning_rate':0.2,'gamma':0.7000000000000001,'reg_lambda':1.50000000000000001,'max_depth':15,'min_child_weight':1.0,'scale_pos_weight':45.0,'subsample':0.6500000000000001,'objective':'binary:logistic'}
model = xgbClassifier(dfTrain,params,7)
evaluateModel(dfTest,model)

Parameters: { "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Accuracy :  0.9820144453400345
validation-AUC :  0.952461824320171 
Parameters: { "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Accuracy :  0.9735586041218864
validation-AUC :  0.944592170957121 
Parameters: { "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being us