In [1]:
%%time

import pandas as pd
import numpy as np
## DATA IMPORT ## 
## For this project the data from https://www.kaggle.com/c/ieee-fraud-detection/data has been used. ##

data_path = "./data/"

train = pd.read_csv(data_path + "train_processed.csv")
test = pd.read_csv(data_path + "test_processed.csv")

print('train_set shape is: {}'.format(train.shape))
print('test_set shape is: {}'.format(test.shape))

train_set shape is: (590540, 435)
test_set shape is: (506691, 434)
Wall time: 41.3 s


In [2]:
X = train.copy()
y = train['isFraud'].copy()
X = X.drop(['isFraud'], axis=1)

In [3]:
import xgboost as xgb

data_dmatrix = xgb.DMatrix(data=X,label=y) # define data_dmatrix

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17, shuffle=False)

from sklearn.impute import SimpleImputer

# Create our imputer to replace missing values with the mean e.g.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X_train)

# Impute our data, then train
X_train_imp = imp.transform(X_train)

from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Undersample the train dataset
ros = RandomUnderSampler(random_state=17)

X_train, y_train = ros.fit_resample(X_train_imp, y_train)
print('Resampled dataset shape {}'.format(Counter(y_train)))

Resampled dataset shape Counter({0: 16599, 1: 16599})


In [4]:
# UNDERSAMPLING APPROACH - XGBOOST

from xgboost import XGBClassifier


# declare parameters
params = {
    'objective':'binary:logistic',
    'max_depth': 4,
    'alpha': 10,
    'learning_rate': 1.0,
    'n_estimators':100
}         
           
          
# instantiate the classifier 
xgb_clf = XGBClassifier(**params)


# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

from sklearn.metrics import roc_auc_score

print('XGBoost model roc_auc_score score: {0:0.4f}'. format(roc_auc_score(y_test, y_pred)))



XGBoost model roc_auc_score score: 0.7402


In [5]:
from xgboost import cv

params = {
    "objective":"binary:logistic",
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5, 
    'alpha': 10}

xgb_cv = cv(dtrain=data_dmatrix,
            params=params, 
            nfold=10,
            num_boost_round=50, 
            early_stopping_rounds=20, 
            metrics="auc", 
            as_pandas=True,
            seed=17)

In [6]:
xgb_cv.head()

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.729512,0.003509,0.728379,0.007882
1,0.761706,0.022398,0.760917,0.02282
2,0.77756,0.024385,0.776679,0.020955
3,0.792145,0.018374,0.791021,0.014517
4,0.80354,0.015404,0.801941,0.013515
