<b>XGBoost Fraud Detection using a highly unbalanced dataset from Kaggle</b><br>
No feature engineering allowing for a concise notebook illustrating Hyperparameter Tuning & Training with AWS Sagemaker

https://www.kaggle.com/mlg-ulb/creditcardfraud

In [1]:
# Define Python Modules

import os
import boto3
import numpy as np
import pandas as pd

from sagemaker import s3_input
from sagemaker import get_execution_role
from sagemaker.model import Model
from sagemaker.estimator import Estimator
from sagemaker.predictor import RealTimePredictor
from sagemaker.predictor import csv_serializer
from sagemaker.predictor import json_deserializer
from sagemaker.tuner import HyperparameterTuner
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.amazon.amazon_estimator import get_image_uri

In [2]:
# Define Unique Variables

role             = 'AmazonSageMaker-ExecutionRole-20191005T164168'
prefix           = 'xgboost'
bucket           = 'creditcardfraud123'
repo_name        = 'xgboost'
repo_version     = 'latest'
csv_filename     = 's3://' + bucket + '/creditcard.csv'
endpoint_name    = 'creditcardfraudxgboost'
sagemaker_client = boto3.Session().client('sagemaker')

In [3]:
# Randomly Separate Train, Validation, and Test Datasets

df = pd.read_csv(csv_filename)

model_data = df[df.columns.tolist()[-1:] + df.columns.tolist()[:-1]]

train_data, validation_data, test_data = np.split(
                                                  model_data.sample(frac=1),
                                                  [
                                                   int(0.7 * len(model_data)),
                                                   int(0.9 * len(model_data))
                                                  ]
                                                 )

train_data.to_csv(     'train.csv'     , header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv'          )).upload_file('train.csv'     )
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')

In [4]:
# Define Class Weights for Highly Unbalanced Fraud Data

class_counts  = df['Class'].reset_index().groupby('Class').agg('count').values
class_weights = float(class_counts[0]/float(class_counts[1]))

print('Non-Fraud Count in Train Set = {}'.format(    float(class_counts[0])))
print('Fraud Count in Train Set     = {}'.format(    float(class_counts[1])))
print('Class Weight Scale Factor    = {:.1f}'.format(class_weights         ))

Non-Fraud Count in Train Set = 284315.0
Fraud Count in Train Set     = 492.0
Class Weight Scale Factor    = 577.9


In [5]:
# Launch Training and HyperParameter Tuning Jobs using Spot Instances

xgb   = Estimator(
                  image_name               = get_image_uri(
                                                           region_name  = boto3.Session().region_name,
                                                           repo_name    = repo_name                  ,
                                                           repo_version = repo_version
                                                          )                            ,
                  role                     = get_execution_role()                      ,
                  train_instance_count     = 1                                         ,
                  train_instance_type      = 'ml.m5.large'                             ,
                  train_use_spot_instances = True                                      ,
                  train_max_run            = 3600                                      ,
                  train_max_wait           = 3600                                      ,
                  output_path              = 's3://{}/{}/output'.format(bucket, prefix)
                 )

xgb.set_hyperparameters(
                        scale_pos_weight = class_weights    , # Calculated in previous cell
                        objective        = 'reg:logistic'   ,
                        eval_metric      = 'auc'            ,
                        max_depth        = 10               ,
                        eta              = 0.713            ,
                        gamma            = 4                ,
                        min_child_weight = 9.93             ,
                        subsample        = 0.8              ,
                        silent           = 0                ,
                        num_round        = 100
                       )

tuner = HyperparameterTuner(
                            estimator             = xgb             ,
                            max_jobs              = 3               ,
                            max_parallel_jobs     = 2               ,
                            objective_metric_name = 'validation:auc',
                            strategy              = 'Bayesian'      ,
                            hyperparameter_ranges = {
                                                     'alpha'             : ContinuousParameter(0  , 1000),
                                                     'colsample_bylevel' : ContinuousParameter(0.1,    1),
                                                     'colsample_bytree'  : ContinuousParameter(0.5,    1),
                                                     'lambda'            : ContinuousParameter(0  , 1000),
                                                     'max_delta_step'    : IntegerParameter(   0  ,   10)
                                                    }
                           )

tuner.fit({
           'train'      : s3_input(s3_data='s3://{}/{}/train/'.format(     bucket, prefix), content_type='text/csv'),
           'validation' : s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='text/csv')
         })

	get_image_uri(region, 'xgboost', '0.90-1').


In [9]:
# Create Ephemeral Endpoint to Analyze Best Tuned Model

if sagemaker_client.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuner.latest_tuning_job.name)['HyperParameterTuningJobStatus'] == 'Completed':
    
    best_training_job = sagemaker_client.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuner.latest_tuning_job.name)['BestTrainingJob']['TrainingJobName']
    
    Model(
          model_data = sagemaker_client.describe_training_job(TrainingJobName=best_training_job)['ModelArtifacts']['S3ModelArtifacts'],
          image      = get_image_uri(
                                     region_name  = boto3.Session().region_name,
                                     repo_name    = repo_name                  ,
                                     repo_version = repo_version
                                    ),
          role       = role
         ).deploy(
                  initial_instance_count = 1              ,
                  instance_type          = 'ml.t2.2xlarge',
                  endpoint_name          = endpoint_name
                 )

    predictor = RealTimePredictor(endpoint_name)

    predictor.content_type = 'text/csv'
    predictor.serializer   = csv_serializer
    predictor.deserializer = json_deserializer

    true_positive  = 0
    false_positive = 0
    true_negative  = 0
    false_negative = 0

    for testdata in test_data.to_numpy():
        
        prediction = np.round(predictor.predict(testdata[1:]))

        if prediction > 0:

            if testdata[0] == prediction:
                true_positive  += 1
            else:
                false_positive += 1

        else:

            if testdata[0] == prediction:
                true_negative  += 1
            else:
                false_negative += 1

    predictor.delete_endpoint()
    predictor.delete_model()

    print(' ')
    if true_positive+true_negative+false_positive+false_negative > 0:     print('Accuracy  = {:.1f}'.format(((true_positive+true_negative)/(true_positive+true_negative+false_positive+false_negative))*100))
    if true_positive+false_positive > 0:                                  print('Precision = {:.1f}'.format((true_positive/(true_positive+false_positive))*100))
    if true_positive+false_negative > 0:                                  print('Recall    = {:.1f}'.format((true_positive/(true_positive+false_negative))*100))
    if true_positive+false_positive and true_positive+false_negative > 0: print('F1 Score  = {:.1f}'.format((2*((true_positive/(true_positive+false_positive))*(true_positive/(true_positive+false_negative)))/((true_positive/(true_positive+false_positive))+(true_positive/(true_positive+false_negative))))*100))

else:

    print('Please wait for tuning job to complete')

	get_image_uri(region, 'xgboost', '0.90-1').


---------------------------------------------------------------------------------------------------! 
Accuracy  = 99.5
Precision = 19.4
Recall    = 84.2
F1 Score  = 31.5
