<b>Naive Bayes Classifier Fraud Detection using a highly unbalanced dataset from Kaggle</b><br>
No feature engineering allowing for a concise notebook illustrating Hyperparameter Tuning & Training with AWS Sagemaker

https://www.kaggle.com/mlg-ulb/creditcardfraud

In [1]:
# Define Python Modules

import os
import boto3
import numpy as np
import pandas as pd

from sagemaker import s3_input
from sagemaker import get_execution_role
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnPredictor
from sagemaker.tuner import HyperparameterTuner
from sagemaker.tuner import ContinuousParameter

In [2]:
# Define Unique Variables

role             = 'AmazonSageMaker-ExecutionRole-20191005T164168'
prefix           = 'naivebayesclassifier'
bucket           = 'creditcardfraud123'
csv_filename     = 's3://' + bucket + '/creditcard.csv'
endpoint_name    = 'creditcardnaivebayesclassifier'
sagemaker_client = boto3.Session().client('sagemaker')

In [3]:
# Randomly Separate Train, Validation, and Test Datasets

df = pd.read_csv(csv_filename)

model_data = df[df.columns.tolist()[-1:] + df.columns.tolist()[:-1]]

train_data, validation_data, test_data = np.split(
                                                  model_data.sample(frac=1),
                                                  [
                                                   int(0.7 * len(model_data)),
                                                   int(0.9 * len(model_data))
                                                  ]
                                                 )

train_data.to_csv(     'train.csv'     , header=True, index=False)
validation_data.to_csv('validation.csv', header=True, index=False)

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv'          )).upload_file('train.csv'     )
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')

In [4]:
# Define Class Weights for Highly Unbalanced Fraud Data

class_counts  = df['Class'].reset_index().groupby('Class').agg('count').values
class_weights = float(class_counts[0]/float(class_counts[1]))

print('Non-Fraud Count in Train Set = {}'.format(    float(class_counts[0])))
print('Fraud Count in Train Set     = {}'.format(    float(class_counts[1])))
print('Class Weight Scale Factor    = {:.1f}'.format(class_weights         ))

Non-Fraud Count in Train Set = 284315.0
Fraud Count in Train Set     = 492.0
Class Weight Scale Factor    = 577.9


In [6]:
%%writefile script.py

import argparse
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB

def model_fn(model_dir):
    return joblib.load('/opt/ml/model/model.joblib')

if __name__ =='__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--features', type=str)
    parser.add_argument('--target'  , type=str)

    parser.add_argument('--var_smoothing', type=float)

    args, _ = parser.parse_known_args()

    train_df = pd.read_csv('/opt/ml/input/data/train/train.csv'    )
    test_df  = pd.read_csv('/opt/ml/input/data/test/validation.csv')

    X_train = train_df[args.features.split()]
    y_train = train_df[args.target          ]
    X_test  = test_df[ args.features.split()]
    y_test  = test_df[ args.target          ]

    model = GaussianNB(
                       var_smoothing = args.var_smoothing
                      ).fit(
                            X_train,
                            y_train
                           )

    joblib.dump(
                model                       ,
                '/opt/ml/model/model.joblib'
               )

    for q in [10, 50, 90]:
        print('AE-at-' + str(q) + 'th-percentile: ' + str(np.percentile(a=np.abs(model.predict(X_test) - y_test), q=q)))

Overwriting script.py


In [8]:
# Launch Training and HyperParameter Tuning Jobs using Spot Instances

rfc = SKLearn(
              entry_point              = 'script.py'         ,
              role                     = get_execution_role(),
              train_instance_count     = 1                   ,
              train_instance_type      = 'ml.m5.large'       ,
              train_use_spot_instances = True                ,
              train_max_run            = 3600                ,
              train_max_wait           = 3600                ,
              metric_definitions       = [{
                                           'Name'  : 'median-AE'                          ,
                                           'Regex' : "AE-at-50th-percentile: ([0-9.]+).*$"
                                         }]                  ,
              hyperparameters          = {
                                          'var_smoothing' : 1e-9                                                                                                           ,
                                          'features' : 'Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount',
                                          'target'   : 'Class'
                                         }
             )

tuner = HyperparameterTuner(
                            estimator             = rfc              ,
                            max_jobs              = 3                ,
                            max_parallel_jobs     = 2                ,
                            hyperparameter_ranges = {
                                                     'var_smoothing' : ContinuousParameter(1e-9, 1e-8)
                                                    }                ,
                            objective_type        = 'Minimize'       ,
                            objective_metric_name = 'median-AE'      ,
                            metric_definitions    = [{
                                                      'Name'  : 'median-AE'                          ,
                                                      'Regex' : 'AE-at-50th-percentile: ([0-9.]+).*$'
                                                    }]
                           )

tuner.fit({
           'train' : s3_input(s3_data='s3://{}/{}/train/'.format(     bucket, prefix), content_type='text/csv'),
           'test'  : s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='text/csv')
         })

In [27]:
# Create Ephemeral Endpoint to Analyze Best Tuned Model

if sagemaker_client.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuner.latest_tuning_job.name)['HyperParameterTuningJobStatus'] == 'Completed':
    
    best_training_job = sagemaker_client.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuner.latest_tuning_job.name)['BestTrainingJob']['TrainingJobName']

    SKLearnModel(
                 model_data  = sagemaker_client.describe_training_job(TrainingJobName=best_training_job)['ModelArtifacts']['S3ModelArtifacts'],
                 role        = get_execution_role()                                                                                           ,
                 entry_point = 'script.py'
                ).deploy(
                         initial_instance_count = 1              ,
                         instance_type          = 'ml.t2.2xlarge',
                         endpoint_name          = endpoint_name
                        )

    predictor = SKLearnPredictor(endpoint_name)

    true_positive  = 0
    false_positive = 0
    true_negative  = 0
    false_negative = 0

    columns = ['Time','V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13','V14','V15','V16','V17','V18','V19','V20','V21','V22','V23','V24','V25','V26','V27','V28','Amount']
    
    for i in range(0, test_data.count()['Class']):
        
        prediction = predictor.predict(test_data.iloc[i:i+1][columns])[0]
        
        if prediction > 0:

            if int(test_data.iloc[i:i+1]['Class']) == prediction:
                true_positive  += 1
            else:
                false_positive += 1

        else:

            if int(test_data.iloc[i:i+1]['Class']) == prediction:
                true_negative  += 1
            else:
                false_negative += 1

    predictor.delete_endpoint()
    predictor.delete_model()

    print(' ')
    if true_positive+true_negative+false_positive+false_negative > 0:     print('Accuracy  = {:.1f}'.format(((true_positive+true_negative)/(true_positive+true_negative+false_positive+false_negative))*100))
    if true_positive+false_positive > 0:                                  print('Precision = {:.1f}'.format((true_positive/(true_positive+false_positive))*100))
    if true_positive+false_negative > 0:                                  print('Recall    = {:.1f}'.format((true_positive/(true_positive+false_negative))*100))
    if true_positive+false_positive and true_positive+false_negative > 0: print('F1 Score  = {:.1f}'.format((2*((true_positive/(true_positive+false_positive))*(true_positive/(true_positive+false_negative)))/((true_positive/(true_positive+false_positive))+(true_positive/(true_positive+false_negative))))*100))

else:

    print('Please wait for tuning job to complete')

---------------------------------------------------------------------------------------------------! 
Accuracy  = 99.8
Precision = 28.9
Recall    = 27.7
F1 Score  = 28.3
