In [None]:
# Import AWS Sagemaker package and get IAM role
import sagemaker

# IAM role of this notebook's execution environment; it is passed as the
# RoleArn / ExecutionRoleArn in the SageMaker API calls made later in the notebook.
role = sagemaker.get_execution_role()

In [None]:
# Update package manager because Sagemaker notebook instances look out of data
!pip install --upgrade pip

In [None]:
# upgrade scikit-learn as imblearn has some recent changes
!pip install --upgrade scikit-learn

In [None]:
# Install python package for doing SMOTE - because we have massive class imbalance
!pip install -U imbalanced-learn

In [None]:
# install matplotlib for visualizing tree later
!pip install -U matplotlib

In [None]:
# install graphviz as matplotlib will need this for the tree layout 
!pip install -U graphviz

In [None]:
# install xgboost so we can inspect, locally, the model objects we build
!pip install -U xgboost

In [None]:
# Import necessary numerical libraries
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt   
from IPython.display import Image                 
from IPython.display import display               
from sklearn.datasets import dump_svmlight_file   
from sklearn.model_selection import train_test_split
from time import gmtime, strftime                 
import sys                                        
import math                                       
import json
import boto3

import s3fs
import pickle
import tarfile
import xgboost as xgb

In [None]:
# Import imblearn package for doing SMOTE resampling
from imblearn.over_sampling import SMOTE

In [None]:
# Fetch the raw CSV from its S3 bucket into the notebook's working directory,
# where it is saved as 'ccRaw.csv'.
data_bucket = 'immersion-day-ccdata'
raw_data_filename = 'creditcard.csv'

s3 = boto3.resource('s3')
raw_data_bucket = s3.Bucket(data_bucket)
raw_data_bucket.download_file(Key=raw_data_filename, Filename='ccRaw.csv')

In [None]:
# Cap the number of rows pandas renders, then load the downloaded CSV and
# display the (truncated) frame as the cell output.
pd.options.display.max_rows = 20
ccRaw = pd.read_csv('./ccRaw.csv')
ccRaw

In [None]:
# Inspect the distinct values of the response column 'Class'.
# If these are already numeric class labels, no dummy-variable encoding is needed.
ccRaw['Class'].unique()

In [None]:
# Create the dataset splits for training, validation and test.
# Step 1: hold out 10% of all rows as the test set, stratified on 'Class'
# so that both parts keep the original class proportions.
nonTest_data, test_data = train_test_split(
    ccRaw,
    test_size=0.1,
    random_state=42,
    stratify=ccRaw['Class'].array,
)

In [None]:
# Use SMOTE to synthetically generate a more balanced version of the non-test data.
# NOTE(review): the resampling is applied before the train/validation split that
# follows, so the validation set will also contain synthetic samples - confirm
# that this is intended.
nonTest_y = nonTest_data['Class']
nonTest_X = nonTest_data.drop(columns='Class')

smote = SMOTE(random_state=42)
nonTest_X_res, nonTest_y_res = smote.fit_resample(nonTest_X, nonTest_y)

In [None]:
# Step 2: divide the resampled non-test data into training and validation
# sets, with 2/7 of the rows going to validation, stratified on the label.
train_data_X, validation_data_X, train_data_y, validation_data_y = train_test_split(
    nonTest_X_res,
    nonTest_y_res,
    test_size=(2.0 / 7.0),
    random_state=1987,
    stratify=nonTest_y_res,
)

In [None]:
# Write the training, validation and test sets as libSVM files for use with xgboost.
dump_svmlight_file(X=train_data_X, y=train_data_y, f='train.libsvm')
dump_svmlight_file(X=validation_data_X, y=validation_data_y, f='validation.libsvm')
dump_svmlight_file(X=test_data.drop(columns=['Class']), y=test_data['Class'], f='test.libsvm')

# Upload the training and validation files to our S3 bucket, each under its
# own sub-prefix. The test file stays local.
bucket = '<your_s3_bucket_name_here>'
prefix = 'sagemaker/ccData_xgboost'
_upload_bucket = boto3.Session().resource('s3').Bucket(bucket)
for _split_name in ('train', 'validation'):
    _local_file = '{}.libsvm'.format(_split_name)
    _upload_bucket.Object('{}/{}/{}'.format(prefix, _split_name, _local_file)).upload_file(_local_file)

In [None]:
# Map each supported AWS region to the ECR URI of the xgboost image used for
# training (and later for hosting). Only the regions listed here are available.
_xgboost_registry_accounts = (
    ('us-west-2', '433757028032'),
    ('us-east-1', '811284229777'),
    ('us-east-2', '825641698319'),
    ('eu-west-1', '685385470294'),
)
containers = {
    region: '{}.dkr.ecr.{}.amazonaws.com/xgboost:latest'.format(account, region)
    for region, account in _xgboost_registry_accounts
}

In [None]:
# Low-level SageMaker client used for all tuning, model and endpoint API calls below.
sm = boto3.client(service_name='sagemaker')

In [None]:
%%time

# Define and launch the HyperParameterTuningJob.
# Only the learning rate (eta) is tuned, by maximizing the AUC on the
# validation set. The search strategy is random, using a sample of 10 training
# jobs - better methods for searching the hyperparameter space are available,
# but for simplicity and demonstration purposes we use the random search
# method. At most 3 training jobs run in parallel.
job_name = "xgb-cc" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())


def _libsvm_channel(channel_name):
    """Build the input channel definition for the libsvm data stored under
    s3://<bucket>/<prefix>/<channel_name>."""
    return {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": "s3://{}/{}/{}".format(bucket, prefix, channel_name),
                "S3DataDistributionType": "FullyReplicated"
            }
        },
        "ContentType": "libsvm",
        "CompressionType": "None"
    }


# What to optimize, how many jobs to run, and the range searched for eta.
_tuning_job_config = {
    'Strategy': 'Random',
    'HyperParameterTuningJobObjective': {
        'Type': 'Maximize',
        'MetricName': 'validation:auc'
    },
    'ResourceLimits': {
        'MaxNumberOfTrainingJobs': 10,
        'MaxParallelTrainingJobs': 3
    },
    'ParameterRanges': {
        'ContinuousParameterRanges': [
            {
                'Name': 'eta',
                'MinValue': '0.01',
                'MaxValue': '0.4',
                'ScalingType': 'Linear'
            },
        ]
    },
    'TrainingJobEarlyStoppingType': 'Off'
}

# Settings shared by every training job the tuner launches.
_training_job_definition = {
    'StaticHyperParameters': {
        "max_depth": "3",
        "eval_metric": "auc",
        "scale_pos_weight": "2.0",
        "subsample": "0.5",
        "objective": "binary:logistic",
        "num_round": "100",
        "seed": "42"
    },
    'AlgorithmSpecification': {
        # Image is looked up by the region of the current boto3 session.
        'TrainingImage': containers[boto3.Session().region_name],
        'TrainingInputMode': "File"
    },
    'RoleArn': role,
    "InputDataConfig": [
        _libsvm_channel("train"),
        _libsvm_channel("validation"),
    ],
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/xgboost-ccData-hyperopt/output".format(bucket, prefix)
    },
    'ResourceConfig': {
        'InstanceCount': 1,
        'InstanceType': 'ml.c4.xlarge',
        'VolumeSizeInGB': 10
    },
    'StoppingCondition': {
        # One hour per training job.
        'MaxRuntimeInSeconds': 60 * 60
    }
}

response = sm.create_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=job_name,
    HyperParameterTuningJobConfig=_tuning_job_config,
    TrainingJobDefinition=_training_job_definition,
)

In [None]:
# Look up the current status of the tuning job. This is a one-off query, not
# a wait: re-run the cell until the job is reported as completed.
_tuning_job_description = sm.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=job_name)
status_hyperopt = _tuning_job_description['HyperParameterTuningJobStatus']

status_hyperopt


In [None]:
# Collect the results of all training jobs launched by the tuning job into a
# pandas dataframe, so that the job with the highest validation AUC can be
# selected below.
_tuning_analytics = sagemaker.HyperparameterTuningJobAnalytics(job_name)
df_hyperopt = _tuning_analytics.dataframe()

In [None]:
# Display the per-training-job results collected from the tuning job
df_hyperopt

In [None]:
# Show the full record of the 'best' training job, i.e. the row with the
# largest FinalObjectiveValue.
best_job_label = df_hyperopt['FinalObjectiveValue'].idxmax()
df_hyperopt.loc[best_job_label]

In [None]:
# Keep the name of the training job with the largest FinalObjectiveValue;
# later cells use it to locate the model artifact.
best_job_label = df_hyperopt['FinalObjectiveValue'].idxmax()
optimalTrainingJob = df_hyperopt.loc[best_job_label, 'TrainingJobName']
optimalTrainingJob

In [None]:
# Read in the pickled model object for the optimal training job
# so we can inspect the model locally.
# The location is built from the same `bucket` and `prefix` that the tuning
# job's S3OutputPath was built from, so it follows whichever bucket this
# notebook is configured with rather than a hard-coded one.
model_path_prefix = 's3://{}/{}/xgboost-ccData-hyperopt/output/'.format(bucket, prefix)
model_path = model_path_prefix + optimalTrainingJob + '/output/model.tar.gz'

fs = s3fs.S3FileSystem()

# The artifact is a tarball holding a single member named 'xgboost-model'.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load artifacts produced by your own training jobs.
with fs.open(model_path, 'rb') as f:
    with tarfile.open(fileobj=f, mode='r') as tar_f:
        with tar_f.extractfile('xgboost-model') as extracted_f:
            xgb_optimalModel = pickle.load(extracted_f)

In [None]:
# Plot first tree in the ensemble of the optimal training job.
# The figure size has to be configured before plot_tree creates the figure;
# setting it afterwards only affects figures created later.
plt.rcParams['figure.figsize'] = [50, 100]
xgb.plot_tree(xgb_optimalModel, num_trees=0)
plt.show()

In [None]:
# Plot second tree in the ensemble of the optimal training job.
# The figure size has to be configured before plot_tree creates the figure;
# setting it afterwards only affects figures created later.
plt.rcParams['figure.figsize'] = [50, 100]
xgb.plot_tree(xgb_optimalModel, num_trees=1)
plt.show()

In [None]:
# Register a SageMaker model that pairs the xgboost image for the current
# region with the artifact of the best training job; the endpoint created
# below serves predictions from it.
inference_job_name = 'optimal-cc-xgb' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

_inference_image = containers[boto3.Session().region_name]
_optimal_job_description = sm.describe_training_job(TrainingJobName=optimalTrainingJob)
_primary_container = {
    'Image': _inference_image,
    'ModelDataUrl': _optimal_job_description['ModelArtifacts']['S3ModelArtifacts'],
}

create_model_response = sm.create_model(
    ModelName=inference_job_name,
    ExecutionRoleArn=role,
    PrimaryContainer=_primary_container,
)

print(create_model_response['ModelArn'])


In [None]:
# Define the endpoint configuration: a single ml.t2.medium instance serving
# the model registered above through one production variant.
xgboost_endpoint_config = 'ccData-xgboost-endpoint-config-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(xgboost_endpoint_config)

_all_traffic_variant = {
    'InstanceType': 'ml.t2.medium',
    'InitialInstanceCount': 1,
    'ModelName': inference_job_name,
    'VariantName': 'AllTraffic',
}
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=xgboost_endpoint_config,
    ProductionVariants=[_all_traffic_variant],
)

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])


In [None]:
%%time

# Create the endpoint from the endpoint configuration and block until it is
# in service (or the waiter gives up).
xgboost_endpoint = 'EXAMPLE-ccData-xgb-endpoint-' + strftime("%Y%m%d%H%M", gmtime())
print(xgboost_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=xgboost_endpoint,
    EndpointConfigName=xgboost_endpoint_config)
print(create_endpoint_response['EndpointArn'])

resp = sm.describe_endpoint(EndpointName=xgboost_endpoint)
status = resp['EndpointStatus']
print("Status: " + status)

try:
    sm.get_waiter('endpoint_in_service').wait(EndpointName=xgboost_endpoint)
finally:
    # Report the final state whether or not the waiter succeeded.
    resp = sm.describe_endpoint(EndpointName=xgboost_endpoint)
    status = resp['EndpointStatus']
    print("Arn: " + resp['EndpointArn'])
    print("Status: " + status)

    if status != 'InService':
        # 'FailureReason' may be absent when the endpoint has not actually
        # failed (e.g. the waiter timed out while it was still being created),
        # so fall back to a message that reports the status instead of
        # raising a KeyError here.
        message = resp.get('FailureReason',
                           'no failure reason reported (status: {})'.format(status))
        print('Endpoint creation failed with the following error: {}'.format(message))
        raise Exception('Endpoint creation did not succeed')
        

In [None]:
# Client used by do_predict to invoke the deployed endpoint.
runtime = boto3.client(service_name='runtime.sagemaker')

In [None]:
# Define function for calling endpoint with input test data
def do_predict(data, endpoint_name, content_type):
    """Send one batch of records to the endpoint and return rounded predictions.

    data          -- iterable of strings, one record per item; the items are
                     joined with newlines to form the request body
    endpoint_name -- name of the endpoint to invoke
    content_type  -- content type passed along with the request

    The response body is decoded as UTF-8 and split on commas; each value is
    parsed as a float and then rounded. Returns the list of rounded values.
    """
    request_body = '\n'.join(data)
    response = runtime.invoke_endpoint(EndpointName=endpoint_name,
                                       ContentType=content_type,
                                       Body=request_body)
    raw_scores = response['Body'].read().decode("utf-8").split(',')
    # Parse every score before rounding any of them.
    scores = list(map(float, raw_scores))
    return list(map(round, scores))

def batch_predict(data, batch_size, endpoint_name, content_type):
    """Run do_predict over `data` in consecutive slices of `batch_size` items.

    A '.' is written to stdout after each batch as a progress indicator.
    Returns one flat list with the predictions of all batches, in order.
    """
    predictions = []

    for start in range(0, len(data), batch_size):
        # Slicing clamps at the end of the sequence, so the final (possibly
        # shorter) batch needs no special handling.
        batch = data[start:start + batch_size]
        predictions.extend(do_predict(batch, endpoint_name, content_type))
        sys.stdout.write('.')
    return predictions

In [None]:
%%time

# Calculate error rate for our test set
import json

# Load the libSVM-formatted test set written earlier in the notebook.
with open('test.libsvm', 'r') as f:
    payload = f.read().strip()

# One record per line; the true label is the first space-separated token.
# The lines are kept under their own name so that `test_data` keeps referring
# to the test DataFrame and earlier cells can be re-run.
test_lines = payload.split('\n')
labels = [int(line.split(' ')[0]) for line in test_lines]
preds = batch_predict(test_lines, 100, xgboost_endpoint, 'text/x-libsvm')

# Fraction of test records whose rounded prediction differs from the label.
n_wrong = sum(1 for pred, label in zip(preds, labels) if pred != label)
print('\nerror rate=%f' % (n_wrong / float(len(preds))))

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
_actual = np.array(labels)
_predicted = np.array(preds)
pd.crosstab(index=_actual, columns=_predicted)

In [None]:
# Delete the endpoint created above. Only the endpoint itself is removed here;
# the endpoint configuration and the model created earlier in this notebook
# are left in place.
sm.delete_endpoint(EndpointName=xgboost_endpoint)