In [1]:
import sagemaker
# SageMaker session object; it is handed to the Estimator further down as
# `sagemaker_session`.
sess = sagemaker.Session()

In [2]:
# Define the IAM role: `role` is passed to the SageMaker Estimator created
# later in this notebook.
import boto3
import re
from sagemaker import get_execution_role

role = get_execution_role()

In [3]:
import io
import json
import os
import sys
import time
from time import strftime, gmtime
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from IPython.display import display
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sqlalchemy import create_engine, text

## Load data from Redshift 

In [4]:
# Read the Redshift connection settings from the local vars.yaml file.
# safe_load is used instead of yaml.load: calling yaml.load without an explicit
# Loader can construct arbitrary Python objects and is rejected by PyYAML >= 6.
# The context manager closes the file handle once parsing is done.
with open('./vars.yaml', 'r') as config_file:
    conf = yaml.safe_load(config_file)

In [5]:
# Pull the four Redshift connection settings out of the loaded config, in the
# order user, password, database, host.
rs_username, rs_password, rs_db, rs_host = (
    conf[setting]
    for setting in ('REDSHIFT_USER', 'REDSHIFT_PASSWORD', 'REDSHIFT_DB', 'REDSHIFT_HOST')
)

In [6]:
# Build the SQLAlchemy engine for the Redshift cluster (port 5439).
# The user name and password are percent-encoded first: SQLAlchemy parses the
# connection URL, so a raw '@', ':' or '/' inside a credential would be read as
# a URL delimiter and break the connection string. str() keeps non-string YAML
# scalars (e.g. an all-digit password) working as they did with plain
# f-string interpolation.
# NOTE(review): assumes vars.yaml stores the credentials unencoded — confirm.
engine = create_engine(
    f'redshift+psycopg2://{quote_plus(str(rs_username))}:{quote_plus(str(rs_password))}'
    f'@{rs_host}:5439/{rs_db}'
)

In [7]:
# Query that pulls the modelling columns from the `dataset` table.
# `isfraud` is selected first; the CSV export further down writes the columns
# in this order without a header row, so the column position matters —
# presumably `isfraud` is the label that the training container expects in the
# first column (TODO confirm).
# `step` is selected because the split cell partitions the rows on it; it is
# dropped again before the data is written to S3.
sql_get_dataset = '''
SELECT
isfraud
, amount
, diff_dest_equal_amount
, diff_origin_equal_amount
, is_cash_in
, is_cash_out
, is_debit
, is_payment
, is_transfer
, namedest_c
, nameorig_c
, newbalancedest
, newbalanceorig
, oldbalancedest
, oldbalanceorg
, step
FROM dataset
'''

In [8]:
# Run the dataset query against Redshift and load the result into a DataFrame.
df = pd.read_sql(sql=sql_get_dataset, con=engine)

In [9]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,isfraud,amount,diff_dest_equal_amount,diff_origin_equal_amount,is_cash_in,is_cash_out,is_debit,is_payment,is_transfer,namedest_c,nameorig_c,newbalancedest,newbalanceorig,oldbalancedest,oldbalanceorg,step
0,0,3.993023,0,0,0,0,0,1,0,0,1,0.0,5.204926,0.0,5.230799,1
1,0,4.067039,0,0,0,0,0,1,0,0,1,0.0,4.47548,0.0,4.618623,1
2,0,3.604805,0,0,0,0,0,1,0,0,1,0.0,0.0,0.0,3.426836,1
3,0,3.408535,0,1,0,0,0,1,0,0,1,0.0,3.399719,0.0,3.705094,1
4,0,3.194464,0,0,0,0,0,1,0,0,1,0.0,0.0,0.0,2.654177,1


## Splitting the data 

<p><span style="font-weight: 400;">In ML the data is usually split into two parts: <em>Train</em> and <em>Test</em>. When applying hyperparameter optimization, it's important to also have a <em>Validation</em> set.</span></p>
<p><span style="text-decoration: underline;"><span style="font-weight: 400;">In this case we will split the data into four parts. Why?</span></span></p>
<ol>
<li style="font-weight: 400;"><span style="font-weight: 400;">The </span><span style="color: #3366ff;"><strong>Train</strong></span><span style="font-weight: 400;"> set will be used to fit the model using hyper parameter optimization.</span></li>
<li style="font-weight: 400;"><span style="font-weight: 400;">The </span><span style="color: #993366;"><strong>Validation</strong></span><span style="font-weight: 400;"> set is going to be used for validation while training.</span></li>
<li style="font-weight: 400;"><span style="font-weight: 400;">The </span><span style="color: #ff9900;"><strong>Test</strong></span><span style="font-weight: 400;"> set is going to be used to evaluate the model that performed best on the validation set.</span></li>
<li style="font-weight: 400;">The <span style="color: #008080;"><strong>fourth</strong></span> and final part will be used in the Capstone project to simulate new data - since our dataset is finite and we would like to demonstrate how to improve the model on future data.</li>
</ol>

In [10]:
# Cut points on `step` at the 0.6, 0.8 and 0.9 quantiles; the next cell uses
# them as the boundaries between the four data partitions.
quantiles = df['step'].quantile([0.6, 0.8, 0.9]).tolist()
train_end_step, test_end_step, validation_end_step = quantiles

# Report each boundary next to its name.
boundary_labels = ('train_end_step:', 'test_end_step:', 'validation_end_step:')
for boundary_label, boundary_value in zip(boundary_labels, quantiles):
    print(boundary_label, boundary_value)

train_end_step: 281.0
test_end_step: 355.0
validation_end_step: 399.0


In [11]:
# Partition the rows on `step` into four contiguous, non-overlapping ranges;
# each range includes its lower boundary and excludes its upper boundary.
# NOTE(review): the "test" range lies between the train and validation ranges,
# so the validation rows have later `step` values than the test rows —
# confirm this ordering is intended.
df_train = df.loc[df['step'].lt(train_end_step)]
df_test = df.loc[df['step'].ge(train_end_step) & df['step'].lt(test_end_step)]
df_validation = df.loc[df['step'].ge(test_end_step) & df['step'].lt(validation_end_step)]
df_new_data = df.loc[df['step'].ge(validation_end_step)]

In [12]:
# S3 bucket used for the exported data splits and for the model output path.
bucket = 'aws-capstone-gad'

In [None]:
# Create the bucket with the AWS CLI; IPython substitutes {bucket} with the
# value of the Python variable of that name.
!aws s3 mb s3://{bucket}

In [13]:
# Every split is stored as s3://<bucket>/data/<split>/<split>.csv, so the four
# destinations are generated from one template.
s3_train_path, s3_test_path, s3_val_path, s3_new_data_path = (
    f's3://{bucket}/data/{split_name}/{split_name}.csv'
    for split_name in ('train', 'test', 'val', 'new_data')
)

In [14]:
# Saving the data to S3 for training.
# `step` is only needed for splitting, so it is dropped before export. The
# files are written without an index column and without a header row;
# `index`/`header` take the documented boolean False rather than None.
# One loop replaces four copy-pasted export lines so the options stay in sync.
for split_frame, s3_destination in (
    (df_train, s3_train_path),
    (df_test, s3_test_path),
    (df_validation, s3_val_path),
    (df_new_data, s3_new_data_path),
):
    split_frame.drop(columns='step').to_csv(s3_destination, index=False, header=False)

## Set up the hyper parameter training job

In [24]:
# Look up the image URI of the 'xgboost' framework for the region of the
# current boto3 session.
# NOTE(review): 'latest' does not pin a specific XGBoost release — confirm
# which version it resolves to and consider pinning it.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='latest',
)

In [25]:
# Estimator wrapping the XGBoost container: one ml.m4.xlarge instance, with
# model artifacts written under s3://<bucket>/models/output.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, 'models'),
    sagemaker_session=sess,
)

In [26]:
# Fixed hyperparameters shared by every tuning job; the tuned ones are declared
# separately in `hyperparameter_ranges`.
# NOTE(review): rate_drop is a DART-booster option in XGBoost and no `booster`
# is set here — confirm it has the intended effect.
xgb.set_hyperparameters(**{
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'num_round': 4000,
    'rate_drop': 0.3,
})

In [27]:
# Search space explored by the tuner: three continuous ranges and one integer
# range, keyed by hyperparameter name.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(min_value=0, max_value=1),
    min_child_weight=ContinuousParameter(min_value=20, max_value=1000),
    alpha=ContinuousParameter(min_value=0, max_value=2),
    max_depth=IntegerParameter(min_value=4, max_value=15),
)

In [28]:
# Name of the metric the tuner optimises; it is passed to HyperparameterTuner
# as the objective metric in the next cell.
objective_metric_name = 'validation:auc'

In [29]:
# Tuning job definition: at most 10 training jobs in total, up to 5 running in
# parallel, with automatic early stopping enabled.
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=10,
    max_parallel_jobs=5,
    early_stopping_type='Auto',
)

In [30]:
# Point each channel at the S3 prefix (folder) that holds the split's CSV
# rather than at the file itself.
# The prefix is obtained by cutting the URI at its last '/', which removes only
# the trailing file name. str.replace would strip every occurrence of the file
# name, including one that happens to appear in the bucket or prefix.
s3_input_train = TrainingInput(s3_data=s3_train_path.rsplit('/', 1)[0] + '/', content_type='csv')
s3_input_validation = TrainingInput(s3_data=s3_val_path.rsplit('/', 1)[0] + '/', content_type='csv')

In [None]:
# Launch the tuning job with the train and validation channels.
tuner.fit(
    inputs={'train': s3_input_train, 'validation': s3_input_validation},
    include_cls_metadata=False,
)

.......................................................................................................................................