In [None]:
import sagemaker
import os
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sagemaker.tuner import IntegerParameter,ContinuousParameter,HyperparameterTuner
from pandas import read_csv
from sklearn.utils import shuffle

In [None]:
# Open the SageMaker session used by every later cell (uploads, training, transform).
session = sagemaker.Session()
# S3 bucket the session uses by default; training output is written under it.
bucket_name = session.default_bucket()
# Execution role handed to the estimator when it is constructed.
role = get_execution_role()

In [None]:
# Local working directory: the CSV splits are written here and the batch
# transform output is downloaded here later in the notebook.
data_dir = './data/heart_data'
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(data_dir, exist_ok=True)

In [None]:
# Load the raw table; the 'num' column holds the label.
all_data = pd.read_csv(os.path.join("./data", 'data.csv'))
# Not enough data points for labels 2, 3 and 4 individually, so they are
# merged into label 1 (presumably leaving a 0/1 target -- verify against the data).
all_data = all_data.replace({'num': {2: 1, 3: 1, 4: 1}})
# Fixed seed so that Restart & Run All reproduces the same row order.
all_data = shuffle(all_data, random_state=42)
labels = all_data["num"]
top_features = ['age', 'ekgmo', 'cmo', 'thalrest', 'cday', 'trestbpd', 'tpeakbps', 'tpeakbpd', 'thaldur', 'thalach', 'trestbps', 'ekgday', 'chol', 'oldpeak']
# Select the model inputs by name. 'num' is not in top_features, so the label
# cannot leak into the features regardless of where it sits in the file.
features = all_data.loc[:, top_features]


In [None]:
# 60/20/20 train/validation/test split: hold out 20% for testing, then take
# 25% of the remaining 80% for validation. Seeds are fixed so the splits (and
# therefore the reported metrics) are reproducible across kernel restarts.
features_rest, features_test, labels_rest, labels_test = train_test_split(
    features, labels, test_size=0.20, random_state=42)
features_train, features_val, labels_train, labels_val = train_test_split(
    features_rest, labels_rest, test_size=0.25, random_state=42)

# Sanity check: every row ends up in exactly one split.
assert len(features_train) + len(features_val) + len(features_test) == len(features)


In [None]:
# Write each split as a headerless, index-free CSV. The train and validation
# files carry the label as their first column; the test file holds features
# only because it is fed to batch transform for prediction.
csv_splits = [
    ('test.csv', pd.DataFrame(features_test)),
    ('train.csv', pd.concat([labels_train, features_train], axis=1)),
    ('validation.csv', pd.concat([labels_val, features_val], axis=1)),
]
for file_name, frame in csv_splits:
    frame.to_csv(os.path.join(data_dir, file_name), header=False, index=False)

In [None]:
# Key prefix under which the three CSV files are uploaded.
prefix = 'heart-data'
# Upload in the same order as before (test, train, validation) and keep the
# location returned for each file.
test_location, train_location, val_location = (
    session.upload_data(os.path.join(data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('test.csv', 'train.csv', 'validation.csv')
)

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Retrieve the container image that holds the training and inference code
# for the XGBoost algorithm (version '0.90-1') in the session's region.
xgb_container = get_image_uri(
    session.boto_region_name,
    'xgboost',
    '0.90-1',
)

In [None]:
# Model artifacts are written under <bucket>/<prefix>/output.
xgb_output_path = 's3://{}/{}/output'.format(bucket_name, prefix)

# Estimator wrapping the XGBoost container: one ml.m4.xlarge training instance.
xgb = sagemaker.estimator.Estimator(
    xgb_container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=xgb_output_path,
    sagemaker_session=session,
)

# Starting hyperparameters. The tuner defined in a later cell searches over
# max_depth, eta, min_child_weight, subsample and gamma; the rest stay fixed.
initial_hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    scale_pos_weight=1.0,
    objective='binary:logistic',
    early_stopping_rounds=10,
    num_round=500,
)
xgb.set_hyperparameters(**initial_hyperparameters)

In [None]:
# Hyperparameter search over the estimator defined above: 6 training jobs in
# total, at most 3 running at once.
#
# The estimator is trained with objective='binary:logistic', so candidates are
# ranked by validation AUC (higher is better) rather than RMSE, which is a
# regression metric and does not reflect classification quality well.
xgb_hyperparameter_tuner = HyperparameterTuner(estimator = xgb,
                                               objective_metric_name = 'validation:auc',
                                               objective_type = 'Maximize',
                                               max_jobs = 6,
                                               max_parallel_jobs = 3,
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10)
                                               })

In [None]:
# Wrap each uploaded CSV location as a SageMaker input with content type 'csv'.
s3_input_train, s3_input_test, s3_input_val = (
    sagemaker.s3_input(s3_data=csv_location, content_type='csv')
    for csv_location in (train_location, test_location, val_location)
)

In [None]:
# Start the tuning job with the train and validation channels.
tuning_channels = {'train': s3_input_train, 'validation': s3_input_val}
xgb_hyperparameter_tuner.fit(tuning_channels)

In [None]:
# Wait on the tuning job before the next cell asks for its best training job.
xgb_hyperparameter_tuner.wait()

In [None]:
# Look up the best training job found by the tuner and attach an estimator
# to it so it can be used for batch transform.
best_training_job_name = xgb_hyperparameter_tuner.best_training_job()
xgb_attached = sagemaker.estimator.Estimator.attach(best_training_job_name)

In [None]:
# Run a batch transform job over the uploaded test CSV, one record per line,
# on a single ml.m4.xlarge instance, and wait for it to finish.
xgb_transformer = xgb_attached.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
)
xgb_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)
xgb_transformer.wait()

In [None]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

In [None]:
# Read the downloaded batch-transform output; the first (only) column is
# expected to hold one model score per test row.
predicted_scores = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None).iloc[:, 0]
# Turn scores into 0/1 labels with a 0.5 threshold. For scores in [0, 1] this
# matches the previous round()-based result (round(0.5) == 0 under
# half-to-even rounding), and it also works when the output has a single row,
# where squeeze() returned a scalar with no `.values` attribute.
predictions = (predicted_scores > 0.5).astype(int).tolist()

In [None]:
# Compare the held-out test labels with the thresholded predictions.
test_confusion = confusion_matrix(labels_test, predictions)
test_report = classification_report(labels_test, predictions)
print(test_confusion)
print(test_report)