# EmOpti Workshop - XGBoost

Kernel `Python 3 (Data Science)` works well with this notebook

In [None]:
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.session import s3_input

# SageMaker session, plus the bucket and key prefix this notebook uses for its S3 objects.
session = sagemaker.Session()
s3bucket = session.default_bucket()
s3prefix = 'emopti'
local_data_path = "./data/emopti_data.csv"

# Execution role that is later handed to the training job.
sagemaker_role = get_execution_role()

# Region of the ambient boto3 configuration, and a SageMaker API client bound to that region.
region = boto3.Session().region_name
sm = boto3.Session().client("sagemaker", region_name=region)

In [None]:
# File name of the training set inside the local data/xgb/ directory.
train_filename = "train.csv"

# Upload the local training file to the session bucket under <s3prefix>/xgb/data
# and keep the value upload_data returns so it can be printed.
train_data_s3path = session.upload_data(
    path=f'data/xgb/{train_filename}',
    bucket=s3bucket,
    key_prefix=f'{s3prefix}/xgb/data',
)
print("Train data uploaded to: ", train_data_s3path, sep="")


In [None]:
from sagemaker.inputs import TrainingInput

# Point the training channel at the exact object returned by the upload step rather
# than at the whole `{s3prefix}/xgb/data` prefix: with a bare prefix, every other
# object stored under it (e.g. left over from an earlier run) would also be fed to
# the training job. Reusing `train_data_s3path` also avoids spelling the S3
# location twice.
train_input = TrainingInput(s3_data=train_data_s3path, content_type='csv')

#### This line looks up the URI of the prebuilt SageMaker XGBoost container image for the current region and the specified version.

In [None]:

# Look up the image URI of the XGBoost 1.3-1 container for the region configured above.
xgb_container = sagemaker.image_uris.retrieve(framework="xgboost", region=region, version="1.3-1")

In [None]:
%%time
from sagemaker.xgboost.estimator import XGBoost

# initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "verbosity":"1",
        "objective":"binary:logistic",
        "num_round":"50"
}

# set an output path where the trained model will be saved
output_path = f's3://{s3bucket}/{s3prefix}/xgb-output/'

# construct a SageMaker estimator that calls the xgboost-container
xgb = sagemaker.estimator.Estimator(image_uri=xgb_container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker_role,
                                          instance_count=1, 
                                          instance_type='ml.c5.4xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=output_path)



# execute the XGBoost training job
xgb.fit({'train': train_input})


### Create an endpoint for doing predictions with the trained model  
Creating the endpoint takes a few minutes.

In [None]:
from sagemaker.serializers import CSVSerializer

# Deploy the trained estimator to a single-instance endpoint; request payloads
# are encoded by CSVSerializer.
xgb_predictor = xgb.deploy(
    serializer=CSVSerializer(),
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

In [None]:
import pandas as pd

# Load data/xgb/test.csv into a DataFrame so it can be inspected below.
df = pd.read_csv(filepath_or_buffer='data/xgb/test.csv')

In [None]:
# Preview the first five rows of the test frame.
df.head(n=5)

In [None]:
# Keep the raw text lines of the test file (line endings included) so each one
# can be sent to the endpoint as-is.
with open('data/xgb/test.csv') as fd:
    lines = list(fd)

In [None]:
%%time
predictions = []
for line in lines[1:]:
    val = float(xgb_predictor.predict(line))
    pred = 0
    if val > 0.6:
        pred = 1
    predictions.append(pred)

In [None]:
# Show the first twenty predicted labels.
predictions[:20]

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# The labels file is read with header=None, so its first line is treated as data
# rather than as column names.
df_test_labels = pd.read_csv('data/xgb/test_labels.csv', header=None)
df_test_labels.head(n=10)

In [None]:
# Rows are true labels, columns are predicted labels. The class order is pinned
# to [0, 1] so the matrix is always 2x2 and row/column 0 is always class 0, even
# if one class never occurs in the labels or the predictions. The first (only)
# column of the labels frame is passed as a 1-D series instead of relying on a
# one-column DataFrame being flattened implicitly.
cm = confusion_matrix(df_test_labels.iloc[:, 0], predictions, labels=[0, 1])
cm

In [None]:
import numpy as np
import seaborn as sns
import matplotlib

# Tick labels in class order, carried over from the original cell.
# NOTE(review): assumes class 0 = ADMIT and class 1 = DISCHARGE — confirm against
# the label encoding used in the training data.
class_names = ['ADMIT', 'DISCHARGE']

# Annotate each cell with its integer count; rows are true labels and columns
# are predicted labels, matching the confusion-matrix layout.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
# The trailing semicolon suppresses the repr of the returned artists in the cell output.
ax.set(title="Confusion Matrix", xlabel="Predicted Label", ylabel="True Label");



### The predictor endpoint will run indefinitely, so delete the endpoint to stop any charges

In [None]:
# Tear down the hosted endpoint (and its endpoint configuration) so it stops accruing charges.
xgb_predictor.delete_endpoint(delete_endpoint_config=True)