# Set up SageMaker Environment

In [None]:
import sys

# Install (or upgrade) imbalanced-learn using pip from the interpreter that runs this kernel.
!{sys.executable} -m pip install imbalanced-learn --upgrade

In [None]:
# SageMaker environment: SDK session, data location and execution role.
import os

import boto3  # AWS SDK for Python
import sagemaker  # SageMaker Python SDK
from sagemaker import get_execution_role

# S3 location of the datasets -- replace the bucket with the one that holds your data.
bucket = "rzoghbi-medium-smote-article-dataset"
subfolder = ""  # key prefix used when listing the bucket contents

sess = sagemaker.Session()

# IAM role the notebook runs with. get_execution_role() is only available when the
# notebook runs inside SageMaker; elsewhere expect a region error.
role = get_execution_role()
print(role)

# Import Required packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sklearn.model_selection import train_test_split



# Import XGBoost model docker image

In [None]:
# Look up the XGBoost container image URI for the region of the current boto3 session.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="latest",
)
display(container)

### Test connection with S3 Bucket, if properly setup, you should see contents of the bucket

In [None]:
# Smoke-test S3 access: print every object key found under the configured prefix.
conn = boto3.client('s3')

# Page through list_objects_v2 so listings longer than one response are not truncated.
# A page without "Contents" (empty bucket/prefix) is treated as an empty listing
# instead of raising KeyError.
contents = [
    obj
    for page in conn.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=subfolder)
    for obj in page.get('Contents', [])
]
if not contents:
    print("No objects found in s3://{}/{}".format(bucket, subfolder))
for f in contents:
    print(f['Key'])

# Model Training

### To demonstrate the performance of the different resampling methods, we will train and evaluate a predictor using the imbalanced dataset, and the same dataset rebalanced with SMOTE, with KMeans SMOTE and with SVM SMOTE

### This means that we will have 5 different (training/validation - test) processes
#### train / validation : imbalanced
#### train / validation : default SMOTE
#### train / validation : KMeans SMOTE
#### train / validation : SVM Smote
#### train / validation : SMOTE ENN

## Model Training with Imbalanced Dataset

#### We will create TrainingInputs that our function can use as pointer to our files in S3

In [None]:
# Pointers to the CSV training and validation data in S3, passed to the estimator as channels.
s3_input_train = TrainingInput(s3_data=f"s3://{bucket}/train/", content_type="csv")
s3_input_validation = TrainingInput(s3_data=f"s3://{bucket}/validation/", content_type="csv")

## Setup_Hyperparameter_Tuning 


We will set up a hyperparameter tuning job for each model training.
This evens out the working conditions of the algorithm, so that each training procedure is run with the best set of hyperparameters found for its dataset.

In [None]:
from sagemaker.image_uris import retrieve

# Image used by the training jobs that the hyperparameter tuning job launches.
training_image = retrieve(framework="xgboost", region=boto3.Session().region_name, version="latest")

# Plain S3 URIs for the low-level (boto3) tuning API. These rebind s3_input_train /
# s3_input_validation from TrainingInput objects to strings; the TrainingInput objects
# are re-created before the estimator is fitted.
# The trailing slash matters: the channels use S3DataType "S3Prefix", which selects every
# key starting with the prefix, so "s3://<bucket>/train" would also pick up the
# train_smote/, train_KMSmote/ and train_svm/ datasets stored in the same bucket.
s3_input_train = "s3://{}/train/".format(bucket)
s3_input_validation = "s3://{}/validation/".format(bucket)

In [None]:
from time import gmtime, strftime, sleep

# Job name carries a UTC day-hour-minute-second suffix.
tuning_job_name = "xgboost-tuningjob-" + strftime("%d-%H-%M-%S", gmtime())

print(tuning_job_name)

# Search space as (name, min, max); the bounds are passed to the API as strings.
_continuous_bounds = [
    ("eta", "0", "1"),
    ("colsample_bylevel", "0.1", "1"),
    ("colsample_bytree", "0.5", "1"),
    ("gamma", "0", "5"),
    ("min_child_weight", "1", "120"),
    ("alpha", "0", "1000"),
    ("subsample", "0.5", "1"),
]
_integer_bounds = [
    ("max_depth", "0", "10"),
    ("max_delta_step", "0", "10"),
]

# Bayesian search maximising validation AUC: 8 training jobs in total, 2 at a time.
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {"MaxValue": upper, "MinValue": lower, "Name": name}
            for name, lower, upper in _continuous_bounds
        ],
        "IntegerParameterRanges": [
            {"MaxValue": upper, "MinValue": lower, "Name": name}
            for name, lower, upper in _integer_bounds
        ],
    },
    "ResourceLimits": {"MaxNumberOfTrainingJobs": 8, "MaxParallelTrainingJobs": 2},
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {"MetricName": "validation:auc", "Type": "Maximize"},
}

In [None]:

# Channel name -> S3 URI for the two input channels of every training job.
_channel_uris = [("train", s3_input_train), ("validation", s3_input_validation)]

# Definition shared by every training job the tuner launches.
training_job_definition = {
    "AlgorithmSpecification": {"TrainingImage": training_image, "TrainingInputMode": "File"},
    # One fully replicated, uncompressed CSV channel per S3 prefix.
    "InputDataConfig": [
        {
            "ChannelName": channel_name,
            "CompressionType": "None",
            "ContentType": "csv",
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": channel_uri,
                }
            },
        }
        for channel_name, channel_uri in _channel_uris
    ],
    "OutputDataConfig": {"S3OutputPath": f"s3://{bucket}/output"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m4.xlarge", "VolumeSizeInGB": 10},
    "RoleArn": role,
    # Hyperparameters held fixed across all trials.
    # NOTE(review): rate_drop and tweedie_variance_power look like leftovers (they appear to
    # belong to the dart booster / tweedie objective, not binary:logistic) -- confirm.
    "StaticHyperParameters": {
        "eval_metric": "auc",
        "num_round": "100",
        "objective": "binary:logistic",
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 43200},  # 12 hours
}

In [None]:
# Low-level SageMaker API client used to create and poll the tuning job.
_boto_session = boto3.Session()
smclient = _boto_session.client("sagemaker")

In [None]:
# Submit the tuning job; the API response is shown as the cell output.
_tuning_job_request = {
    "HyperParameterTuningJobName": tuning_job_name,
    "HyperParameterTuningJobConfig": tuning_job_config,
    "TrainingJobDefinition": training_job_definition,
}
smclient.create_hyper_parameter_tuning_job(**_tuning_job_request)

In [None]:
# Current status of the tuning job, shown as the cell output.
_tuning_job_description = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
_tuning_job_description["HyperParameterTuningJobStatus"]

In [None]:
# Re-run this cell to refresh the status of the hyperparameter tuning job.
tuning_job_result = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

status = tuning_job_result["HyperParameterTuningJobStatus"]
if not status == "Completed":
    print("Reminder: the tuning job has not been completed.")

job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("{} training jobs have completed".format(int(job_count)))

# Objective the tuner optimises, read back from the job description.
objective = tuning_job_result["HyperParameterTuningJobConfig"]["HyperParameterTuningJobObjective"]
objective_name = objective["MetricName"]
is_minimize = not objective["Type"] == "Maximize"

In [None]:
from pprint import pprint

# Show the best trial reported so far, if any trial has reported a result.
_best_training_job = tuning_job_result.get("BestTrainingJob")
if not _best_training_job:
    print("No training jobs have reported results yet.")
else:
    print("Best model found so far:")
    pprint(_best_training_job)

#### Set training parameters and launch training

In [None]:
# Re-create the TrainingInput channels (the tuning cells rebound these names to plain strings).
s3_input_train = TrainingInput(s3_data=f"s3://{bucket}/train/", content_type="csv")
s3_input_validation = TrainingInput(s3_data=f"s3://{bucket}/validation/", content_type="csv")

In [None]:
# New SageMaker session for the estimator created below (rebinds `sess` from the setup cell).
sess = sagemaker.Session()



In [None]:
# Estimator for the XGBoost image on one ml.m4.xlarge instance; output goes to <bucket>/output.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,  # XGBoost image URI retrieved earlier in the notebook
    role=role,  # execution role obtained in the setup cell
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/output",
    sagemaker_session=sess,
)

In [None]:
# Hyperparameters for the model trained on the imbalanced dataset.
# (A stray ")" after the alpha value previously closed the call early and made this
# cell a SyntaxError.)
xgb.set_hyperparameters(
    alpha=16.913041126287006,
    colsample_bylevel=0.9195357961477112,
    colsample_bytree=0.8625951683028819,
    max_depth=6,
    eta=0.3571425509611488,
    gamma=0.3972934045288123,
    min_child_weight=100.38786351113177,
    objective="binary:logistic",
    eval_metric= "auc", 
    num_round=100,
    max_delta_step=5,
    subsample=0.5
)

In [None]:
# Build the estimator, set its hyperparameters and train on the imbalanced dataset.
sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(
    image_uri=container,  # XGBoost image URI retrieved earlier in the notebook
    role=role,  # execution role obtained in the setup cell
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/output",
    sagemaker_session=sess,
)

_xgb_hyperparameters = {
    "alpha": 16.913041126287006,
    "colsample_bylevel": 0.9195357961477112,
    "colsample_bytree": 0.8625951683028819,
    "max_depth": 6,
    "eta": 0.3571425509611488,
    "gamma": 0.3972934045288123,
    "min_child_weight": 100.38786351113177,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "num_round": 100,
    "max_delta_step": 5,
    "subsample": 0.5,
}
xgb.set_hyperparameters(**_xgb_hyperparameters)

# Channel names map to the inputs created in the cell above.
_xgb_channels = {"train": s3_input_train, "validation": s3_input_validation}
xgb.fit(_xgb_channels)

### Review training Metrics

In [None]:
%matplotlib inline
from sagemaker.analytics import TrainingJobAnalytics

training_job_name = xgb._current_job_name
metric_name = ['train:auc','validation:auc']

metrics_dataframe = TrainingJobAnalytics(training_job_name=training_job_name,metric_names=metric_name).dataframe()
#plt = metrics_dataframe.plot(kind='line', figsize=(12,5), x='timestamp', y='value', style='b.', legend=False)
#plt.set_ylabel(metric_name);
metrics_dataframe

### Review performance on Test Dataset

#### With our model trained, let's deploy it to a hosted endpoint

In [None]:
# Deploy the trained model to an endpoint; requests are serialized as CSV.
# NOTE(review): no endpoint clean-up is visible in this part of the notebook -- confirm one exists.
_csv_serializer = CSVSerializer()
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=_csv_serializer,
)

In [None]:
# Read the test set from S3 (the CSV has no header row) and display it.
data_key = 'test/test.csv'
data_location = f's3://{bucket}/{data_key}'

test_data = pd.read_csv(data_location, header=None, sep=',')
test_data

In [None]:
# Every column except the first, as a NumPy array (the first column is used as the label below).
test_data.to_numpy()[:, 1:]

In [None]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` against a deployed endpoint in batches and return the predictions.

    Args:
        data: NumPy array of feature rows; it is split along its first axis.
        rows: Upper bound on rows per request. The data is split into
            ``int(len(data) / rows + 1)`` nearly equal chunks.
        predictor: Object whose ``predict(chunk)`` returns UTF-8 bytes holding
            comma-separated scores. Defaults to the notebook-level ``xgb_predictor``,
            so existing ``predict(data)`` calls keep working.

    Returns:
        1-D float array with the parsed predictions in input order; empty when
        ``data`` has no rows.
    """
    if predictor is None:
        # Looked up at call time, so the function can be defined before the endpoint exists.
        predictor = xgb_predictor

    n_chunks = int(data.shape[0] / float(rows) + 1)
    responses = [
        predictor.predict(chunk).decode("utf-8")
        for chunk in np.array_split(data, n_chunks)
        if chunk.shape[0]  # never send an empty request to the endpoint
    ]
    if not responses:
        return np.array([])
    return np.fromstring(",".join(responses), sep=",")


# Score the test features (all columns after the first) with the deployed endpoint.
predictions = predict(test_data.to_numpy()[:, 1:])

In [None]:
# Largest value among the predictions.
predictions.max()

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

# The first column of the test set is used as the ground-truth label.
y_test = test_data.iloc[:, 0]
# Scores above 0.03 are counted as the positive class.
# NOTE(review): the confusion-matrix cell rounds the scores (0.5 cut-off) instead --
# confirm which threshold is intended.
y_pred = (predictions > 0.03).astype(int)

precision, recall, fscore, support = score(y_test, y_pred)

for _metric_label, _metric_values in (
    ("precision", precision),
    ("recall", recall),
    ("fscore", fscore),
    ("support", support),
):
    print(f"{_metric_label}: {_metric_values}")

In [None]:
import seaborn as sn

# Confusion matrix of the actual labels against the predictions rounded to the nearest integer.
_actual_labels = test_data.iloc[:, 0]
_rounded_predictions = np.round(predictions)
df_cm = pd.crosstab(
    _actual_labels,
    _rounded_predictions,
    rownames=["actual"],
    colnames=["predictions"],
)

# Annotated heatmap of the counts.
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g', annot_kws={"size": 16})

## Model Training with Default-SMOTE Resampled Dataset

#### Set training parameters and launch training

In [None]:
# Pointers to the SMOTE-resampled CSV training and validation data in S3.
s3_input_train_smote = TrainingInput(s3_data=f"s3://{bucket}/train_smote/", content_type="csv")
s3_input_validation_smote = TrainingInput(s3_data=f"s3://{bucket}/validation_smote/", content_type="csv")

## Setup_Hyperparameter_Tuning 


In [None]:
from sagemaker.image_uris import retrieve

# Image used by the training jobs that the hyperparameter tuning job launches.
training_image = retrieve(framework="xgboost", region=boto3.Session().region_name, version="latest")

# Plain S3 URIs of the SMOTE datasets for the low-level (boto3) tuning API.
# Both end with "/" so the "S3Prefix" channels match exactly one folder each; the
# ignored `content_type` keyword previously passed to str.format has been dropped.
s3_input_train = "s3://{}/train_smote/".format(bucket)
s3_input_validation = "s3://{}/validation_smote/".format(bucket)

In [None]:
from time import gmtime, strftime, sleep

# Job name carries a UTC day-hour-minute-second suffix.
tuning_job_name = "xgboost-tuningjob-" + strftime("%d-%H-%M-%S", gmtime())

print(tuning_job_name)

# Search space as (name, min, max); the bounds are passed to the API as strings.
_continuous_bounds = [
    ("eta", "0", "1"),
    ("colsample_bylevel", "0.1", "1"),
    ("colsample_bytree", "0.5", "1"),
    ("gamma", "0", "5"),
    ("min_child_weight", "1", "120"),
    ("alpha", "0", "1000"),
    ("subsample", "0.5", "1"),
]
_integer_bounds = [
    ("max_depth", "0", "10"),
    ("max_delta_step", "0", "10"),
]

# Bayesian search maximising validation AUC: 8 training jobs in total, 2 at a time.
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {"MaxValue": upper, "MinValue": lower, "Name": name}
            for name, lower, upper in _continuous_bounds
        ],
        "IntegerParameterRanges": [
            {"MaxValue": upper, "MinValue": lower, "Name": name}
            for name, lower, upper in _integer_bounds
        ],
    },
    "ResourceLimits": {"MaxNumberOfTrainingJobs": 8, "MaxParallelTrainingJobs": 2},
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {"MetricName": "validation:auc", "Type": "Maximize"},
}

In [None]:

# Channel name -> S3 URI for the two input channels of every training job.
_channel_uris = [("train", s3_input_train), ("validation", s3_input_validation)]

# Definition shared by every training job the tuner launches.
training_job_definition = {
    "AlgorithmSpecification": {"TrainingImage": training_image, "TrainingInputMode": "File"},
    # One fully replicated, uncompressed CSV channel per S3 prefix.
    "InputDataConfig": [
        {
            "ChannelName": channel_name,
            "CompressionType": "None",
            "ContentType": "csv",
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": channel_uri,
                }
            },
        }
        for channel_name, channel_uri in _channel_uris
    ],
    "OutputDataConfig": {"S3OutputPath": f"s3://{bucket}/output"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m4.xlarge", "VolumeSizeInGB": 10},
    "RoleArn": role,
    # Hyperparameters held fixed across all trials.
    # NOTE(review): rate_drop and tweedie_variance_power look like leftovers for
    # binary:logistic -- confirm.
    "StaticHyperParameters": {
        "eval_metric": "auc",
        "num_round": "100",
        "objective": "binary:logistic",
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 43200},  # 12 hours
}

In [None]:
# Low-level SageMaker API client used to create and poll the tuning job.
_boto_session = boto3.Session()
smclient = _boto_session.client("sagemaker")

In [None]:
# Submit the SMOTE tuning job; the API response is shown as the cell output.
_tuning_job_request = {
    "HyperParameterTuningJobName": tuning_job_name,
    "HyperParameterTuningJobConfig": tuning_job_config,
    "TrainingJobDefinition": training_job_definition,
}
smclient.create_hyper_parameter_tuning_job(**_tuning_job_request)

In [None]:
# Current status of the tuning job, shown as the cell output.
_tuning_job_description = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
_tuning_job_description["HyperParameterTuningJobStatus"]

In [None]:
# Re-run this cell to refresh the status of the hyperparameter tuning job.
tuning_job_result = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

status = tuning_job_result["HyperParameterTuningJobStatus"]
if not status == "Completed":
    print("Reminder: the tuning job has not been completed.")

job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("{} training jobs have completed".format(int(job_count)))

# Objective the tuner optimises, read back from the job description.
objective = tuning_job_result["HyperParameterTuningJobConfig"]["HyperParameterTuningJobObjective"]
objective_name = objective["MetricName"]
is_minimize = not objective["Type"] == "Maximize"

In [None]:
from pprint import pprint

# Show the best trial reported so far, if any trial has reported a result.
_best_training_job = tuning_job_result.get("BestTrainingJob")
if not _best_training_job:
    print("No training jobs have reported results yet.")
else:
    print("Best model found so far:")
    pprint(_best_training_job)

In [None]:
# Pointers to the SMOTE-resampled CSV training and validation data in S3.
s3_input_train_smote = TrainingInput(s3_data=f"s3://{bucket}/train_smote/", content_type="csv")
s3_input_validation_smote = TrainingInput(s3_data=f"s3://{bucket}/validation_smote/", content_type="csv")

In [None]:
# Build the estimator, set its hyperparameters and train on the SMOTE-resampled dataset.
sess = sagemaker.Session()

xgb_smote = sagemaker.estimator.Estimator(
    image_uri=container,  # XGBoost image URI retrieved earlier in the notebook
    role=role,  # execution role obtained in the setup cell
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/output",
    sagemaker_session=sess,
)

_xgb_smote_hyperparameters = {
    "alpha": 40.216409796811405,
    "colsample_bylevel": 0.7849005093037494,
    "min_child_weight": 70.10162823835203,
    "colsample_bytree": 0.5625572571583626,
    "max_depth": 9,
    "gamma": 2.3464688151496302,
    "max_delta_step": 2,
    "eta": 0.42230417721555225,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "num_round": 100,
    "subsample": 0.6261665126373279,
}
xgb_smote.set_hyperparameters(**_xgb_smote_hyperparameters)

_xgb_smote_channels = {"train": s3_input_train_smote, "validation": s3_input_validation_smote}
xgb_smote.fit(_xgb_smote_channels)

In [None]:
%matplotlib inline
from sagemaker.analytics import TrainingJobAnalytics

training_job_name = xgb_smote._current_job_name
metric_name = ['train:auc','validation:auc']

metrics_dataframe = TrainingJobAnalytics(training_job_name=training_job_name,metric_names=metric_name).dataframe()
#plt = metrics_dataframe.plot(kind='line', figsize=(12,5), x='timestamp', y='value', style='b.', legend=False)
#plt.set_ylabel(metric_name);
metrics_dataframe

#### With our model trained, let's deploy it to a hosted endpoint

In [None]:
# Deploy the SMOTE-trained model to an endpoint; requests are serialized as CSV.
# NOTE(review): no endpoint clean-up is visible in this part of the notebook -- confirm one exists.
_csv_serializer = CSVSerializer()
xgb_predictor_smote = xgb_smote.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=_csv_serializer,
)

In [None]:
# Read the test set from S3 (the CSV has no header row) and display it.
data_key = 'test/test.csv'
data_location = f's3://{bucket}/{data_key}'

test_data = pd.read_csv(data_location, header=None, sep=',')
test_data

In [None]:
# Every column except the first, as a NumPy array (the first column is used as the label below).
test_data.to_numpy()[:, 1:]

In [None]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` against a deployed endpoint in batches and return the predictions.

    Args:
        data: NumPy array of feature rows; it is split along its first axis.
        rows: Upper bound on rows per request. The data is split into
            ``int(len(data) / rows + 1)`` nearly equal chunks.
        predictor: Object whose ``predict(chunk)`` returns UTF-8 bytes holding
            comma-separated scores. Defaults to the notebook-level
            ``xgb_predictor_smote``, so existing ``predict(data)`` calls keep working.

    Returns:
        1-D float array with the parsed predictions in input order; empty when
        ``data`` has no rows.
    """
    if predictor is None:
        # Looked up at call time, so the function can be defined before the endpoint exists.
        predictor = xgb_predictor_smote

    n_chunks = int(data.shape[0] / float(rows) + 1)
    responses = [
        predictor.predict(chunk).decode("utf-8")
        for chunk in np.array_split(data, n_chunks)
        if chunk.shape[0]  # never send an empty request to the endpoint
    ]
    if not responses:
        return np.array([])
    return np.fromstring(",".join(responses), sep=",")


# Score the test features (all columns after the first) with the SMOTE model's endpoint.
predictions_smote = predict(test_data.to_numpy()[:, 1:])

In [None]:
# Print the array of predictions returned for the test set.
print(predictions_smote)


In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

# The first column of the test set is used as the ground-truth label.
y_test = test_data.iloc[:, 0]
# Scores above 0.03 are counted as the positive class.
# NOTE(review): the confusion-matrix cell rounds the scores (0.5 cut-off) instead --
# confirm which threshold is intended.
y_pred = (predictions_smote > 0.03).astype(int)

precision, recall, fscore, support = score(y_test, y_pred)

for _metric_label, _metric_values in (
    ("precision", precision),
    ("recall", recall),
    ("fscore", fscore),
    ("support", support),
):
    print(f"{_metric_label}: {_metric_values}")

In [None]:
# Predictions rounded to the nearest integer.
# NOTE(review): `wtf` is not read anywhere in the visible part of the notebook, and the next
# cell recomputes np.round(predictions_smote) -- confirm it is unused, then drop or rename it.
wtf = np.round(predictions_smote)

In [None]:
import seaborn as sn

# Confusion matrix of the actual labels against the predictions rounded to the nearest integer.
_actual_labels = test_data.iloc[:, 0]
_rounded_predictions = np.round(predictions_smote)
df_cm_smote = pd.crosstab(
    _actual_labels,
    _rounded_predictions,
    rownames=["actual"],
    colnames=["predictions_smote"],
)

# Annotated heatmap of the counts.
sn.set(font_scale=1.4)
sn.heatmap(df_cm_smote, annot=True, fmt='g', annot_kws={"size": 16})

## Model Training with KMeans-SMOTE Resampled Dataset

## Setup_Hyperparameter_Tuning 


In [None]:
from sagemaker.image_uris import retrieve

# Image used by the training jobs that the hyperparameter tuning job launches.
training_image = retrieve(framework="xgboost", region=boto3.Session().region_name, version="latest")

# Plain S3 URIs of the KMeans-SMOTE datasets for the low-level (boto3) tuning API.
# Both end with "/" so the "S3Prefix" channels match exactly one folder each; the
# ignored `content_type` keyword previously passed to str.format has been dropped.
s3_input_train = "s3://{}/train_KMSmote/".format(bucket)
s3_input_validation = "s3://{}/validation_KMSmote/".format(bucket)

In [None]:
from time import gmtime, strftime, sleep

# Job name carries a UTC day-hour-minute-second suffix.
tuning_job_name = "xgboost-tuningjob-" + strftime("%d-%H-%M-%S", gmtime())

print(tuning_job_name)

# Search space as (name, min, max); the bounds are passed to the API as strings.
_continuous_bounds = [
    ("eta", "0", "1"),
    ("colsample_bylevel", "0.1", "1"),
    ("colsample_bytree", "0.5", "1"),
    ("gamma", "0", "5"),
    ("min_child_weight", "1", "120"),
    ("alpha", "0", "1000"),
    ("subsample", "0.5", "1"),
]
_integer_bounds = [
    ("max_depth", "0", "10"),
    ("max_delta_step", "0", "10"),
]

# Bayesian search maximising validation AUC: 8 training jobs in total, 2 at a time.
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {"MaxValue": upper, "MinValue": lower, "Name": name}
            for name, lower, upper in _continuous_bounds
        ],
        "IntegerParameterRanges": [
            {"MaxValue": upper, "MinValue": lower, "Name": name}
            for name, lower, upper in _integer_bounds
        ],
    },
    "ResourceLimits": {"MaxNumberOfTrainingJobs": 8, "MaxParallelTrainingJobs": 2},
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {"MetricName": "validation:auc", "Type": "Maximize"},
}

In [None]:

# Channel name -> S3 URI for the two input channels of every training job.
_channel_uris = [("train", s3_input_train), ("validation", s3_input_validation)]

# Definition shared by every training job the tuner launches.
training_job_definition = {
    "AlgorithmSpecification": {"TrainingImage": training_image, "TrainingInputMode": "File"},
    # One fully replicated, uncompressed CSV channel per S3 prefix.
    "InputDataConfig": [
        {
            "ChannelName": channel_name,
            "CompressionType": "None",
            "ContentType": "csv",
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": channel_uri,
                }
            },
        }
        for channel_name, channel_uri in _channel_uris
    ],
    "OutputDataConfig": {"S3OutputPath": f"s3://{bucket}/output"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m4.xlarge", "VolumeSizeInGB": 10},
    "RoleArn": role,
    # Hyperparameters held fixed across all trials.
    # NOTE(review): rate_drop and tweedie_variance_power look like leftovers for
    # binary:logistic -- confirm.
    "StaticHyperParameters": {
        "eval_metric": "auc",
        "num_round": "100",
        "objective": "binary:logistic",
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 43200},  # 12 hours
}

In [None]:
# Low-level SageMaker API client used to create and poll the tuning job.
_boto_session = boto3.Session()
smclient = _boto_session.client("sagemaker")

In [None]:
# Submit the KMeans-SMOTE tuning job; the API response is shown as the cell output.
_tuning_job_request = {
    "HyperParameterTuningJobName": tuning_job_name,
    "HyperParameterTuningJobConfig": tuning_job_config,
    "TrainingJobDefinition": training_job_definition,
}
smclient.create_hyper_parameter_tuning_job(**_tuning_job_request)

In [None]:
# Current status of the tuning job, shown as the cell output.
_tuning_job_description = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
_tuning_job_description["HyperParameterTuningJobStatus"]

In [None]:
# Re-run this cell to refresh the status of the hyperparameter tuning job.
tuning_job_result = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

status = tuning_job_result["HyperParameterTuningJobStatus"]
if not status == "Completed":
    print("Reminder: the tuning job has not been completed.")

job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("{} training jobs have completed".format(int(job_count)))

# Objective the tuner optimises, read back from the job description.
objective = tuning_job_result["HyperParameterTuningJobConfig"]["HyperParameterTuningJobObjective"]
objective_name = objective["MetricName"]
is_minimize = not objective["Type"] == "Maximize"

In [None]:
from pprint import pprint

# Show the best trial reported so far, if any trial has reported a result.
_best_training_job = tuning_job_result.get("BestTrainingJob")
if not _best_training_job:
    print("No training jobs have reported results yet.")
else:
    print("Best model found so far:")
    pprint(_best_training_job)

In [None]:
# Pointers to the KMeans-SMOTE-resampled CSV training and validation data in S3.
s3_input_train_KMSmote = TrainingInput(s3_data=f"s3://{bucket}/train_KMSmote/", content_type="csv")
s3_input_validation_KMSmote = TrainingInput(s3_data=f"s3://{bucket}/validation_KMSmote/", content_type="csv")

In [None]:
# Build the estimator, set its hyperparameters and train on the KMeans-SMOTE dataset.
sess = sagemaker.Session()

xgb_KMSmote = sagemaker.estimator.Estimator(
    image_uri=container,  # XGBoost image URI retrieved earlier in the notebook
    role=role,  # execution role obtained in the setup cell
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/output",
    sagemaker_session=sess,
)

_xgb_KMSmote_hyperparameters = {
    "alpha": 0.0,
    "max_depth": 6,
    "colsample_bylevel": 0.9821905551514846,
    "colsample_bytree": 0.7352252202660664,
    "eta": 0.7354161456910047,
    "gamma": 3.8526330925960295,
    "max_delta_step": 9,
    "min_child_weight": 93.9021661490582,
    "subsample": 0.5544205615337394,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "num_round": 100,
}
xgb_KMSmote.set_hyperparameters(**_xgb_KMSmote_hyperparameters)

_xgb_KMSmote_channels = {"train": s3_input_train_KMSmote, "validation": s3_input_validation_KMSmote}
xgb_KMSmote.fit(_xgb_KMSmote_channels)

In [None]:
%matplotlib inline
from sagemaker.analytics import TrainingJobAnalytics

training_job_name = xgb_KMSmote._current_job_name
metric_name = ['train:auc','validation:auc']

metrics_dataframe = TrainingJobAnalytics(training_job_name=training_job_name,metric_names=metric_name).dataframe()
#plt = metrics_dataframe.plot(kind='line', figsize=(12,5), x='timestamp', y='value', style='b.', legend=False)
#plt.set_ylabel(metric_name);
metrics_dataframe

#### With our model trained, let's deploy it to a hosted endpoint

In [None]:
# Deploy the KMeans-SMOTE-trained model to an endpoint; requests are serialized as CSV.
# NOTE(review): no endpoint clean-up is visible in this part of the notebook -- confirm one exists.
_csv_serializer = CSVSerializer()
xgb_predictor_KMSmote = xgb_KMSmote.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=_csv_serializer,
)

In [None]:
# Read the test set from S3 (the CSV has no header row) and display it.
data_key = 'test/test.csv'
data_location = f's3://{bucket}/{data_key}'

test_data = pd.read_csv(data_location, header=None, sep=',')
test_data

In [None]:
# Every column except the first, as a NumPy array (the first column is used as the label below).
test_data.to_numpy()[:, 1:]

In [None]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` against a deployed endpoint in batches and return the predictions.

    Args:
        data: NumPy array of feature rows; it is split along its first axis.
        rows: Upper bound on rows per request. The data is split into
            ``int(len(data) / rows + 1)`` nearly equal chunks.
        predictor: Object whose ``predict(chunk)`` returns UTF-8 bytes holding
            comma-separated scores. Defaults to the notebook-level
            ``xgb_predictor_KMSmote``, so existing ``predict(data)`` calls keep working.

    Returns:
        1-D float array with the parsed predictions in input order; empty when
        ``data`` has no rows.
    """
    if predictor is None:
        # Looked up at call time, so the function can be defined before the endpoint exists.
        predictor = xgb_predictor_KMSmote

    n_chunks = int(data.shape[0] / float(rows) + 1)
    responses = [
        predictor.predict(chunk).decode("utf-8")
        for chunk in np.array_split(data, n_chunks)
        if chunk.shape[0]  # never send an empty request to the endpoint
    ]
    if not responses:
        return np.array([])
    return np.fromstring(",".join(responses), sep=",")


# Score the test features (all columns after the first) with the KMeans-SMOTE model's endpoint.
predictions_KMSmote = predict(test_data.to_numpy()[:, 1:])

In [None]:
# Print the array of predictions returned for the test set.
print(predictions_KMSmote)


In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

# The first column of the test set is used as the ground-truth label.
y_test = test_data.iloc[:, 0]
# Scores above 0.03 are counted as the positive class.
# NOTE(review): the confusion-matrix cell rounds the scores (0.5 cut-off) instead --
# confirm which threshold is intended.
y_pred = (predictions_KMSmote > 0.03).astype(int)

precision, recall, fscore, support = score(y_test, y_pred)

for _metric_label, _metric_values in (
    ("precision", precision),
    ("recall", recall),
    ("fscore", fscore),
    ("support", support),
):
    print(f"{_metric_label}: {_metric_values}")

In [None]:
import seaborn as sn

# Confusion matrix of the actual labels against the predictions rounded to the nearest integer.
_actual_labels = test_data.iloc[:, 0]
_rounded_predictions = np.round(predictions_KMSmote)
df_cm_KMSmote = pd.crosstab(
    _actual_labels,
    _rounded_predictions,
    rownames=["actual"],
    colnames=["predictions_KMSmote"],
)

# Annotated heatmap of the counts.
sn.set(font_scale=1.4)
sn.heatmap(df_cm_KMSmote, annot=True, fmt='g', annot_kws={"size": 16})

## Model Training with SVM-SMOTE Resampled Dataset

## Setup_Hyperparameter_Tuning 


In [None]:
from sagemaker.image_uris import retrieve

# Image used by the training jobs that the hyperparameter tuning job launches.
training_image = retrieve(framework="xgboost", region=boto3.Session().region_name, version="latest")

# Plain S3 URIs of the SVM-SMOTE datasets for the low-level (boto3) tuning API.
# Both end with "/" so the "S3Prefix" channels match exactly one folder each; the
# ignored `content_type` keyword previously passed to str.format has been dropped.
s3_input_train = "s3://{}/train_svm/".format(bucket)
s3_input_validation = "s3://{}/validation_svm/".format(bucket)

In [None]:
from time import gmtime, strftime, sleep

# Job name carries a UTC day-hour-minute-second suffix.
tuning_job_name = "xgboost-tuningjob-" + strftime("%d-%H-%M-%S", gmtime())

print(tuning_job_name)

# Search space as (name, min, max); the bounds are passed to the API as strings.
_continuous_bounds = [
    ("eta", "0", "1"),
    ("colsample_bylevel", "0.1", "1"),
    ("colsample_bytree", "0.5", "1"),
    ("gamma", "0", "5"),
    ("min_child_weight", "1", "120"),
    ("alpha", "0", "1000"),
    ("subsample", "0.5", "1"),
]
_integer_bounds = [
    ("max_depth", "0", "10"),
    ("max_delta_step", "0", "10"),
]

# Bayesian search maximising validation AUC: 8 training jobs in total, 2 at a time.
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {"MaxValue": upper, "MinValue": lower, "Name": name}
            for name, lower, upper in _continuous_bounds
        ],
        "IntegerParameterRanges": [
            {"MaxValue": upper, "MinValue": lower, "Name": name}
            for name, lower, upper in _integer_bounds
        ],
    },
    "ResourceLimits": {"MaxNumberOfTrainingJobs": 8, "MaxParallelTrainingJobs": 2},
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {"MetricName": "validation:auc", "Type": "Maximize"},
}

In [None]:

# Channel name -> S3 URI for the two input channels of every training job.
_channel_uris = [("train", s3_input_train), ("validation", s3_input_validation)]

# Definition shared by every training job the tuner launches.
training_job_definition = {
    "AlgorithmSpecification": {"TrainingImage": training_image, "TrainingInputMode": "File"},
    # One fully replicated, uncompressed CSV channel per S3 prefix.
    "InputDataConfig": [
        {
            "ChannelName": channel_name,
            "CompressionType": "None",
            "ContentType": "csv",
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": channel_uri,
                }
            },
        }
        for channel_name, channel_uri in _channel_uris
    ],
    "OutputDataConfig": {"S3OutputPath": f"s3://{bucket}/output"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m4.xlarge", "VolumeSizeInGB": 10},
    "RoleArn": role,
    # Hyperparameters held fixed across all trials.
    # NOTE(review): rate_drop and tweedie_variance_power look like leftovers for
    # binary:logistic -- confirm.
    "StaticHyperParameters": {
        "eval_metric": "auc",
        "num_round": "100",
        "objective": "binary:logistic",
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 43200},  # 12 hours
}

In [None]:
# Low-level SageMaker API client used to create and poll the tuning job.
_boto_session = boto3.Session()
smclient = _boto_session.client("sagemaker")

In [None]:
# Submit the SVM-SMOTE tuning job; the API response is shown as the cell output.
_tuning_job_request = {
    "HyperParameterTuningJobName": tuning_job_name,
    "HyperParameterTuningJobConfig": tuning_job_config,
    "TrainingJobDefinition": training_job_definition,
}
smclient.create_hyper_parameter_tuning_job(**_tuning_job_request)

In [None]:
# Current status of the tuning job, shown as the cell output.
_tuning_job_description = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
_tuning_job_description["HyperParameterTuningJobStatus"]

In [None]:
# Re-run this cell to refresh the status of the hyperparameter tuning job.
tuning_job_result = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

status = tuning_job_result["HyperParameterTuningJobStatus"]
if not status == "Completed":
    print("Reminder: the tuning job has not been completed.")

job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("{} training jobs have completed".format(int(job_count)))

# Objective the tuner optimises, read back from the job description.
objective = tuning_job_result["HyperParameterTuningJobConfig"]["HyperParameterTuningJobObjective"]
objective_name = objective["MetricName"]
is_minimize = not objective["Type"] == "Maximize"

In [None]:
from pprint import pprint

# Pretty-print the best training job reported so far, if the tuner has one yet.
_best_training_job = tuning_job_result.get("BestTrainingJob", None)
if not _best_training_job:
    print("No training jobs have reported results yet.")
else:
    print("Best model found so far:")
    pprint(_best_training_job)

In [None]:
# Declare the S3 prefixes of the "svm" resampled CSV data as SageMaker input channels.
_svm_train_uri = "s3://{}/train_svm/".format(bucket)
_svm_validation_uri = "s3://{}/validation_svm/".format(bucket)

# Training channel
s3_input_train_svm = TrainingInput(s3_data=_svm_train_uri, content_type="csv")

# Validation channel
s3_input_validation_svm = TrainingInput(s3_data=_svm_validation_uri, content_type="csv")

In [None]:
# Train an XGBoost estimator on the "svm" resampled data with fixed hyperparameters.
sess = sagemaker.Session()

# Hyperparameters passed to the XGBoost container.
_svm_hyperparameters = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "num_round": 100,
    "alpha": 458.3444684628589,
    "colsample_bylevel": 0.8950713179635998,
    "colsample_bytree": 0.6821249355523158,
    "eta": 0.08754223069900237,
    "gamma": 1.8978341258783693,
    "max_delta_step": 10,
    "max_depth": 10,
    # NOTE(review): identical to the gamma value above -- possibly a copy-paste
    # slip; confirm against the tuning job's best training job.
    "min_child_weight": 1.8978341258783693,
}

xgb_svm = sagemaker.estimator.Estimator(
    container,  # XGBoost image URI retrieved near the top of the notebook
    role,  # SageMaker execution role obtained from get_execution_role()
    sagemaker_session=sess,
    output_path="s3://{}/output".format(bucket),
    instance_type="ml.m4.xlarge",
    instance_count=1,
)
xgb_svm.set_hyperparameters(**_svm_hyperparameters)

# Launch the training job with the train/validation channels.
_svm_channels = {"train": s3_input_train_svm, "validation": s3_input_validation_svm}
xgb_svm.fit(_svm_channels)

In [None]:
%matplotlib inline
from sagemaker.analytics import TrainingJobAnalytics

training_job_name = xgb_svm._current_job_name
metric_name = ['train:auc','validation:auc']

metrics_dataframe = TrainingJobAnalytics(training_job_name=training_job_name,metric_names=metric_name).dataframe()
#plt = metrics_dataframe.plot(kind='line', figsize=(12,5), x='timestamp', y='value', style='b.', legend=False)
#plt.set_ylabel(metric_name);
metrics_dataframe

#### With our model trained, let's deploy it to a hosted endpoint

In [None]:
# Deploy the trained model to a hosted endpoint whose predictor is configured
# with a CSVSerializer.
# NOTE(review): no endpoint clean-up is visible in this part of the notebook --
# confirm the endpoint is deleted once evaluation is done.
_svm_endpoint_settings = dict(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=CSVSerializer(),
)
xgb_predictor_svm = xgb_svm.deploy(**_svm_endpoint_settings)

In [None]:
# Read the test CSV directly from S3; the file is parsed without a header row.
data_key = 'test/test.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

test_data = pd.read_csv(
    data_location,
    header=None,
    sep=',',
)
test_data

In [None]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` on a hosted endpoint in batches and return the scores.

    Args:
        data: 2-D NumPy array with one row per record to score.
        rows: Maximum number of rows sent to the endpoint per request.
        predictor: Object whose ``predict(batch)`` returns the scores as
            comma-separated UTF-8 bytes. Defaults to the notebook-level
            ``xgb_predictor_svm`` endpoint.

    Returns:
        1-D NumPy float array with every returned score, in request order.
    """
    if predictor is None:
        # Resolved at call time so the default follows the notebook global.
        predictor = xgb_predictor_svm

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score: do not send an empty payload to the endpoint.
        return np.empty(0)

    # Ceiling division: just enough batches that none exceeds `rows` and none
    # is empty.
    n_batches = int(-(-n_rows // rows))
    responses = [
        predictor.predict(batch).decode("utf-8")
        for batch in np.array_split(data, n_batches)
    ]
    # Join once, then parse the comma-separated scores into floats.
    return np.fromstring(",".join(responses), sep=",")


# Column 0 is excluded here (the metrics cell uses it as the label); the
# remaining columns are sent to the endpoint as features.
_svm_features = test_data.to_numpy()[:, 1:]
predictions_svm = predict(_svm_features)

In [None]:
# Show the raw scores that predict() returned for the test set.
print(predictions_svm)


In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

# True labels: column 0 of the test set.
y_test = test_data.iloc[:, 0]
# Scores above 0.03 are labelled 1, everything else 0.
# NOTE(review): the confusion-matrix cell that follows rounds the scores
# (0.5 cut-off) instead, so its counts do not correspond to these metrics --
# confirm which threshold is intended.
y_pred = (predictions_svm > 0.03).astype(int)

precision, recall, fscore, support = score(y_test, y_pred)

# Per-class precision / recall / F-score / support, one line each.
for _template, _values in (
    ('precision: {}', precision),
    ('recall: {}', recall),
    ('fscore: {}', fscore),
    ('support: {}', support),
):
    print(_template.format(_values))

In [None]:
# Confusion matrix: true labels (column 0) against the scores rounded to 0/1.
# NOTE(review): rounding is a 0.5 cut-off, whereas the metrics cell above
# thresholds at 0.03 -- confirm which one this heatmap should reflect.
_actual_svm = test_data.iloc[:, 0]
_rounded_svm = np.round(predictions_svm)
df_cm_svm = pd.crosstab(
    index=_actual_svm,
    columns=_rounded_svm,
    rownames=["actual"],
    colnames=["predictions_svm"],
)

import seaborn as sn

# Annotated heatmap of the counts ('g' format avoids scientific notation).
sn.set(font_scale=1.4)
sn.heatmap(df_cm_svm, fmt='g', annot=True, annot_kws={"size": 16})

## Model Training with SMOTE ENN Resampled Dataset

In [None]:
from sagemaker.image_uris import retrieve

# XGBoost training image for the current region, used by the tuning job definition.
training_image = retrieve(framework="xgboost", region=boto3.Session().region_name, version="latest")

# S3 prefixes holding the SMOTE ENN resampled data. The content type is not
# part of the URI (str.format silently ignores unused keyword arguments); it is
# declared per channel in the training job definition. Both prefixes end in "/"
# so they only match objects inside the intended folder.
s3_input_train = "s3://{}/train_SmoteENN/".format(bucket)
s3_input_validation = "s3://{}/validation_SmoteENN/".format(bucket)

In [None]:
from time import gmtime, strftime, sleep

# Tuning job name: fixed prefix plus a UTC day-hour-minute-second stamp.
_job_stamp = strftime("%d-%H-%M-%S", gmtime())
tuning_job_name = "xgboost-tuningjob-" + _job_stamp

print(tuning_job_name)

# Search space as (name, min, max); the API expects the bounds as strings.
_continuous_bounds = (
    ("eta", "0", "1"),
    ("colsample_bylevel", "0.1", "1"),
    ("colsample_bytree", "0.5", "1"),
    ("gamma", "0", "5"),
    ("min_child_weight", "1", "120"),
    ("alpha", "0", "1000"),
    ("subsample", "0.5", "1"),
)
_integer_bounds = (
    ("max_depth", "0", "10"),
    ("max_delta_step", "0", "10"),
)

# Bayesian search maximising validation AUC: 8 training jobs in total, 2 at a time.
tuning_job_config = {
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {"MetricName": "validation:auc", "Type": "Maximize"},
    "ResourceLimits": {"MaxNumberOfTrainingJobs": 8, "MaxParallelTrainingJobs": 2},
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {"Name": _name, "MinValue": _low, "MaxValue": _high}
            for _name, _low, _high in _continuous_bounds
        ],
        "IntegerParameterRanges": [
            {"Name": _name, "MinValue": _low, "MaxValue": _high}
            for _name, _low, _high in _integer_bounds
        ],
    },
}

In [None]:

# Template for the training jobs launched by the tuner: image, input channels,
# output location, compute resources, role and the hyperparameters held fixed.
_csv_channels = (
    ("train", s3_input_train),
    ("validation", s3_input_validation),
)

training_job_definition = {
    "AlgorithmSpecification": {"TrainingImage": training_image, "TrainingInputMode": "File"},
    "RoleArn": role,
    # One uncompressed CSV channel per entry, fully replicated from its S3 prefix.
    "InputDataConfig": [
        {
            "ChannelName": _channel_name,
            "CompressionType": "None",
            "ContentType": "csv",
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": _channel_uri,
                }
            },
        }
        for _channel_name, _channel_uri in _csv_channels
    ],
    "OutputDataConfig": {"S3OutputPath": "s3://{}/output".format(bucket)},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m4.xlarge", "VolumeSizeInGB": 10},
    # NOTE(review): rate_drop and tweedie_variance_power look unrelated to a
    # binary:logistic objective -- confirm they are needed.
    "StaticHyperParameters": {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "num_round": "100",
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4",
    },
    # Upper bound on a single training job's runtime: 43200 s (12 hours).
    "StoppingCondition": {"MaxRuntimeInSeconds": 43200},
}

In [None]:
# Low-level boto3 client for the SageMaker API; the following cells use it to
# create and poll the hyperparameter tuning job.
_sm_boto_session = boto3.Session()
smclient = _sm_boto_session.client("sagemaker")

In [None]:
# Submit the tuning job; the API response is displayed as the cell output.
_tuning_request = dict(
    HyperParameterTuningJobName=tuning_job_name,
    HyperParameterTuningJobConfig=tuning_job_config,
    TrainingJobDefinition=training_job_definition,
)
smclient.create_hyper_parameter_tuning_job(**_tuning_request)

In [None]:
# Look up the tuning job and display only its current status string.
_status_response = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
_status_response["HyperParameterTuningJobStatus"]

In [None]:
# Re-run this cell at any time to refresh the tuning job's progress.
tuning_job_result = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

# Warn when the job is still running (or ended in any state other than "Completed").
status = tuning_job_result["HyperParameterTuningJobStatus"]
if not (status == "Completed"):
    print("Reminder: the tuning job has not been completed.")

# Number of individual training jobs that have finished so far.
job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("%d training jobs have completed" % job_count)

# Objective the tuner optimises, exposed as notebook globals.
objective = tuning_job_result["HyperParameterTuningJobConfig"]["HyperParameterTuningJobObjective"]
is_minimize = not (objective["Type"] == "Maximize")
objective_name = objective["MetricName"]

In [None]:
from pprint import pprint

# Pretty-print the best training job reported so far, if the tuner has one yet.
_best_training_job = tuning_job_result.get("BestTrainingJob", None)
if not _best_training_job:
    print("No training jobs have reported results yet.")
else:
    print("Best model found so far:")
    pprint(_best_training_job)

In [None]:
# Declare the S3 prefixes of the SMOTE ENN resampled CSV data as SageMaker input channels.
_smoteenn_train_uri = "s3://{}/train_SmoteENN/".format(bucket)
_smoteenn_validation_uri = "s3://{}/validation_SmoteENN/".format(bucket)

# Training channel
s3_input_train_SmoteENN = TrainingInput(s3_data=_smoteenn_train_uri, content_type="csv")

# Validation channel
s3_input_validation_SmoteENN = TrainingInput(s3_data=_smoteenn_validation_uri, content_type="csv")

In [None]:
# Train an XGBoost estimator on the SMOTE ENN resampled data with fixed hyperparameters.
sess = sagemaker.Session()

# Hyperparameters passed to the XGBoost container.
_smoteenn_hyperparameters = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "num_round": 100,
    "alpha": 234.24489549493765,
    "colsample_bylevel": 0.7695493793830517,
    "colsample_bytree": 0.7246719293002704,
    "eta": 0.10026578123125553,
    "gamma": 3.462909577823404,
    "max_delta_step": 7,
    "max_depth": 9,
    "min_child_weight": 114.93467551230289,
    "subsample": 0.8808286982132013,
}

xgb_SmoteENN = sagemaker.estimator.Estimator(
    container,  # XGBoost image URI retrieved near the top of the notebook
    role,  # SageMaker execution role obtained from get_execution_role()
    sagemaker_session=sess,
    output_path="s3://{}/output".format(bucket),
    instance_type="ml.m4.xlarge",
    instance_count=1,
)
xgb_SmoteENN.set_hyperparameters(**_smoteenn_hyperparameters)

# Launch the training job with the train/validation channels.
_smoteenn_channels = {"train": s3_input_train_SmoteENN, "validation": s3_input_validation_SmoteENN}
xgb_SmoteENN.fit(_smoteenn_channels)


In [None]:
%matplotlib inline
from sagemaker.analytics import TrainingJobAnalytics

training_job_name = xgb_SmoteENN._current_job_name
metric_name = ['train:auc','validation:auc']

metrics_dataframe = TrainingJobAnalytics(training_job_name=training_job_name,metric_names=metric_name).dataframe()
#plt = metrics_dataframe.plot(kind='line', figsize=(12,5), x='timestamp', y='value', style='b.', legend=False)
#plt.set_ylabel(metric_name);
metrics_dataframe

#### With our model trained, let's deploy it to a hosted endpoint

In [None]:
# Deploy the trained model to a hosted endpoint whose predictor is configured
# with a CSVSerializer.
# NOTE(review): no endpoint clean-up is visible in this part of the notebook --
# confirm the endpoint is deleted once evaluation is done.
_smoteenn_endpoint_settings = dict(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=CSVSerializer(),
)
xgb_predictor_SmoteENN = xgb_SmoteENN.deploy(**_smoteenn_endpoint_settings)

In [None]:
# Read the test CSV directly from S3; the file is parsed without a header row.
data_key = 'test/test.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

test_data = pd.read_csv(
    data_location,
    header=None,
    sep=',',
)
test_data

In [None]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` on a hosted endpoint in batches and return the scores.

    Args:
        data: 2-D NumPy array with one row per record to score.
        rows: Maximum number of rows sent to the endpoint per request.
        predictor: Object whose ``predict(batch)`` returns the scores as
            comma-separated UTF-8 bytes. Defaults to the notebook-level
            ``xgb_predictor_SmoteENN`` endpoint.

    Returns:
        1-D NumPy float array with every returned score, in request order.
    """
    if predictor is None:
        # Resolved at call time so the default follows the notebook global.
        predictor = xgb_predictor_SmoteENN

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score: do not send an empty payload to the endpoint.
        return np.empty(0)

    # Ceiling division: just enough batches that none exceeds `rows` and none
    # is empty.
    n_batches = int(-(-n_rows // rows))
    responses = [
        predictor.predict(batch).decode("utf-8")
        for batch in np.array_split(data, n_batches)
    ]
    # Join once, then parse the comma-separated scores into floats.
    return np.fromstring(",".join(responses), sep=",")


# Column 0 is excluded here (the metrics cell uses it as the label); the
# remaining columns are sent to the endpoint as features.
_smoteenn_features = test_data.to_numpy()[:, 1:]
predictions_SmoteENN = predict(_smoteenn_features)

In [None]:
# Show the raw scores that predict() returned for the test set.
print(predictions_SmoteENN)


In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

# True labels: column 0 of the test set.
y_test = test_data.iloc[:, 0]
# Scores above 0.03 are labelled 1, everything else 0.
# NOTE(review): the confusion-matrix cell that follows rounds the scores
# (0.5 cut-off) instead, so its counts do not correspond to these metrics --
# confirm which threshold is intended.
y_pred = (predictions_SmoteENN > 0.03).astype(int)

precision, recall, fscore, support = score(y_test, y_pred)

# Per-class precision / recall / F-score / support, one line each.
for _template, _values in (
    ('precision: {}', precision),
    ('recall: {}', recall),
    ('fscore: {}', fscore),
    ('support: {}', support),
):
    print(_template.format(_values))

In [None]:
# Confusion matrix: true labels (column 0) against the scores rounded to 0/1.
# NOTE(review): rounding is a 0.5 cut-off, whereas the metrics cell above
# thresholds at 0.03 -- confirm which one this heatmap should reflect.
_actual_smoteenn = test_data.iloc[:, 0]
_rounded_smoteenn = np.round(predictions_SmoteENN)
df_cm_SmoteENN = pd.crosstab(
    index=_actual_smoteenn,
    columns=_rounded_smoteenn,
    rownames=["actual"],
    colnames=["predictions_SmoteENN"],
)

import seaborn as sn

# Annotated heatmap of the counts ('g' format avoids scientific notation).
sn.set(font_scale=1.4)
sn.heatmap(df_cm_SmoteENN, fmt='g', annot=True, annot_kws={"size": 16})