In [None]:
import os

import boto3
import sagemaker

from sagemaker import Model
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.model_metrics import (
    MetricsSource,
    ModelMetrics,
)
from sagemaker.processing import (
    ProcessingInput,
    ProcessingOutput,
    Processor,
    ScriptProcessor,
)
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.tuner import (
    ContinuousParameter,
    IntegerParameter,
    HyperparameterTuner,
    WarmStartConfig,
    WarmStartTypes,
)
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import (
    ConditionGreaterThanOrEqualTo,
    ConditionLessThanOrEqualTo,
)
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.functions import Join, JsonGet
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.step_collections import RegisterModel, CreateModelStep
from sagemaker.workflow.steps import (
    ProcessingStep,
    CacheConfig,
    TuningStep,
)
from sagemaker.xgboost import XGBoostPredictor

In [None]:
# Build the SageMaker session and the locations shared by the cells below.

work_directory = "./"
base_job_prefix = "sagemaker/DEMO-xgboost-banking"

# The region is read from a default SageMaker session and then used to pin the
# boto3 session that backs the session object handed to every pipeline step.
region = sagemaker.Session().boto_region_name
boto_session = boto3.Session(region_name=region)
sm_client = boto3.client("sagemaker")
sagemaker_session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sm_client,
)
default_bucket = sagemaker_session.default_bucket()

In [None]:
# Upload the raw CSV from the working directory to
# s3://<default_bucket>/<base_job_prefix>/data/.
# os.path.join is used for the local path so that work_directory ("./") and the
# file name are not glued together with a doubled separator (".//...").
train_input = sagemaker_session.upload_data(
    path=os.path.join(work_directory, "bank-additional-full.csv"),
    bucket=default_bucket,
    key_prefix=f"{base_job_prefix}/data",
)

In [None]:
# Define variables and parameters needed for the Pipeline steps

role = sagemaker.get_execution_role()
model_package_group_name = "banking-classification"

processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value="ml.t3.large"
)
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
# Defined once; the pipeline parameter list and the register step both use it.
model_approval_status = ParameterString(
    name="ModelApprovalStatus", default_value="PendingManualApproval"
)
# S3 URIs always use "/" as separator, so the default is spelled out with an
# f-string instead of os.path.join, which follows the local OS path convention.
input_data = ParameterString(
    name="InputDataUrl",
    default_value=f"s3://{default_bucket}/{base_job_prefix}/data/bank-additional-full.csv",
)

# Cache Pipeline steps to reduce execution time on subsequent executions
cache_config = CacheConfig(enable_caching=True, expire_after="1d")

In [None]:
%%writefile preprocess.py

"""Feature engineers the banking dataset."""
import argparse
import logging
import os
import pathlib
import requests
import tempfile

import boto3
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection._base import SelectorMixin
from sklearn.feature_extraction.text import _VectorizerMixin

from collections import Counter

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())


# Feature columns of the banking dataset. The CSV is read with its own header
# row (header=0 in the read_csv call below), so this list serves as the
# reference set of column names for the dtype mapping.
feature_columns_names = [
    "age",
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
]
label_column = "y"

# Features parsed as float64; every other feature is read as a string.
_float_feature_names = frozenset(
    [
        "age",
        "duration",
        "campaign",
        "pdays",
        "previous",
        "emp.var.rate",
        "cons.price.idx",
        "cons.conf.idx",
        "euribor3m",
        "nr.employed",
    ]
)

# Column name -> dtype, in the same order as feature_columns_names.
feature_columns_dtype = {
    column: (np.float64 if column in _float_feature_names else str)
    for column in feature_columns_names
}
label_column_dtype = {label_column: str}


def merge_two_dicts(x, y):
    """Return a new dict with the entries of ``x`` overlaid by those of ``y``.

    Neither input is modified. When a key appears in both, the value from
    ``y`` is the one kept.
    """
    return {**x, **y}


if __name__ == "__main__":
    # Entry point of the processing job: download the CSV named by --input-data,
    # impute / one-hot encode the features, and write train, validation and test
    # splits (label in the first column, no header) under /opt/ml/processing.
    logger.debug("Starting preprocessing.")
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-data", type=str, required=True)
    args = parser.parse_args()

    base_dir = "/opt/ml/processing"
    pathlib.Path(f"{base_dir}/data").mkdir(parents=True, exist_ok=True)
    input_data = args.input_data
    # For an "s3://<bucket>/<key...>" URI, splitting on "/" puts the bucket at
    # index 2 and the key segments from index 3 onwards.
    bucket = input_data.split("/")[2]
    key = "/".join(input_data.split("/")[3:])

    # A %s placeholder is required for the key to be rendered; passing
    # print(key) as a bare extra argument makes the logging call fail to format.
    logger.info("The key contains %s", key)

    logger.info("Downloading data from bucket: %s, key: %s", bucket, key)
    fn = f"{base_dir}/data/banking-additional-full.csv"
    s3 = boto3.resource("s3")
    s3.Bucket(bucket).download_file(key, fn)

    logger.debug("Reading downloaded data.")
    # The file's first row is used as the header; only the dtypes are forced.
    df = pd.read_csv(
        fn,
        header=0,
        dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype),
    )
    os.unlink(fn)

    logger.info("Downloaded df contains %s rows and %s columns", df.shape[0], df.shape[1])
    logger.debug("Defining transformers.")
    numeric_features = ["age", "duration", "campaign", "pdays","previous","emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed"]
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
        ]
    )

    # NOTE(review): "default" has a dtype entry but is listed neither here nor
    # in numeric_features, so no transformer receives it - confirm that
    # leaving it out is intentional.
    categorical_features = ["job", "marital","education","housing","loan","contact","month","day_of_week","poutcome"]
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    preprocess = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ],
        # Always produce a dense array. With the default threshold the stacked
        # output may come back as a scipy sparse matrix, from which the
        # DataFrame below cannot be built correctly.
        sparse_threshold=0,
    )

    logger.info("Applying transforms.")
    df['y'] = df['y'].map({'yes':1, 'no':0})
    y = df.pop("y")
    X_pre = preprocess.fit_transform(df)

    # estimate scale_pos_weight value
#     counter = Counter(y)
#     imbalance_ratio = counter[0] / counter[1]

    ### Create functions to retrieve the column names from the "preprocess" transformer
    def get_feature_out(estimator, feature_in):
        """Return the output column names of one fitted estimator.

        Estimators exposing ``get_feature_names`` are asked for their names
        (vectorizers get a ``vec_`` prefix), selectors keep only the supported
        inputs, and anything else is assumed to pass ``feature_in`` through
        unchanged.
        """
        if hasattr(estimator, 'get_feature_names'):
            if isinstance(estimator, _VectorizerMixin):
                # handling all vectorizers
                return [f'vec_{f}' for f in estimator.get_feature_names()]
            return estimator.get_feature_names(feature_in)
        if isinstance(estimator, SelectorMixin):
            return np.array(feature_in)[estimator.get_support()]
        return feature_in


    def get_ct_feature_names(ct):
        """Collect the output column names of a fitted ColumnTransformer.

        Handles plain estimators and Pipelines inside the ColumnTransformer by
        threading the names through each step in order. It does not work when
        remainder == 'passthrough', which requires the input column names.
        """
        output_features = []

        for name, estimator, features in ct.transformers_:
            if name != 'remainder':
                if isinstance(estimator, Pipeline):
                    current_features = features
                    for step in estimator:
                        current_features = get_feature_out(step, current_features)
                    features_out = current_features
                else:
                    features_out = get_feature_out(estimator, features)
                output_features.extend(features_out)
            elif estimator == 'passthrough':
                output_features.extend(ct._feature_names_in[features])

        return output_features

    X = pd.DataFrame(X_pre,
                 columns=get_ct_feature_names(preprocess))

    X['y'] = y

    # Move the target column from the last to the first position; the
    # evaluation script reads the label from column 0 of the written CSVs.
    temp_cols = list(X.columns)
    temp_cols = [temp_cols[-1]] + temp_cols[:-1]
    X = X[temp_cols]

    logger.info("Splitting %d rows of data into train, validation, test datasets.", len(X))
    # 70% / 15% / 15% split by row position.
    # NOTE(review): rows are split in file order without shuffling - confirm
    # that this is intended for this dataset.
    train_end = int(0.7 * len(X))
    validation_end = int(0.85 * len(X))
    train = X.iloc[:train_end]
    validation = X.iloc[train_end:validation_end]
    test = X.iloc[validation_end:]

    logger.info("Writing out datasets to %s.", base_dir)
    # Create the output folders when they are missing so that to_csv does not
    # depend on them having been prepared beforehand.
    for split_name in ("train", "validation", "test"):
        pathlib.Path(f"{base_dir}/{split_name}").mkdir(parents=True, exist_ok=True)
    train.to_csv(f"{base_dir}/train/train.csv", header=False, index=False)
    validation.to_csv(
        f"{base_dir}/validation/validation.csv", header=False, index=False
    )
    test.to_csv(f"{base_dir}/test/test.csv", header=False, index=False)

In [None]:
# Preprocessing step: runs preprocess.py in an SKLearn processing container and
# publishes the train / validation / test splits to S3.
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name=f"{base_job_prefix}/sklearn-banking-preprocess",
    sagemaker_session=sagemaker_session,
    role=role,
)

data_repo_prefix="{}/{}".format(base_job_prefix, "data")
preprocess_step_name = "PreprocessBankingDataForHPO"


def _preprocess_output(output_name):
    """Build the ProcessingOutput for one split of the preprocessing job.

    The destination is assembled with ``Join`` so that the pipeline execution
    id is substituted when the pipeline runs; an f-string would embed the
    Python object's string form at definition time instead. Each split is
    written to its own sub-folder, so a training channel pointing at the
    "train" prefix does not also pick up validation.csv and test.csv.

    Args:
        output_name: Name of the split ("train", "validation" or "test"); it
            is used as the output name, the container folder under
            /opt/ml/processing, and the last segment of the S3 destination.

    Returns:
        The configured ``ProcessingOutput``.
    """
    return ProcessingOutput(
        output_name=output_name,
        source=f"/opt/ml/processing/{output_name}",
        destination=Join(
            on="/",
            values=[
                "s3:/",  # joined with "/" this yields the "s3://" scheme prefix
                default_bucket,
                data_repo_prefix,
                ExecutionVariables.PIPELINE_EXECUTION_ID,
                preprocess_step_name,
                output_name,
            ],
        ),
    )


step_process = ProcessingStep(
    name=preprocess_step_name,
    processor=sklearn_processor,
    outputs=[_preprocess_output(split) for split in ("train", "validation", "test")],
    code="preprocess.py",
    job_arguments=["--input-data", input_data],
)

In [None]:
# Hyperparameter tuning step: trains the built-in XGBoost image on the
# preprocessed splits and searches alpha / lambda / max_depth.

# S3 location that receives the model artifacts of the tuning job's training jobs
model_path = f"s3://{default_bucket}/{base_job_prefix}/BankingTrain"

image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",
    py_version="py3",
    instance_type=training_instance_type,
)

xgb_train = Estimator(
    image_uri=image_uri,
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type=training_instance_type,
    instance_count=1,
    output_path=model_path,
    base_job_name=f"{base_job_prefix}/banking-train",
)

# Hyperparameters that stay fixed for every training job of the tuning run.
static_hyperparameters = dict(
    eval_metric="logloss",
    objective="binary:logistic",  # learning objective of each training job
    num_round=30,
    eta=0.1,
    gamma=4,
    min_child_weight=3,
    subsample=0.7,
    silent=0,
    # Class-imbalance weight; derived from the imbalance_ratio calculation
    # listed (commented out) in the preprocess.py script.
    scale_pos_weight=7.7,
)
xgb_train.set_hyperparameters(**static_hyperparameters)

# Metric the tuner optimizes (minimized, see objective_type below).
objective_metric_name = "validation:logloss"

# Search space explored by the tuner.
hyperparameter_ranges = {
    "alpha": ContinuousParameter(0.01, 10.0),
    "lambda": ContinuousParameter(0.01, 10.0),
    "max_depth": IntegerParameter(1, 10),
}

tuner_log = HyperparameterTuner(
    estimator=xgb_train,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize",
    max_jobs=6,
    max_parallel_jobs=2,
    early_stopping_type="Auto",
)


def _csv_channel(output_name):
    """Return a text/csv TrainingInput fed by the named preprocessing output."""
    processing_outputs = step_process.properties.ProcessingOutputConfig.Outputs
    return TrainingInput(
        s3_data=processing_outputs[output_name].S3Output.S3Uri,
        content_type="text/csv",
    )


step_tuning = TuningStep(
    name="HPTuning",
    tuner=tuner_log,
    inputs={
        "train": _csv_channel("train"),
        "validation": _csv_channel("validation"),
    },
    cache_config=cache_config,
)

In [None]:
# Create a SageMaker Model from the top-ranked training job of the tuning step.
# A commented-out template for the second-best model is kept at the end of the cell.

# Bucket plus key prefix under which the tuning job stored its model artifacts
# (the same location as model_path, without the "s3://" scheme).
model_bucket_key = f"{default_bucket}/{base_job_prefix}/BankingTrain"

top_model_s3_uri = step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key)
best_model = Model(
    image_uri=image_uri,
    model_data=top_model_s3_uri,
    role=role,
    sagemaker_session=sagemaker_session,
    predictor_cls=XGBoostPredictor,
)

create_model_input = sagemaker.inputs.CreateModelInput(instance_type="ml.m4.large")
step_create_first = CreateModelStep(
    name="CreateTopModel",
    model=best_model,
    inputs=create_model_input,
)

# second_best_model = Model(
#     image_uri=image_uri,
#     model_data=step_tuning.get_top_model_s3_uri(top_k=1, s3_bucket=model_bucket_key),
#     sagemaker_session=sagemaker_session,
#     role=role,
#     predictor_cls=XGBoostPredictor,
# )

# step_create_second = CreateModelStep(
#     name="CreateSecondBestModel",
#     model=second_best_model,
#     inputs=sagemaker.inputs.CreateModelInput(instance_type="ml.m4.large"),
# )

In [None]:
%%writefile evaluate.py

"""Evaluation script for measuring mean squared error."""
import json
import logging
import pathlib
import pickle
import tarfile

import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import auc, precision_score, recall_score, roc_auc_score

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())


if __name__ == "__main__":
    logger.debug("Starting evaluation.")
    model_path = "/opt/ml/processing/model/model.tar.gz"
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")

    logger.debug("Loading xgboost model.")
    model = pickle.load(open("xgboost-model", "rb"))

    logger.debug("Reading test data.")
    test_path = "/opt/ml/processing/test/test.csv"
    df = pd.read_csv(test_path, header=None)

    logger.debug("Reading test data.")
    y_test = df.iloc[:, 0].to_numpy()
    df.drop(df.columns[0], axis=1, inplace=True)
    X_test = xgboost.DMatrix(df.values)
    ######

    logger.info("Performing predictions against test data.")
    predictions = model.predict(X_test)
    predictions = np.round(predictions)
    logger.info("Predictions are of data type %s ", predictions.dtype)
    logger.info("Y Test are of data type %s ", y_test.dtype)

    logger.debug("Calculating mean squared error.")
    auc = roc_auc_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    report_dict = {
        "classification_metrics": {
            "auc": {"value": auc},
            "precision": {"value": precision},
            "recall": {"value": recall},
        },
    }

    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    logger.info("Writing out evaluation report with precision: %f and Recall: %f and an AUC of %f " , precision, recall, auc)
    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

In [None]:
# Evaluation step: a ProcessingStep runs evaluate.py against the top model of
# the HPO step. The resulting report drives the ConditionStep that decides
# whether the model is registered in the Model Registry.

script_eval = ScriptProcessor(
    image_uri=image_uri,
    command=["python3"],
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name=f"{base_job_prefix}/script-tuning-step-eval",
)

# Exposes evaluation.json from the "evaluation" output to later steps.
evaluation_report = PropertyFile(
    name="BestTuningModelEvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)

top_model_input = ProcessingInput(
    source=step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
    destination="/opt/ml/processing/model",
)
test_split_input = ProcessingInput(
    source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
    destination="/opt/ml/processing/test",
)
evaluation_output = ProcessingOutput(
    output_name="evaluation",
    source="/opt/ml/processing/evaluation",
)

# This can be extended to evaluate multiple models from the HPO step
step_eval = ProcessingStep(
    name="EvaluateTopModel",
    processor=script_eval,
    inputs=[top_model_input, test_split_input],
    outputs=[evaluation_output],
    code="evaluate.py",
    property_files=[evaluation_report],
    cache_config=cache_config,
)

# S3 prefix of the step's first (and only) output, read from the step arguments.
evaluation_output_uri = step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"][
    "S3Uri"
]
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(evaluation_output_uri),
        content_type="application/json",
    )
)

In [None]:
# Register the model in the Model Registry
# Multiple models can be registered into the Model Registry using multiple RegisterModel steps. These models can either be added to the
# same model package group as different versions within the group or the models can be added to different model package groups.

step_register_best = RegisterModel(
    name="RegisterBestBankingModel",
    estimator=xgb_train,
    model_data=step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
    content_types=["text/csv"],
    response_types=["application/json"], # "text/csv"
    inference_instances=["ml.t2.medium", "ml.m5.large"],
    transform_instances=["ml.m5.large"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    # Attach the evaluation report built in the evaluation cell; without this
    # argument model_metrics is computed but never stored with the model package.
    model_metrics=model_metrics,
)

In [None]:
# Condition step for evaluating model quality and branching execution.
# AUC is a higher-is-better metric, so the model is registered only when the
# AUC reported in evaluation.json is at least the threshold. (A "less than or
# equal" check would register exactly the models that miss the bar.)

min_auc_for_registration = 0.8

cond_gte = ConditionGreaterThanOrEqualTo(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="classification_metrics.auc.value",
    ),
    right=min_auc_for_registration,
)
step_cond = ConditionStep(
    name="CheckAUCBankingEvaluation",
    conditions=[cond_gte],
    if_steps=[step_register_best],
    else_steps=[],
)

In [None]:
# Assemble the pipeline from the parameters and steps defined above.
pipeline_parameters = [
    processing_instance_type,
    processing_instance_count,
    training_instance_type,
    input_data,
    model_approval_status,
]

# step_create_second is left out while the second-best model stays disabled.
pipeline_steps = [
    step_process,
    step_tuning,
    step_create_first,
    step_eval,
    step_cond,
]

pipeline = Pipeline(
    name="tuning-step-pipeline",
    parameters=pipeline_parameters,
    steps=pipeline_steps,
    sagemaker_session=sagemaker_session,
)

In [None]:
import json

# Parse the pipeline definition (a JSON string) into Python objects so the
# notebook displays it as a nested structure; useful for checking the step
# wiring before the pipeline is submitted.
definition = json.loads(pipeline.definition())
definition

In [None]:
# Submit the pipeline definition to SageMaker under the notebook's execution role.
pipeline.upsert(role_arn=role)

In [None]:
# Start an execution; no parameter overrides are passed here.
pipeline.start()

In [None]:
### Test Endpoint with test data

In [None]:
import io

import boto3
import pandas as pd

# Location of the test split written by the preprocessing step.
# The key embeds the id of one specific pipeline execution ("bilcu1istvih"):
# replace it with the key of the test.csv produced by the run to be inspected.
# The bucket is the session's default bucket, i.e. the one actually read below
# (previously an unrelated, unused bucket name was assigned here).
bucket = default_bucket
file_key = 'sagemaker/DEMO-xgboost-banking/data/bilcu1istvih/PreprocessBankingDataForHPO/test.csv'

s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket, Key=file_key)

# The split was written without a header row, hence header=None.
test_df = pd.read_csv(io.BytesIO(obj['Body'].read()), header=None)

test_df.head()

In [None]:
# Show the dtype pandas inferred for each column of the test split.
test_df.dtypes

In [None]:
# Inspect the dtype of the label column (column 0) straight from test_df;
# y_test is only assigned in the following cell, so referencing it here fails
# on a fresh kernel.
test_df.iloc[:, 0].dtype

In [None]:
# Separate the label (first column) from the feature columns.
label_position = 0
y_test = test_df.iloc[:, label_position]
test_x = test_df.iloc[:, label_position + 1:]

In [None]:
# Send the test features to the deployed endpoint and keep the raw response body.
# NOTE(review): no cell in this notebook creates this endpoint - it has to
# exist already when this cell runs.
endpoint_name = "bankingClassificationEndpoint"
runtime = boto3.client("sagemaker-runtime")

# csv serialization: header-less, index-less rows encoded as UTF-8 bytes
csv_payload = test_x.to_csv(header=False, index=False).encode("utf-8")

response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="text/csv",
    Body=csv_payload,
)

predictions = response["Body"].read()

In [None]:
# Turn the raw response bytes into a list of comma-separated string tokens.
predictions = predictions.decode('utf-8').split(',')

In [None]:
# Convert every string token into a float score.
predictions = list(map(float, predictions))

In [None]:
# Re-display the column dtypes of the test frame.
test_df.dtypes

In [None]:
# Attach the endpoint scores and their rounded 0/1 class to the test frame.
# Series.round() is used instead of np.round so this cell does not depend on
# numpy, which the notebook only imports in a later cell.
test_df['predict_probability'] = predictions
test_df['predict_binary'] = test_df['predict_probability'].round().astype(int)
test_df.head()

In [None]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score
import numpy as np

# AUC is a ranking metric: it is computed from the continuous scores, because
# scoring the rounded 0/1 predictions reduces the ROC curve to a single point.
# sklearn's auc() helper is no longer imported, as the result variable below
# carries that name.
auc = roc_auc_score(y_test, test_df['predict_probability'])
# Precision and recall are threshold metrics and use the hard 0/1 predictions.
precision = precision_score(y_test, test_df['predict_binary'])
recall = recall_score(y_test, test_df['predict_binary'])

In [None]:
# Summarize the three metrics computed in the previous cell.
print(
    f'Testing model performance returned an AUC of {auc} '
    f'along with a Precision score of {precision} '
    f'and a Recall score of {recall} '
)