### Import packages, prepare data for S3

In [1]:
import os
import time
from datetime import datetime

import boto3
import numpy as np
import pandas as pd
import s3fs
import sagemaker
from IPython.display import FileLink, FileLinks
from IPython.display import display
from sagemaker import image_uris
from sagemaker.analytics import HyperparameterTuningJobAnalytics
from sagemaker.debugger import Rule, rule_configs, ProfilerRule
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.predictor import csv_serializer
from sagemaker.serializers import CSVSerializer
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, CategoricalParameter, ContinuousParameter


In [10]:
### configure S3
# Execution role, target bucket/prefix and the SageMaker session reused by later cells.
role = sagemaker.get_execution_role()
bucket = 'udacity-mle-capstone-starbucks'
prefix = 'xgboost'
sess = sagemaker.Session()
# Read the region through the public property instead of the private `_region_name`.
region = sess.boto_region_name
print(region)

us-east-1


In [31]:
### read train and test data
# Both splits are loaded from the local data/ directory in a single pass.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/df_train.csv', 'data/df_test.csv')
)

In [34]:
### process as readable input for xgboost
def process_for_xgb(df):
    """Return a new frame with ``target`` first and the identifier columns removed.

    The columns ``person``, ``time`` and ``offer_id`` are dropped, ``target`` is
    moved to position 0 and all remaining columns keep their relative order.
    The input frame is not modified, so the calling cell can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``person``, ``time``, ``offer_id`` and ``target`` columns.

    Returns
    -------
    pandas.DataFrame
        ``target`` followed by the remaining feature columns.

    Raises
    ------
    KeyError
        If any of the four required columns is missing.
    """
    # drop() without inplace returns a copy, leaving the caller's frame intact.
    feature_cols = df.drop(columns=['person', 'time', 'offer_id', 'target'])
    return pd.concat([df['target'], feature_cols], axis=1)

In [35]:
# Apply the same column layout to the train split first, then the test split.
df_train_xgb, df_test_xgb = map(process_for_xgb, (df_train, df_test))

In [37]:
# Upload each split to S3 as a CSV without header row or index column.
for name, dataset in (("train", df_train_xgb), ("test", df_test_xgb)):
    csv_body = dataset.to_csv(index=False, header=False)
    sess.upload_string_as_file_body(
        body=csv_body,
        bucket=bucket,
        key=f"{prefix}/input/{name}.csv",
    )

# configure data inputs for SageMaker training
# NOTE(review): the test split is used as the validation channel, so scores
# obtained during tuning are not an untouched hold-out estimate.
input_prefix = f"s3://{bucket}/{prefix}/input"
train_input = TrainingInput(f"{input_prefix}/train.csv", content_type="text/csv")
validation_input = TrainingInput(f"{input_prefix}/test.csv", content_type="text/csv")

### Set up parameters, estimator, and tuner

In [69]:
# Built-in rules attached to the final training job: one produces the XGBoost
# training report, the other the profiler report.
xgb_report_rule = Rule.sagemaker(rule_configs.create_xgboost_report())
profiler_report_rule = ProfilerRule.sagemaker(rule_configs.ProfilerReport())
rules = [xgb_report_rule, profiler_report_rule]

In [50]:
# Built-in XGBoost image (version 1.2-1) for the session's region.
xgboost_container = image_uris.retrieve("xgboost", region, "1.2-1")

# Estimator handed to the tuner as the template for its training jobs.
estimator = Estimator(
    role=role,
    image_uri=xgboost_container,
    # SageMaker job names accept only alphanumerics and hyphens, so the base
    # name must not contain underscores.
    base_job_name="xgb-starbucks-hpo",
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=f's3://{bucket}/{prefix}/output/',
)

# Static hyperparameters; eta, max_depth and num_round also appear in the
# search ranges below.
estimator.set_hyperparameters(
    max_depth=6,
    eta=0.1,
    objective='binary:logistic',
    num_round=100,
    eval_metric='auc',
)

# Search space explored by the tuner.
hyperparameter_ranges = {
    'eta': ContinuousParameter(0.01, 1.0),
    'max_depth': IntegerParameter(5, 10),
    'num_round': IntegerParameter(50, 200),
}

# 60 trials in total, at most 10 running concurrently, maximising validation AUC.
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name='validation:auc',
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type='Maximize',
    max_parallel_jobs=10,
    max_jobs=60,
)
    


In [None]:
# Start the hyperparameter tuning job on the train/validation channels;
# wait=True keeps the cell blocked until the call returns.
tuner.fit({'train': train_input, 'validation': validation_input}, 
              wait=True)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.............................................

In [52]:
#### Describe the tuning results
# The analytics object is looked up by the name of an already-run tuning job.
tuning_job_name = 'sagemaker-xgboost-230419-0834'
exp = HyperparameterTuningJobAnalytics(hyperparameter_tuning_job_name=tuning_job_name)
jobs = exp.dataframe()
# Best objective value first.
jobs.sort_values(by='FinalObjectiveValue', ascending=False)

Unnamed: 0,eta,max_depth,num_round,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
16,0.079036,5.0,142.0,sagemaker-xgboost-230419-0834-044-826ad614,Completed,0.85442,2023-04-19 08:41:05+00:00,2023-04-19 08:41:37+00:00,32.0
19,0.073204,5.0,148.0,sagemaker-xgboost-230419-0834-041-cfea97ed,Completed,0.85424,2023-04-19 08:40:53+00:00,2023-04-19 08:41:25+00:00,32.0
15,0.068235,5.0,169.0,sagemaker-xgboost-230419-0834-045-a7773391,Completed,0.85415,2023-04-19 08:41:14+00:00,2023-04-19 08:41:46+00:00,32.0
23,0.068231,5.0,168.0,sagemaker-xgboost-230419-0834-037-28ffc1e1,Completed,0.85413,2023-04-19 08:40:30+00:00,2023-04-19 08:41:12+00:00,42.0
13,0.060062,6.0,145.0,sagemaker-xgboost-230419-0834-047-09e9b8c3,Completed,0.85389,2023-04-19 08:41:25+00:00,2023-04-19 08:42:02+00:00,37.0
5,0.067329,5.0,155.0,sagemaker-xgboost-230419-0834-055-be08eb5b,Completed,0.85375,2023-04-19 08:42:00+00:00,2023-04-19 08:42:33+00:00,33.0
39,0.067962,6.0,124.0,sagemaker-xgboost-230419-0834-021-ca3c6c80,Completed,0.85366,2023-04-19 08:38:59+00:00,2023-04-19 08:39:31+00:00,32.0
6,0.060664,6.0,146.0,sagemaker-xgboost-230419-0834-054-a49381f0,Completed,0.85366,2023-04-19 08:41:59+00:00,2023-04-19 08:42:31+00:00,32.0
20,0.074913,5.0,174.0,sagemaker-xgboost-230419-0834-040-db081ceb,Completed,0.85359,2023-04-19 08:40:42+00:00,2023-04-19 08:41:19+00:00,37.0
0,0.070127,5.0,162.0,sagemaker-xgboost-230419-0834-060-a9b3b1da,Completed,0.85352,2023-04-19 08:42:37+00:00,2023-04-19 08:43:09+00:00,32.0


### Train model with optimized hyperparameters

In [53]:
# Estimator returned by the tuner for its best training job.
# Requires `tuner` from the tuning cells above to exist in this kernel session.
best_estimator=tuner.best_estimator()


2023-04-19 08:41:57 Starting - Found matching resource for reuse
2023-04-19 08:41:57 Downloading - Downloading input data
2023-04-19 08:41:57 Training - Training image download completed. Training in progress.
2023-04-19 08:41:57 Uploading - Uploading generated training model
2023-04-19 08:41:57 Completed - Resource reused by training job: sagemaker-xgboost-230419-0834-054-a49381f0


In [54]:
# Show the hyperparameters recorded on the best estimator.
best_estimator.hyperparameters()

{'_tuning_objective_metric': 'validation:auc',
 'eta': '0.07903618528752897',
 'eval_metric': 'auc',
 'max_depth': '5',
 'num_round': '142',
 'objective': 'binary:logistic'}

In [74]:
# Values copied from the best tuning trial shown above.
# NOTE(review): unlike the tuning estimator, no eval_metric is set here —
# confirm that relying on the container default is intended.
hyperparameters = dict(
    eta='0.07903618528752897',
    max_depth='5',
    num_round='142',
    objective='binary:logistic',
)

In [75]:
# Estimator for the final model: fixed hyperparameters from the cell above,
# with the report/profiler rules attached.
estimator = Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    base_job_name="xgb-starbucks-clf",
    output_path=f's3://{bucket}/{prefix}/output/',
    hyperparameters=hyperparameters,
    rules=rules,
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [76]:
# Train the final model on the same train/validation channels used for tuning;
# wait=True keeps the cell blocked while the job runs and streams its log.
estimator.fit({'train': train_input, 'validation': validation_input}, 
              wait=True)

INFO:sagemaker:Creating training-job with name: xgb-starbucks-clf-2023-04-19-11-42-41-675


2023-04-19 11:42:41 Starting - Starting the training job...
2023-04-19 11:43:07 Starting - Preparing the instances for trainingCreateXgboostReport: InProgress
ProfilerReport: InProgress
...
2023-04-19 11:43:42 Downloading - Downloading input data...
2023-04-19 11:44:07 Training - Downloading the training image...
2023-04-19 11:44:38 Training - Training image download completed. Training in progress...[34m[2023-04-19 11:44:49.952 ip-10-0-250-218.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mI

### Check training report

In [77]:
# get profiler report and training report from S3
# Filesystem handle used for the downloads below (it was never created before).
s3 = s3fs.S3FileSystem()

# Fetch the rule summaries once and reuse them for both lookups.
rule_summaries = estimator.latest_training_job.rule_job_summary()

profiler_report_name = [rule["RuleConfigurationName"]
                        for rule in rule_summaries
                        if "Profiler" in rule["RuleConfigurationName"]][0]

# The processing job name is the last path component of the rule evaluation ARN.
xgb_profile_job_name = [rule["RuleEvaluationJobArn"].split("/")[-1]
                        for rule in rule_summaries
                        if "CreateXgboostReport" in rule["RuleConfigurationName"]][0]

base_output_path = os.path.dirname(estimator.latest_job_debugger_artifacts_path())
rule_output_path = os.path.join(base_output_path, "rule-output/")
xgb_report_path = os.path.join(rule_output_path, "CreateXgboostReport")
profile_report_path = os.path.join(rule_output_path, profiler_report_name)

# Poll the report-generating processing job every 30 seconds. Stop on success,
# and fail loudly on a terminal non-success state instead of looping forever.
while True:
    xgb_job_info = sess.sagemaker_client.describe_processing_job(ProcessingJobName=xgb_profile_job_name)
    job_status = xgb_job_info["ProcessingJobStatus"]

    if job_status == "Completed":
        break
    if job_status in ("Failed", "Stopped"):
        raise RuntimeError(
            f"Processing job {xgb_profile_job_name} ended with status {job_status}: "
            f"{xgb_job_info.get('FailureReason', 'no failure reason reported')}"
        )

    print(f"Job Status: {job_status}")
    time.sleep(30)

s3.download(xgb_report_path, "reports/xgb/", recursive=True)
s3.download(profile_report_path, "reports/profiler/", recursive=True)
display("Click link below to view the profiler report", FileLink("reports/profiler/profiler-output/profiler-report.html"))
display("Click link below to view the XGBoost Training report", FileLink("reports/xgb/xgboost_report.html"))

Job Status: InProgress
Job Status: InProgress
Job Status: InProgress
Job Status: InProgress
Job Status: InProgress


'Click link below to view the profiler report'

'Click link below to view the XGBoost Training report'

In [110]:
# According to the training report, the three most important features are
# f1, f5 and f20, followed by f12 and f4.
# Column 0 is the target, so the feature names start at position 1.
features = df_train_xgb.columns.values[1:]

In [114]:
# Map the report's feature indices (f1, f5, f20, then f12, f4) back to column names.
top_three = [features[idx] for idx in (1, 5, 20)]
runners_up = [features[idx] for idx in (12, 4)]
print("The most 3 important features are:{}, {}, {}".format(*top_three))
print("{} and {} are also important feature".format(*runners_up))

The most 3 important features are:total_amount, reward, success_rate
discount and member_duration are also important feature


### Deploy model and test

In [78]:
# Deploy the trained model to a real-time endpoint on one ml.m5.large instance.
# NOTE(review): the endpoint keeps running until it is explicitly deleted.
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m5.large')

INFO:sagemaker:Creating model with name: xgb-starbucks-clf-2023-04-19-12-00-00-852
INFO:sagemaker:Creating endpoint-config with name xgb-starbucks-clf-2023-04-19-12-00-00-852
INFO:sagemaker:Creating endpoint with name xgb-starbucks-clf-2023-04-19-12-00-00-852


------!

In [79]:
# Show the name of the endpoint created by the deploy call.
xgb_predictor.endpoint_name

'xgb-starbucks-clf-2023-04-19-12-00-00-852'

In [90]:
# create predictor
# SDK v2 API: Predictor and CSVSerializer replace the deprecated
# RealTimePredictor / csv_serializer, and the request content type now comes
# from the serializer rather than a `content_type` argument.
# The endpoint name is taken from the deployed predictor, not a pasted literal,
# so a fresh run targets the endpoint it has just created.
endpoint_name = xgb_predictor.endpoint_name
xgb_predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=CSVSerializer(),
)

# make inference
# Send the feature columns (everything except column 0, the target) of the last
# three test rows; the response bytes are decoded to text.
output = xgb_predictor.predict(df_test_xgb.iloc[-3:, 1:].values).decode('utf-8')

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [104]:
### convert to label
# Parse the comma-separated scores and threshold them at 0.5 (1 if above, else 0).
scores = (float(token) for token in output.split(','))
[int(score > 0.5) for score in scores]

[1, 0, 1]

In [106]:
### actual target
# Column 0 holds the target; take it for the same last three test rows that
# were sent to the endpoint.
df_test_xgb.iloc[-3:, 0].values

array([1, 0, 1])