In [1]:
import sys
import sagemaker
import boto3
import re
from sagemaker import get_execution_role

In [2]:
# Key prefix under which every artifact of this demo is stored in the bucket
prefix = "sagemaker/aws-sagemaker-training/DEMO-xgboost-churn"

# SageMaker session wrapper, plus the default bucket that session resolves to
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [3]:
# Resolve the ARN of the IAM role that is handed to the estimator further down.
try:
    # Direct lookup; this only succeeds inside SageMaker Studio or a notebook instance
    role = sagemaker.get_execution_role()
except ValueError:
    # Anywhere else the lookup raises ValueError, so open an IAM client
    # and fetch the role by its name to read the ARN from the response
    iam = boto3.client('iam')
    role = iam.get_role(
        RoleName='AmazonSageMaker-ExecutionRole-20220817T160055'
    )['Role']['Arn']
    print("Role ARN successfully extracted")

Couldn't call 'get_role' to get Role ARN from role name strike-aws-learning to get Role path.


Role ARN successfully extracted


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

# Data

## Download data

In [5]:
# Initialize S3 client object to download data
s3 = boto3.client("s3")

# download_file does not create missing local folders, so make sure the
# target directory exists before writing into it (no-op when already present)
os.makedirs("data-samples", exist_ok=True)

# Arguments are: source bucket, object key inside that bucket, local destination path
s3.download_file("sagemaker-sample-files", "datasets/tabular/synthetic/churn.txt", "data-samples/churn.txt")

In [7]:
# Raise the column display limit so that no column is elided in the preview
pd.options.display.max_columns = 500

# Load the downloaded file, report its (rows, columns) shape and preview it
churn = pd.read_csv("data-samples/churn.txt")
print(churn.shape)
churn

(5000, 21)


Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,PA,163,806,403-2562,no,yes,300,8.162204,3,7.579174,3.933035,4,6.508639,4.065759,100,5.111624,4.928160,6,5.673203,3,True.
1,SC,15,836,158-8416,yes,no,0,10.018993,4,4.226289,2.325005,0,9.972592,7.141040,200,6.436188,3.221748,6,2.559749,8,False.
2,MO,131,777,896-6253,no,yes,300,4.708490,3,4.768160,4.537466,3,4.566715,5.363235,100,5.142451,7.139023,2,6.254157,4,False.
3,WY,75,878,817-5729,yes,yes,700,1.268734,3,2.567642,2.528748,5,2.333624,3.773586,450,3.814413,2.245779,6,1.080692,6,False.
4,WY,146,878,450-4942,yes,no,0,2.696177,3,5.908916,6.015337,3,3.670408,3.751673,250,2.796812,6.905545,4,7.134343,6,True.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,NH,4,787,151-3162,yes,yes,800,10.862632,5,7.250969,6.936164,1,8.026482,4.921314,350,6.748489,4.872570,8,2.122530,9,False.
4996,SD,140,836,351-5993,no,no,0,1.581127,8,3.758307,7.377591,7,1.328827,0.939932,300,4.522661,6.938571,2,4.600473,4,False.
4997,SC,32,836,370-3127,no,yes,700,0.163836,5,4.243980,5.841852,3,2.340554,0.939469,450,5.157898,4.388328,7,1.060340,6,False.
4998,MA,142,776,604-2108,yes,yes,600,2.034454,5,3.014859,4.140554,3,3.470372,6.076043,150,4.362780,7.173376,3,4.871900,7,True.


## Prepare data for training

Drop columns, encode target variable and split data

In [8]:
# Remove the phone number and the four "... Charge" columns in a single pass
churn = churn.drop(
    columns=["Phone", "Day Charge", "Eve Charge", "Night Charge", "Intl Charge"]
)

# Store the area code as object rather than as a number
# (presumably so the get_dummies call in the next cell one-hot encodes it)
churn["Area Code"] = churn["Area Code"].astype(object)

In [9]:
# One-hot encode every non-numeric column.
# The dummy dtype is pinned to uint8: from pandas 2.0 the default is bool, which
# would be written to the CSV files as True/False instead of 1/0.
model_data = pd.get_dummies(churn, dtype=np.uint8)

# Move the positive-class indicator "Churn?_True." to the first column and drop
# the redundant "Churn?_False." indicator, so the target leads each CSV row
# (NOTE(review): the built-in XGBoost CSV input is expected to want the label first - confirm)
model_data = pd.concat(
    [model_data["Churn?_True."], model_data.drop(["Churn?_False.", "Churn?_True."], axis=1)], axis=1
)

  model_data = pd.get_dummies(churn)


In [10]:
# Shuffle all rows reproducibly (fixed seed), then cut the shuffled frame into
# 70% train / 20% validation / 10% test.
# Positional slicing is used instead of np.split, which on a DataFrame goes through
# the deprecated DataFrame.swapaxes path in newer pandas; the resulting rows are the same.
shuffled = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(model_data))
validation_end = int(0.9 * len(model_data))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Write train and validation sets without header or index column;
# test_data is kept in memory only and is not written here
train_data.to_csv("data-samples/customer-churn-xgboost-train.csv", header=False, index=False)
validation_data.to_csv("data-samples/customer-churn-xgboost-validation.csv", header=False, index=False)

In [16]:
# Upload the train and validation CSV files to the S3 bucket used for training.
# Note: on Windows os.path.join inserts "\" between the parts, which would end up
# inside the S3 object key and create the wrong folders. Every "\" is therefore
# replaced by "/" after joining.
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
for key_suffix, local_csv in (
    ("train/train.csv", "data-samples/customer-churn-xgboost-train.csv"),
    ("validation/validation.csv", "data-samples/customer-churn-xgboost-validation.csv"),
):
    object_key = os.path.join(prefix, key_suffix).replace("\\", "/")
    s3_bucket.Object(object_key).upload_file(local_csv)

# Train

In [17]:
# Look up the XGBoost container image URI (version 1.5-1) for the session's region
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.5-1",
)
display(container)

'764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:1.5-1'

In [19]:
# Define training and validation dataset input location.
# Both URIs end with "/" so each one only matches objects inside its own folder;
# a bare ".../train" prefix would also match sibling keys that merely start with "train".
s3_input_train = TrainingInput(s3_data=f"s3://{bucket}/{prefix}/train/", content_type="csv")
s3_input_validation = TrainingInput(s3_data=f"s3://{bucket}/{prefix}/validation/", content_type="csv")

In [20]:
# Build the SageMaker SDK Estimator; calling .fit() on it later launches the training job on AWS
xgb = sagemaker.estimator.Estimator(
    image_uri=container,  # XGBoost container image obtained earlier
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",  # AWS instance type that runs the training
    output_path=f"s3://{bucket}/{prefix}/output",  # where the trained model is stored
    sagemaker_session=sess,
)

# Hyperparameters handed to the XGBoost algorithm
xgb.set_hyperparameters(
    objective="binary:logistic",
    num_round=100,
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
)

# Start training, pointing the "train" and "validation" channels at their S3 inputs
xgb.fit({"train": s3_input_train, "validation": s3_input_validation})

2022-09-25 11:19:58 Starting - Starting the training job...
2022-09-25 11:20:22 Starting - Preparing the instances for trainingProfilerReport-1664104794: InProgress
......
2022-09-25 11:21:32 Downloading - Downloading input data......
2022-09-25 11:22:22 Training - Downloading the training image......
2022-09-25 11:23:33 Uploading - Uploading generated training model[2022-09-25 11:23:22.809 ip-10-0-218-31.eu-west-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2022-09-25:11:23:22:INFO] Imported framework sagemaker_xgboost_container.training
[2022-09-25:11:23:22:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.
Returning the value itself
[2022-09-25:11:23:22:INFO] No GPUs detected (normal if no gpus installed)
[2022-09-25:11:23:22:INFO] Running XGBoost Sagemaker in algorithm mode
[2022-09-25:11:23:22:INFO] Determined delimiter of CSV input is ','
[2022-09-25:11:23:22:INFO] Determined delimiter of CSV input is ','
[2022-09-25:11:23:

The trained model will be stored in the bucket, at the location shown below. In our case, that is:

<code>sagemaker/aws-sagemaker-training/DEMO-xgboost-churn/output/sagemaker-xgboost-2022-09-25-11-19-54-022/output/model.tar.gz</code>

It can be used to deploy an endpoint for real-time inference, or for batch prediction.