# Imports

In [2]:
import os
import shutil
import pandas as pd
import sagemaker as sm

from sagemaker.inputs import TrainingInput
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.spark.processing import PySparkProcessor

## SageMaker Parameters

In [3]:
role              = sm.get_execution_role()
sagemaker_session = sm.session.Session()
region            = sagemaker_session._region_name
bucket            = sagemaker_session.default_bucket()

# Data Preparation

## Reset File Structure

In [4]:
try:
    shutil.rmtree('csv_bucket')
except:
    pass
os.mkdir('csv_bucket')

## Load Dataset

In [5]:
training_path_raw = f's3://{bucket}/Training_BOP.csv'
df = pd.read_csv(training_path_raw).head(100_000)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,...,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,1026827,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-99.0,-99.0,0.0,No,No,No,Yes,No,No
1,1043384,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.99,0.99,0.0,No,No,No,Yes,No,No
2,1043696,2.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-99.0,-99.0,0.0,Yes,No,No,Yes,No,No
3,1043852,7.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.1,0.13,0.0,No,No,No,Yes,No,No
4,1044048,8.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-99.0,-99.0,0.0,Yes,No,No,Yes,No,No


In [6]:
_ = [print(c) for c in df.columns]

sku
national_inv
lead_time
in_transit_qty
forecast_3_month
forecast_6_month
forecast_9_month
sales_1_month
sales_3_month
sales_6_month
sales_9_month
min_bank
potential_issue
pieces_past_due
perf_6_month_avg
perf_12_month_avg
local_bo_qty
deck_risk
oe_constraint
ppap_risk
stop_auto_buy
rev_stop
went_on_backorder


## Featurize String Columns

In [7]:
columns = list(df.columns)
binary_columns = ['went_on_backorder','deck_risk','oe_constraint','ppap_risk','stop_auto_buy','rev_stop','potential_issue']
for col in binary_columns:
    df[col] = df[col].map({'No':0, 'Yes':1})

## Format Data for XGBoost

In [8]:
target_column = 'went_on_backorder'
columns.remove(target_column)
columns.remove('sku')
data = df[[target_column] + columns].dropna()
data.head()

Unnamed: 0,went_on_backorder,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,...,potential_issue,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop
1,0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.99,0.99,0.0,0,0,0,1,0
3,0,7.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.1,0.13,0.0,0,0,0,1,0
5,0,13.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.82,0.87,0.0,0,0,0,1,0
7,0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,1,0,1,1,0
9,0,4.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.82,0.87,0.0,0,0,0,1,0


## Sample and Upload Data

In [9]:
n_files = 500
n_rows = 1000
for i in range(n_files):
    data_i = data.sample(n_rows)
    data_i.to_csv(f'./csv_bucket/data_{i}.csv', header=False, index=False)
    sagemaker_session.upload_data(f'./csv_bucket/data_{i}.csv', key_prefix='csv_bucket')

## Process and Store Testing Data

NOTE: We will also remove the target column and store it separately.

In [10]:
testing_path_raw = f's3://{bucket}/Testing_BOP.csv'
df = pd.read_csv(testing_path_raw).head(50_000)

columns = list(df.columns)
binary_columns = ['went_on_backorder','deck_risk','oe_constraint','ppap_risk','stop_auto_buy','rev_stop','potential_issue']
for col in binary_columns:
    df[col] = df[col].map({'No':0, 'Yes':1})

target_column = 'went_on_backorder'
columns.remove(target_column)
columns.remove('sku')
data_X = df[columns]
data_y = df[[target_column]]

data_X.to_csv(f'./testing_X.csv', header=False, index=False)
sagemaker_session.upload_data(f'./testing_X.csv', key_prefix='testing')
data_y.to_csv(f'./testing_y.csv', header=False, index=False)
sagemaker_session.upload_data(f'./testing_y.csv', key_prefix='testing')

  exec(code_obj, self.user_global_ns, self.user_ns)


f's3://{bucket}/testing/testing_y.csv'

# Lab: Distributed Training with XGBoost

## Set Training Data

In [11]:
training_data_s3 = TrainingInput(s3_data=f's3://{bucket}/csv_bucket/', content_type='csv', distribution='ShardedByS3Key')
print(training_data_s3)

<sagemaker.inputs.TrainingInput object at 0x7f1b68331b90>


## Retrieve XGBoost Container Location

In [12]:
container = sm.image_uris.retrieve("xgboost", region, "latest")
container

'811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'

## Define XGBoost Model and Parameters

In [13]:
model = sm.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f's3://{bucket}/output/',
    sagemaker_session=sagemaker_session,
)

model.set_hyperparameters(
    objective = "binary:logistic",
    tree_method = "approx",
    num_round = 50, 
)

### Train the XGBoost Model

In [14]:
model.fit({"train": training_data_s3})

INFO:sagemaker:Creating training-job with name: xgboost-2023-03-30-00-26-52-164


2023-03-30 00:26:53 Starting - Starting the training job......
2023-03-30 00:27:49 Starting - Preparing the instances for training......
2023-03-30 00:29:02 Downloading - Downloading input data...
2023-03-30 00:29:32 Training - Downloading the training image......
2023-03-30 00:30:17 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2023-03-30:00:30:32:INFO] Running standalone xgboost training.[0m
[34m[2023-03-30:00:30:32:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2023-03-30:00:30:32:INFO] File size need to be processed in the node: 38.41mb. Available memory size in the node: 8611.7mb[0m
[34m[2023-03-30:00:30:32:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:30:32] S3DistributionType set as ShardedByS3Key[0m
[34m[00:30:33] 500000x21 matrix with 10500000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[00:30:33] Tree method is selected to be 'approx'[

### Create Batch Transformer

In [15]:
model_transformer = model.transformer(
    instance_count=5,
    instance_type="ml.m5.xlarge",
    assemble_with="Line",
    accept="text/csv",
    max_payload=20
)

INFO:sagemaker:Creating model with name: xgboost-2023-03-30-00-32-18-978


### Transform Testing Data

In [16]:
model_transformer.transform(f's3://{bucket}/testing/testing_X.csv', content_type="text/csv", split_type='Line')
model_transformer.wait()
model_transformer.output_path

INFO:sagemaker:Creating transform job with name: xgboost-2023-03-30-00-32-19-657


............................[32mArguments: serve[0m
[32m[2023-03-30 00:36:57 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[32m[2023-03-30 00:36:57 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[32m[2023-03-30 00:36:57 +0000] [1] [INFO] Using worker: gevent[0m
[32m[2023-03-30 00:36:57 +0000] [21] [INFO] Booting worker with pid: 21[0m
[32m[2023-03-30 00:36:57 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[32m[2023-03-30:00:36:57:INFO] Model loaded successfully for worker : 21[0m
[32m[2023-03-30 00:36:57 +0000] [23] [INFO] Booting worker with pid: 23[0m
[32m[2023-03-30 00:36:57 +0000] [24] [INFO] Booting worker with pid: 24[0m
  monkey.patch_all(subprocess=True)[0m
[32m[2023-03-30:00:36:57:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[32m[2023-03-30:00:36:57:INFO] Model loaded successfully for worker : 23[0m
  monkey.patch_all(subprocess=True)[0m
[32m[2023-03-30

's3://sagemaker-us-east-1-047840628716/xgboost-2023-03-30-00-32-19-657'

In [17]:
model_transformer.output_path

's3://sagemaker-us-east-1-047840628716/xgboost-2023-03-30-00-32-19-657'

### View Predictions

In [18]:
responses = pd.read_csv(f'{model_transformer.output_path}/testing_X.csv.out', header=None, names=['predictions'])
responses.head()

Unnamed: 0,predictions
0,1.6e-05
1,7.8e-05
2,0.00011
3,0.000446
4,0.000385


In [19]:
model = sm.model.Model(
    container,
    role=role,
    model_data='s3://sagemaker-us-east-1-047840628716/output/xgboost-2023-03-22-23-23-46-878/output/model.tar.gz',
)

In [20]:
model_transformer = model.transformer(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    assemble_with="Line",
    accept="text/csv",
    max_payload=20
)

INFO:sagemaker:Creating model with name: xgboost-2023-03-30-00-38-29-135


In [21]:
model_transformer.transform('s3://sagemaker-us-east-1-047840628716/testing/testing_X.csv', content_type="text/csv", split_type='Line')
model_transformer.wait()
model_transformer.output_path

INFO:sagemaker:Creating transform job with name: xgboost-2023-03-30-00-38-29-822


...........................
[34mArguments: serve[0m
[35mArguments: serve[0m
[34m[2023-03-30 00:42:59 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2023-03-30 00:42:59 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2023-03-30 00:42:59 +0000] [1] [INFO] Using worker: gevent[0m
[35m[2023-03-30 00:42:59 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2023-03-30 00:42:59 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2023-03-30 00:42:59 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2023-03-30 00:42:59 +0000] [21] [INFO] Booting worker with pid: 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2023-03-30:00:42:59:INFO] Model loaded successfully for worker : 21[0m
[34m[2023-03-30 00:42:59 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2023-03-30:00:42:59:INFO] Model loaded successfully for worker : 22[0m
[34m[2023-03-30 00:42:59 +0000] [23] [INFO] Booting worker with pid: 23[

's3://sagemaker-us-east-1-047840628716/xgboost-2023-03-30-00-38-29-822'

In [22]:
testing_predictions = f'{model_transformer.output_path}/testing_X.csv.out'
responses = pd.read_csv(testing_predictions, header=None, names=['predictions'])
responses.head()

Unnamed: 0,predictions
0,4.594525e-09
1,1.059714e-06
2,3.56121e-07
3,6.188358e-08
4,4.197955e-06


### Train XGBoost Model with Multiple Instances

In [23]:
model = sm.estimator.Estimator(
    container,
    role,
    instance_count=2,
    instance_type="ml.m4.xlarge",
    output_path=f's3://{bucket}/output/',
    sagemaker_session=sagemaker_session,
)

model.set_hyperparameters(
    objective = "binary:logistic", #multi:softmax", #NOTE: "binary:logistic" only predicts probabilities!
    tree_method = "approx",
    num_round = 50, 
)

model.fit({"train": training_data_s3})

INFO:sagemaker:Creating training-job with name: xgboost-2023-03-30-00-43-37-194


2023-03-30 00:43:40 Starting - Starting the training job...
2023-03-30 00:44:05 Starting - Preparing the instances for training.........
2023-03-30 00:45:31 Downloading - Downloading input data...
2023-03-30 00:46:06 Training - Downloading the training image...
2023-03-30 00:46:37 Training - Training image download completed. Training in progress..[35mArguments: train[0m
[34mArguments: train[0m
[34m[2023-03-30:00:46:48:INFO] Running distributed xgboost training.[0m
[34m[2023-03-30:00:46:48:INFO] Number of hosts: 2, master IP address: 10.2.240.230, host IP address: 10.2.240.230.[0m
[34m[2023-03-30:00:46:48:INFO] Finished Yarn configuration files setup.[0m
[35m[2023-03-30:00:46:49:INFO] Running distributed xgboost training.[0m
[35m[2023-03-30:00:46:49:INFO] Number of hosts: 2, master IP address: 10.2.240.230, host IP address: 10.2.201.133.[0m
[35m[2023-03-30:00:46:49:INFO] Finished Yarn configuration files setup.[0m
[35mstarting datanode, logging to /opt/amazon/hadoop/lo

In [24]:
model = sm.estimator.Estimator(
    container,
    role,
    instance_count=5,
    instance_type="ml.m4.xlarge",
    output_path=f's3://{bucket}/output/',
    sagemaker_session=sagemaker_session,
)

model.set_hyperparameters(
    objective = "binary:logistic", #multi:softmax", #NOTE: "binary:logistic" only predicts probabilities!
    tree_method = "approx",
    num_round = 50, 
)

model.fit({"train": training_data_s3})

INFO:sagemaker:Creating training-job with name: xgboost-2023-03-30-00-49-12-853


2023-03-30 00:49:13 Starting - Starting the training job...
2023-03-30 00:49:38 Starting - Preparing the instances for training............
2023-03-30 00:51:40 Downloading - Downloading input data..............[35mArguments: train[0m
[32mArguments: train[0m
[32m[2023-03-30:00:54:06:INFO] Running distributed xgboost training.[0m
[32m[2023-03-30:00:54:06:INFO] Number of hosts: 5, master IP address: 10.0.231.37, host IP address: 10.0.193.198.[0m
[32m[2023-03-30:00:54:06:INFO] Finished Yarn configuration files setup.[0m
[32mstarting datanode, logging to /opt/amazon/hadoop/logs/hadoop--datanode-ip-10-0-193-198.ec2.internal.out[0m
[35m[2023-03-30:00:54:06:INFO] Running distributed xgboost training.[0m
[35m[2023-03-30:00:54:06:INFO] Number of hosts: 5, master IP address: 10.0.231.37, host IP address: 10.0.227.208.[0m
[35m[2023-03-30:00:54:06:INFO] Finished Yarn configuration files setup.[0m
[35mstarting datanode, logging to /opt/amazon/hadoop/logs/hadoop--datanode-ip-10-0-2