### Notebook Env Details
`kernel`: Python 3
`image`: Data Science 3.0
`instance`: ml.t3.medium

<div style="background-color: darkgreen; font-size: 20px; color: white;">
Setup

In [1]:
# Ensure updated SageMaker SDK version
# NOTE(review): no version is pinned here, so a later re-run may install a
# different SDK release than the one this notebook was executed with.
%pip install -U -q sagemaker

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import boto3
import re
import sagemaker

# Execution role and AWS region, both obtained from the SageMaker SDK
role = sagemaker.get_execution_role()
region = sagemaker.Session().boto_region_name

# S3 bucket and key prefix used below to build the data and output locations
bucket = 'sagemaker-us-east-1-717145514721'
prefix = 'nyc-taxi'

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
# Get Stored Variables
# `%store -r` restores variables previously persisted with `%store`;
# the bare `%store` that follows lists the stored variables and their values.
%store -r
%store

Stored variables and their in-db values:
data_bucket             -> 'sagemaker-us-east-1-717145514721/nyc-taxi/data/pr


<div style="background-color: darkgreen; font-size: 20px; color: white;">
Train Script

In [4]:
%%writefile scripts/xgb_train.py

import argparse
import json
import logging
import os
import pickle as pkl
import boto3
from io import StringIO

import pandas as pd
import xgboost as xgb
from sagemaker_containers import entry_point
from sagemaker_xgboost_container import distributed
from sagemaker_xgboost_container.data_utils import get_dmatrix

#=========================================================================================
#                                      FUNCTIONS
#=========================================================================================

def _xgb_train(params, dtrain, evals, num_boost_round, model_dir, is_master):
    """Train an XGBoost booster and, on the master host, pickle it into ``model_dir``.

    This is the function handed to ``distributed.rabit_run`` for multi-host
    training; it is also called directly for single-node training.

    :param params: Booster parameters forwarded to ``xgb.train``.
    :param dtrain: Training data forwarded to ``xgb.train``.
    :param evals: Evaluation watchlist forwarded to ``xgb.train``.
    :param num_boost_round: Number of boosting rounds forwarded to ``xgb.train``.
    :param model_dir: Directory in which the ``xgboost-model`` file is written.
    :param is_master: True if current node is master host in distributed training,
                        or is running single node training job.
                        Note that rabit_run will include this argument.
    """

    logging.basicConfig(level=logging.DEBUG)

    booster = xgb.train(params=params, dtrain=dtrain, evals=evals, num_boost_round=num_boost_round)

    # Only the master host persists the model so hosts do not write the same artifact.
    if is_master:
        model_location = model_dir + "/xgboost-model"
        # Context manager guarantees the file is flushed and closed even if pickling fails.
        with open(model_location, "wb") as model_file:
            pkl.dump(booster, model_file)
        logging.info("Stored trained model at {}".format(model_location))

#=========================================================================================
#                                      RUN
#=========================================================================================

if __name__ == "__main__":
    # Script entry point: parse hyperparameters and SageMaker settings, load the
    # train/validation channels, then train on one host or across hosts via rabit.
    parser = argparse.ArgumentParser()

    # ARGUMENTS
    parser.add_argument(
        "--max_depth",
        type=int,
    )
    parser.add_argument("--eta", type=float)
    parser.add_argument("--gamma", type=float)
    parser.add_argument("--min_child_weight", type=float)
    parser.add_argument("--subsample", type=float)
    parser.add_argument("--verbosity", type=int)
    parser.add_argument("--objective", type=str)
    parser.add_argument("--num_round", type=int)
    parser.add_argument("--tree_method", type=str, default="auto")
    parser.add_argument("--predictor", type=str, default="auto")
    parser.add_argument("--batch_size", type=int, default=64)

    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument("--output_data_dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR"))
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--validation", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
    parser.add_argument("--sm_hosts", type=str, default=os.environ.get("SM_HOSTS"))
    parser.add_argument("--sm_current_host", type=str, default=os.environ.get("SM_CURRENT_HOST"))

    # Unknown arguments are ignored rather than treated as errors.
    args, _ = parser.parse_known_args()

    # DATA INPUT: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    # A channel path is None when neither the CLI flag nor its SM_CHANNEL_* variable
    # is set; skip loading in that case so the None checks below can handle it.
    # NOTE(review): assumes get_dmatrix does not accept a None path — confirm.
    dtrain = get_dmatrix(args.train, 'csv') if args.train else None
    dval = get_dmatrix(args.validation, 'csv') if args.validation else None
    watchlist = [(dtrain, 'train'), (dval, 'validation')] if dval is not None else [(dtrain, 'train')]

    # TRAINING ARGUMENTS: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    train_hp = {
        "max_depth": args.max_depth,
        "eta": args.eta,
        "gamma": args.gamma,
        "min_child_weight": args.min_child_weight,
        "subsample": args.subsample,
        "verbosity": args.verbosity,
        "objective": args.objective,
        "tree_method": args.tree_method,
        "predictor": args.predictor,
    }

    # Keyword arguments for _xgb_train; "is_master" is added per execution mode below.
    xgb_train_args = dict(
        params=train_hp,
        dtrain=dtrain,
        evals=watchlist,
        num_boost_round=args.num_round,
        model_dir=args.model_dir,
    )

    # DISTRIBUTED TRAINING - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    # Get SageMaker host information from runtime environment variables
    sm_hosts = json.loads(args.sm_hosts)
    sm_current_host = args.sm_current_host

    if len(sm_hosts) > 1:
        # Wait until all hosts are able to find each other
        entry_point._wait_hostname_resolution()

        # Execute training function after initializing rabit.
        distributed.rabit_run(
            exec_fun=_xgb_train,
            args=xgb_train_args,
            include_in_training=(dtrain is not None),
            hosts=sm_hosts,
            current_host=sm_current_host,
            update_rabit_args=True,
        )
    else:
        # If single node training, call training method directly.
        # Explicit None check, matching the checks used above, instead of relying
        # on the truthiness of the loaded data object.
        if dtrain is None:
            raise ValueError("Training channel must have data to train model.")
        xgb_train_args["is_master"] = True
        _xgb_train(**xgb_train_args)

# Trained Model Export Function
def model_fn(model_dir):
    """Deserialize and return the fitted model stored in ``model_dir``.

    The file name must match the one written by ``_xgb_train`` ("xgboost-model").

    :param model_dir: Directory containing the pickled model file.
    :return: The unpickled model object.
    """
    model_path = os.path.join(model_dir, "xgboost-model")
    # NOTE(review): unpickling can execute arbitrary code; only load artifacts
    # that were produced by a trusted training job.
    # Context manager closes the file handle, which the bare open() call leaked.
    with open(model_path, "rb") as model_file:
        return pkl.load(model_file)

Overwriting training_scripts/xgb_train.py


<div style="background-color: darkviolet; font-size: 15px; color: white;">
(Optional) Create Data Subset for Train-job Testing

<div style="background-color: darkgreen; font-size: 20px; color: white;">
Training Setup

In [5]:
# S3 location under which the training job stores its output
bucket = 'sagemaker-us-east-1-717145514721'
prefix = 'nyc-taxi'
output_path = "s3://" + "/".join([bucket, prefix, "training-runs", "output"])
print(f"Output Path: {output_path}")

# Instance type for the training job and content type of the input channels
instance_type, content_type = "ml.m5.xlarge", "csv"

Output Path: s3://sagemaker-us-east-1-717145514721/nyc-taxi/xgboost-runs/output


<div style="background-color: teal; font-size: 15px; color: white;">
Create An Estimator

In [8]:
from sagemaker.xgboost.estimator import XGBoost

# Hyperparameters passed to the estimator; all values are given as strings.
hyperparameters = dict(
    max_depth="3",
    eta="0.3",
    gamma="1.9",
    min_child_weight="71",
    subsample="0.7",
    objective="reg:squarederror",
    num_round="50",
    verbosity="2",
)

In [9]:
# Open Source distributed script mode
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost

# NOTE(review): this session is created but not passed to the estimator below.
session = Session()

# Training entry point; same path the %%writefile cell above writes to.
script_path = "scripts/xgb_train.py"

# Script-mode XGBoost estimator: runs script_path on two instances of
# instance_type and stores the job output under output_path.
xgb_script_mode_estimator = XGBoost(
    entry_point=script_path,
    framework_version="1.7-1",  # Note: framework_version is mandatory
    hyperparameters=hyperparameters,
    role=role,
    instance_count=2,
    instance_type=instance_type,
    output_path=output_path,
    input_mode='File'
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


<div style="background-color: teal; font-size: 15px; color: white;">
Full Data Path

In [10]:
# S3 prefixes holding the processed train and validation data
train_s3_uri = f"s3://{bucket}/{prefix}/data/processed/train/"
validation_s3_uri = f"s3://{bucket}/{prefix}/data/processed/validation/"

# One input channel per dataset, both using the notebook-wide content type
train_input = TrainingInput(train_s3_uri, content_type=content_type)
validation_input = TrainingInput(validation_s3_uri, content_type=content_type)

<div style="background-color: darkviolet; font-size: 15px; color: white;">
(Optional) Partial Data Path

<div style="background-color: darkgreen; font-size: 20px; color: white;">
Call Fit

In [11]:
# Start the training job, supplying the "train" and "validation" input channels
xgb_script_mode_estimator.fit({"train": train_input, "validation": validation_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-12-06-17-25-59-061


Using provided s3_resource
2023-12-06 17:25:59 Starting - Starting the training job...
2023-12-06 17:26:15 Starting - Preparing the instances for training.........
2023-12-06 17:27:38 Downloading - Downloading input data...
2023-12-06 17:28:18 Training - Downloading the training image...
2023-12-06 17:28:44 Training - Training image download completed. Training in progress..[34m[2023-12-06 17:28:55.648 ip-10-0-235-99.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-12-06 17:28:55.671 ip-10-0-235-99.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-12-06:17:28:56:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-12-06:17:28:56:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-12-06:17:28:56:INFO] Invoking user training script.[0m
[34m[2023-12-06:17:28:56:INFO] Module xgb_train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2023-12-06:17:28: