In [None]:
!pip install -U sagemaker



In [None]:
import os
import boto3
import re
import sagemaker
import time

# Build the SageMaker session once and reuse it for every lookup below,
# instead of constructing a fresh Session for each attribute.
sagemaker_session = sagemaker.Session()

role = sagemaker.get_execution_role()
region = sagemaker_session.boto_region_name

bucket = sagemaker_session.default_bucket()

# Each run gets its own prefix, stamped with the current UTC time.
prefix = f'gsml-nyc-taxi-2019-script-mode/ml/experiment-{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}'
print(f'Output S3 prefix: s3://{bucket}/{prefix}')


### Write the XGBoost training script

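The source for the training script is in the cell below. The cell uses the `%%writefile` directive to save it locally as `xgboost-nyctaxi-parquet.py`. The script parses the hyperparameters, loads the train and validation channels, and trains an XGBoost model, either on a single host or distributed across several hosts.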


In [None]:
%%writefile ./xgboost-nyctaxi-parquet.py
import argparse
import json
import logging
import os
import time
import pandas as pd
import numpy as np
import pickle as pkl
from typing import Dict, List, Tuple
# from time import time
from contextlib import contextmanager

from sagemaker_containers import entry_point
from sagemaker_xgboost_container.data_utils import get_dmatrix
from sagemaker_xgboost_container import distributed

import xgboost as xgb

# Upgrade the SageMaker SDK before importing it.
# NOTE(review): presumably the SDK preinstalled in the container is too old for
# `sagemaker.experiments.run` — confirm, and consider pinning a version.
import subprocess
import sys

# `sys.executable -m pip` installs into the interpreter that is running this script;
# a bare `pip` on PATH is not guaranteed to belong to the same environment.
_pip_upgrade = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-U", "sagemaker"],
    check=False,  # best effort: keep going and let the imports below decide
)
if _pip_upgrade.returncode != 0:
    print(f'WARNING: `pip install -U sagemaker` exited with code {_pip_upgrade.returncode}')

import boto3
from sagemaker.session import Session
from sagemaker.experiments.run import Run, load_run


# Module-level sessions bound to the region named by the AWS_REGION environment variable
# (a missing variable raises KeyError here).
boto_session = boto3.session.Session(region_name=os.environ["AWS_REGION"])
sagemaker_session = Session(boto_session=boto_session)

# Ref: https://xgboost.readthedocs.io/en/stable/python/examples/callbacks.html#sphx-glr-python-examples-callbacks-py
class ExperimentLogging(xgb.callback.TrainingCallback):
    '''Training callback that reports the latest evaluation metrics after each boosting round.

    Metrics are printed only when ``is_master`` is true; on any other host the
    callback does nothing. The callback never asks XGBoost to stop training.
    '''
    def __init__(self,
                 is_master):
        # True on the host that should report metrics.
        self.is_master = is_master

    def _get_key(self, data, metric):
        '''Return the metric name used in the output, formatted as ``<data>:<metric>``.'''
        return f'{data}:{metric}'

    def after_iteration(self, model, epoch: int, evals_log):
        '''Print the most recent value of every evaluation metric.

        :param model: Booster being trained (unused).
        :param epoch: Index of the boosting round that just finished.
        :param evals_log: Mapping of dataset name -> metric name -> history of values.
        :return: Always ``False`` so that training continues.
        '''
        if not self.is_master:
            return False

        # NOTE(review): the script imports `load_run` but nothing calls it; presumably
        # these metrics were meant to be sent to a SageMaker Experiments run — confirm.
        try:
            for data, metric in evals_log.items():
                for metric_name, log in metric.items():
                    if not log:
                        # No value recorded yet for this metric.
                        continue
                    key = self._get_key(data, metric_name)
                    print(f'[{epoch}] {key}={log[-1]}')
        except Exception as error:
            # Reporting is best effort: a failure here must not abort training.
            print(f'ERROR (`after_iteration()`): {error}')

        # Returned outside of `finally` so KeyboardInterrupt/SystemExit still propagate.
        return False
    
# Ref: https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_abalone_dist_script_mode.html
def _xgb_train(params, dtrain, evals, num_boost_round, model_dir, is_master, feval=None):
    """Train a booster with ``xgb.train`` and, on the master host, pickle it into ``model_dir``.

    This is the function handed to ``distributed.rabit_run`` for multi-host jobs and
    called directly for single-host jobs.

    :param params: Booster parameters forwarded to ``xgb.train``.
    :param dtrain: Training data forwarded to ``xgb.train``.
    :param evals: Evaluation watchlist forwarded to ``xgb.train``.
    :param num_boost_round: Number of boosting rounds.
    :param model_dir: Directory the trained model is written to.
    :param is_master: True if current node is master host in distributed training,
                        or is running single node training job.
                        Note that rabit_run includes this argument.
    :param feval: Accepted for interface compatibility; it is not forwarded to ``xgb.train``.
    """
    exp_logging = ExperimentLogging(is_master)
    booster = xgb.train(params=params,
                        dtrain=dtrain,
                        evals=evals,
                        num_boost_round=num_boost_round,
                        callbacks=[exp_logging],
                       )

    if is_master:
        # Only the master writes the artifact. The file name is the one `model_fn` loads.
        os.makedirs(model_dir, exist_ok=True)
        model_location = os.path.join(model_dir, 'xgboost-model')
        with open(model_location, 'wb') as model_handle:
            pkl.dump(booster, model_handle)
        logging.info('Stored trained model at %s', model_location)

if __name__ == '__main__':
    # Training entry point: parse hyperparameters and SageMaker settings, load the
    # train/validation channels, then train on one host or across all listed hosts.
    parser = argparse.ArgumentParser()

    # Hyperparameters are described here.
    parser.add_argument('--max_depth', type=int,)
    parser.add_argument('--eta', type=float)
    # gamma and min_child_weight are real-valued in XGBoost; parsing them as float
    # still accepts whole numbers such as "4" while no longer rejecting e.g. "0.5".
    parser.add_argument('--gamma', type=float)
    parser.add_argument('--min_child_weight', type=float)
    parser.add_argument('--subsample', type=float)
    parser.add_argument('--verbosity', type=int)
    parser.add_argument('--objective', type=str)
    parser.add_argument('--num_round', type=int)
    parser.add_argument('--tree_method', type=str, default="auto")
    parser.add_argument('--predictor', type=str, default="auto")
    parser.add_argument('--content_type', type=str, default="")

    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument('--output_data_dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))
    parser.add_argument('--sm_hosts', type=str, default=os.environ.get('SM_HOSTS'))
    parser.add_argument('--sm_current_host', type=str, default=os.environ.get('SM_CURRENT_HOST'))

    # Unknown arguments are ignored rather than treated as errors.
    args, _ = parser.parse_known_args()

    # Get SageMaker host information from runtime environment variables
    sm_hosts = json.loads(args.sm_hosts)
    sm_current_host = args.sm_current_host

    dtrain = get_dmatrix(args.train, args.content_type)
    dval = get_dmatrix(args.validation, args.content_type)
    # Evaluate on the validation set only when one was loaded.
    watchlist = [(dtrain, 'train'), (dval, 'validation')] if dval is not None else [(dtrain, 'train')]

    train_hp = {
        'max_depth': args.max_depth,
        'eta': args.eta,
        'gamma': args.gamma,
        'min_child_weight': args.min_child_weight,
        'subsample': args.subsample,
        'verbosity': args.verbosity,
        'objective': args.objective,
        'tree_method': args.tree_method,
        'predictor': args.predictor,
    }

    # Keyword arguments for _xgb_train; `is_master` is supplied separately below.
    xgb_train_args = dict(
        params=train_hp,
        dtrain=dtrain,
        evals=watchlist,
        num_boost_round=args.num_round,
        model_dir=args.model_dir)

    if len(sm_hosts) > 1:
        # Wait until all hosts are able to find each other
        entry_point._wait_hostname_resolution()

        # Execute training function after initializing rabit.
        distributed.rabit_run(
            exec_fun=_xgb_train,
            args=xgb_train_args,
            include_in_training=(dtrain is not None),
            hosts=sm_hosts,
            current_host=sm_current_host,
            update_rabit_args=True
        )
    else:
        # If single node training, call training method directly.
        # Explicit None check, matching the `include_in_training` test above.
        if dtrain is not None:
            xgb_train_args['is_master'] = True
            _xgb_train(**xgb_train_args)
        else:
            raise ValueError("Training channel must have data to train model.")


def model_fn(model_dir):
    """Deserialize and return the fitted model stored in ``model_dir``.

    The model is read from the file ``xgboost-model``; this name must match the
    name the training code uses when it serializes the model.

    :param model_dir: Directory containing the pickled model file.
    :return: The unpickled model object.
    """
    model_file = 'xgboost-model'
    # NOTE: unpickling can execute arbitrary code; only load trusted model artifacts.
    # The context manager closes the file even if unpickling fails.
    with open(os.path.join(model_dir, model_file), 'rb') as model_handle:
        booster = pkl.load(model_handle)
    return booster

In [None]:
# Hyperparameters for the training script; the names match its argparse options.
# Values are deliberately kept as strings.
hyperparams = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    num_round="50",
    verbosity="2",
    content_type="parquet",
)

instance_type = "ml.m5.24xlarge"
content_type = "application/x-parquet"    #! IMPORTANT: have to be this for Experiment API to work, validation regex: ^[-\w]+\/[-\w+]+$

# Training output goes under this run's prefix in the session bucket.
output_path = f"s3://{bucket}/{prefix}/nyctaxi-dist-xgb/output"


print(f'Output path: {output_path}')


In [None]:
# S3 location of the dataset. The folder must contain "train" and "validation"
# subfolders holding the corresponding datasets; replace it with your own source.

dataset_s3_uri = 's3://dsoaws/nyc-taxi-orig-cleaned-dropped-parquet-2019'

In [None]:
# Open Source distributed script mode
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.debugger import Rule, rule_configs

from IPython.display import FileLink, FileLinks

session = Session()
script_path = "./xgboost-nyctaxi-parquet.py"

# Script-mode estimator that runs the training script on three instances.
xgb_script_mode_estimator = XGBoost(
    entry_point=script_path,
    framework_version="1.7-1",  # Note: framework_version is mandatory
    hyperparameters=hyperparams,
    role=role,
    instance_count=3,
    instance_type=instance_type,
    output_path=output_path,
)


def _sharded_channel(s3_uri):
    """Build a TrainingInput for `s3_uri` that uses the 'ShardedByS3Key' distribution."""
    return TrainingInput(
        s3_uri,
        content_type=content_type,
        distribution='ShardedByS3Key',  # testing
    )


train_input = _sharded_channel(f"{dataset_s3_uri}/train")
validation_input = _sharded_channel(f"{dataset_s3_uri}/validation")

# Show the channel configuration that will be sent with the training job.
for heading, channel in (
    ('Training input config', train_input),
    ('Validation input config', validation_input),
):
    print(heading)
    print(channel.config)

In [None]:
%%time

# Start the training job with the "train" and "validation" channels.
xgb_script_mode_estimator.fit(
    dict(train=train_input, validation=validation_input)
)
    

