In [2]:
# !pip install -U sagemaker

In [3]:
import os
import boto3
import re
import sagemaker
import time

# Create one SageMaker session and reuse it for every lookup below. The
# original code instantiated a fresh Session() for the region and again for the
# bucket (and get_execution_role() builds another one when none is supplied).
sagemaker_session = sagemaker.Session()

# IAM role the training job will run under, and the AWS region of the session.
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)
region = sagemaker_session.boto_region_name

# Default S3 bucket associated with this session; used for training output.
bucket = sagemaker_session.default_bucket()

# Timestamped (UTC) prefix so each run of the notebook writes to its own location.
prefix = f'gsml-nyc-taxi-full-script-mode/ml/test1-{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}'
print(f'Output S3 prefix: s3://{bucket}/{prefix}')


Output S3 prefix: s3://sagemaker-us-east-1-079002598131/gsml-nyc-taxi-full-script-mode/ml/test1-2023-03-05-18-05-56


In [4]:
# Data format of the input channels. Defined once and referenced from the
# hyperparameters so the value handed to the training script and the value
# used for the channel content type cannot drift apart.
content_type = "parquet"

# Hyperparameters handed to the estimator (all values are passed as strings).
# "content_type" is not an XGBoost parameter; presumably it is read by the
# entry-point script to pick its data loader — verify against the script.
hyperparams = {
    "eta": "0.2",
    "gamma": "4",
    "max_depth": "5",
    "min_child_weight": "6",
    "num_round": "50",
    "objective": "reg:squarederror",
    "subsample": "0.7",
    "verbosity": "2",
    "content_type": content_type,
}

instance_type = "ml.m5.24xlarge"

# Training artifacts are written under the run-specific prefix.
output_path = f"s3://{bucket}/{prefix}/nyctaxi-dist-xgb/output"

print(f'Output path: {output_path}')


Output path: s3://sagemaker-us-east-1-079002598131/gsml-nyc-taxi-full-script-mode/ml/test1-2023-03-05-18-05-56/nyctaxi-dist-xgb/output


In [5]:
# Common S3 location of the exported dataset; the training and validation
# splits live in sibling folders underneath it.
_export_root = (
    's3://dsoaws/gsml-nyc-taxi-full-etl-ml-test-4-custompyspark-export-s3-via-notebook'
    '/export-flow-2023-03-02-03-32-10-53926e35/output'
)
train_set_s3_uri = f'{_export_root}/training/'
validation_set_s3_uri = f'{_export_root}/validation/'

# Echo both locations so the run's inputs are visible in the notebook output.
print(
    f"Training input data path: {train_set_s3_uri}",
    f"Validation input data path: {validation_set_s3_uri}",
    sep="\n",
)

Training input data path: s3://dsoaws/gsml-nyc-taxi-full-etl-ml-test-4-custompyspark-export-s3-via-notebook/export-flow-2023-03-02-03-32-10-53926e35/output/training/
Validation input data path: s3://dsoaws/gsml-nyc-taxi-full-etl-ml-test-4-custompyspark-export-s3-via-notebook/export-flow-2023-03-02-03-32-10-53926e35/output/validation/


# TODO:  FIND A WAY TO ONLY TRAIN ON 70% OF THE DATA!!

# TODO:  CHANGE THIS TO /train INSTEAD OF /training!!

In [6]:
# Open Source distributed script mode
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost

session = Session()

# Entry-point script that is run inside the XGBoost framework container.
script_path = "code/xgboost-nyctaxi-parquet.py"

# Metric definitions: each regex is applied to the training job's log output
# and the captured group is published under the given metric name.
# NOTE(review): these patterns expect "loss:" / "accuracy:" style log lines.
# Confirm the entry-point script actually prints them (the objective is a
# regression one). Also note that "loss: ..." is a substring of
# "val_loss: ...", so the train patterns may match validation lines as well.
metrics_definitions = [
    {"Name": "train:loss", "Regex": "loss: ([0-9\\.]+)"},
    {"Name": "train:accuracy", "Regex": "accuracy: ([0-9\\.]+)"},
    {"Name": "validation:loss", "Regex": "val_loss: ([0-9\\.]+)"},
    {"Name": "validation:accuracy", "Regex": "val_accuracy: ([0-9\\.]+)"},
]

xgb_script_mode_estimator = XGBoost(
    entry_point=script_path,
    framework_version="1.5-1",  # Note: framework_version is mandatory
    hyperparameters=hyperparams,
    role=role,
    instance_count=6,
    instance_type=instance_type,
    output_path=output_path,
    # The SDK keyword is `metric_definitions` (singular "metric"). The previous
    # spelling `metrics_definitions` was not a recognised parameter, so the
    # definitions above never reached the training job.
    metric_definitions=metrics_definitions,
    # Reuse the session created above instead of letting the estimator build
    # its own default one.
    sagemaker_session=session,
)

# Both channels use the same settings: objects under the S3 prefix are sharded
# across the training instances by key, and are streamed via FastFile mode.
train_input = TrainingInput(
    train_set_s3_uri,
    content_type=content_type,
    distribution='ShardedByS3Key',
    input_mode='FastFile',
)
validation_input = TrainingInput(
    validation_set_s3_uri,
    content_type=content_type,
    distribution='ShardedByS3Key',
    input_mode='FastFile',
)

print('Training input config')
print(train_input.config)

print('Validation input config')
print(validation_input.config)

Training input config
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://dsoaws/gsml-nyc-taxi-full-etl-ml-test-4-custompyspark-export-s3-via-notebook/export-flow-2023-03-02-03-32-10-53926e35/output/training/', 'S3DataDistributionType': 'ShardedByS3Key'}}, 'ContentType': 'parquet', 'InputMode': 'FastFile'}
Validation input config
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://dsoaws/gsml-nyc-taxi-full-etl-ml-test-4-custompyspark-export-s3-via-notebook/export-flow-2023-03-02-03-32-10-53926e35/output/validation/', 'S3DataDistributionType': 'ShardedByS3Key'}}, 'ContentType': 'parquet', 'InputMode': 'FastFile'}


In [None]:
%%time

# Launch the distributed training job and block until it finishes, streaming
# its logs into the cell output. Only the "train" channel is supplied; the
# validation channel is currently commented out.
xgb_script_mode_estimator.fit(
    {
        "train": train_input,
        # "validation": validation_input,
    }
)

# Estimator.fit() returns None, so assigning its result always stored None.
# Keep the same variable name but fill it with the description of the job
# that was just run, so downstream cells have something usable.
training_job_results = xgb_script_mode_estimator.latest_training_job.describe()

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-03-05-18-05-57-204


2023-03-05 18:05:57 Starting - Starting the training job......
2023-03-05 18:06:54 Starting - Preparing the instances for training............
2023-03-05 18:08:43 Downloading - Downloading input data......
2023-03-05 18:09:49 Training - Training image download completed. Training in progress.[35m[2023-03-05 18:09:50.730 ip-10-0-165-177.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[35m[2023-03-05:18:09:51:INFO] Imported framework sagemaker_xgboost_container.training[0m
[35m[2023-03-05:18:09:51:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2023-03-05:18:09:51:INFO] Invoking user training script.[0m
[35m[2023-03-05:18:09:51:INFO] Module xgboost-nyctaxi-parquet does not provide a setup.py. [0m
[35mGenerating setup.py[0m
[35m[2023-03-05:18:09:51:INFO] Generating setup.cfg[0m
[35m[2023-03-05:18:09:51:INFO] Generating MANIFEST.in[0m
[35m[2023-03-05:18:09:51:INFO] Installing module with the following command:[0m
[35m/miniconda3/bin/p