In [2]:
import os
import boto3
import re
import sagemaker
import time

# A single SageMaker session is the source of truth for region and bucket.
sess = sagemaker.Session()

# Read the region from that same session instead of a second, throwaway
# boto3.Session(), so `region` cannot disagree with the session that owns
# `bucket`.
region = sess.boto_region_name

# Default S3 bucket associated with this SageMaker session.
bucket = sess.default_bucket()

# Execution role of the current notebook environment; passed to the estimator
# as `role` when the training job is configured.
role = sagemaker.get_execution_role()

In [3]:
# S3 prefix that is handed to the training job as its "train" input channel.
# NOTE(review): the prefix name suggests ~100 GB of multi-file NYC taxi
# Parquet data -- confirm against the bucket contents.
model_training_s3_uri = (
    "s3://dsoaws/"
    "nyc-taxi-orig-cleaned-dropped-parquet-all-years-multiple-files-100GB/"
)

In [4]:
# Open-source XGBoost, distributed training in SageMaker script mode.
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost

# --- Cluster configuration -------------------------------------------------
instance_type = "ml.m5.24xlarge"
instance_count = 12  # number of training instances in the cluster
# Input mode shared by the training channel and the estimator, defined once
# so the two settings cannot drift apart.
training_input_mode = "FastFile"

# --- Input data format -----------------------------------------------------
output_content_type = "parquet"

# MIME type declared on the training channel; anything other than Parquet
# falls back to CSV.
train_content_type = (
    "application/x-parquet" if output_content_type.upper() == "PARQUET"
    else "text/csv"
)

# Training channel. Per the SageMaker SDK docs, ShardedByS3Key gives each
# instance a subset of the S3 objects instead of a full copy of the dataset.
train_input = TrainingInput(
    s3_data=model_training_s3_uri,
    content_type=train_content_type,
    distribution="ShardedByS3Key",
    input_mode=training_input_mode,
)

# --- Hyperparameters -------------------------------------------------------
hyperparameters = {
    "eta": "0.2",
    "gamma": "4",
    "max_depth": "5",
    "min_child_weight": "6",
    "num_round": "50",
    "objective": "reg:squarederror",
    "subsample": "0.7",
    "verbosity": "2",
    # Same value as `output_content_type` ("parquet"), referenced rather than
    # repeated. Presumably consumed by the entry-point script -- verify
    # against code/xgboost-nyctaxi-parquet.py.
    "content_type": output_content_type,
}

# No custom metric_definitions are passed to the estimator.

# --- Estimator -------------------------------------------------------------
xgb_script_mode_estimator = XGBoost(
    entry_point="code/xgboost-nyctaxi-parquet.py",
    framework_version="1.5-1",
    hyperparameters=hyperparameters,
    role=role,
    instance_count=instance_count,
    instance_type=instance_type,
    volume_size=600,  # GB of storage volume attached to each instance
    input_mode=training_input_mode,
)

In [5]:
%%time

# m5.24xlarge (6 incl leader node) - 12.85 minutes??
# m5.24xlarge (20 incl leader node), 1TB - ?? minutes on 
training_job_results = xgb_script_mode_estimator.fit({"train": train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-03-10-07-02-02-822


2023-03-10 07:02:03 Starting - Starting the training job...
2023-03-10 07:02:21 Starting - Preparing the instances for training............
2023-03-10 07:04:11 Downloading - Downloading input data...
2023-03-10 07:05:02 Training - Training image download completed. Training in progress...[35m[2023-03-10 07:05:13.714 ip-10-0-153-103.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[35m[2023-03-10 07:05:13.770 ip-10-0-153-103.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[35m[2023-03-10:07:05:14:INFO] Imported framework sagemaker_xgboost_container.training[0m
[35m[2023-03-10:07:05:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2023-03-10:07:05:14:INFO] Invoking user training script.[0m
[35m[2023-03-10:07:05:14:INFO] Module xgboost-nyctaxi-parquet does not provide a setup.py. [0m
[35mGenerating setup.py[0m
[35m[2023-03-10:07:05:14:INFO] Generating setup.cfg[0m
[35m[2023-03-10:07:05:14:INFO] Genera

KeyboardInterrupt: 