In [2]:
import time
import uuid
import boto3
import sagemaker

# SageMaker SDK session used for the SDK-level calls in this notebook.
sess = sagemaker.Session()

# Region is taken from a default boto3 session; it is used later to look up
# the training image.
boto_session = boto3.Session()
region = boto_session.region_name

# Default S3 bucket of the SageMaker session.
bucket = sess.default_bucket()

# IAM role that the training job will run under.
role = sagemaker.get_execution_role()

In [3]:
# S3 prefix used as the "train" channel of the training job.
# NOTE(review): going by the prefix name this is ~100GB of NYC taxi data split
# across multiple Parquet files — the contents are not verified in this notebook.
model_training_s3_uri = (
    "s3://dsoaws/"
    "nyc-taxi-orig-cleaned-dropped-parquet-all-years-multiple-files-100GB/"
)

### Configure the algorithm and training job

The cell below sets the training input channel and the hyperparameters for the training job. For more information
on XGBoost hyperparameters, see https://xgboost.readthedocs.io/en/latest/parameter.html.

In [4]:
# Look up the XGBoost training image (version "1.5-1") for the current region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.5-1",
)

# Format of the training data; drives the content type declared on the channel.
output_content_type = "parquet"

# Parquet data is declared as "application/x-parquet"; anything else falls
# back to CSV.
if output_content_type.upper() == "PARQUET":
    train_content_type = "application/x-parquet"
else:
    train_content_type = "text/csv"

# Training channel definition.
# - ShardedByS3Key: per the SageMaker docs, the S3 objects under the prefix are
#   split across the training instances instead of being fully replicated.
# - FastFile: the channel is read with the FastFile input mode.
# NOTE(review): the recorded run of this notebook failed inside the container
# with "Parquet file size is 0 bytes" for '/opt/ml/input/data/train/15-'.
# Confirm the prefix contains only valid, non-empty Parquet objects (and that
# this input mode suits the container's Parquet reader) before re-running.
train_input = sagemaker.inputs.TrainingInput(
    s3_data=model_training_s3_uri,
    distribution="ShardedByS3Key",
    content_type=train_content_type,
    input_mode="FastFile",
)

# XGBoost hyperparameters for the training job. Values are kept as strings,
# exactly as they are handed to the estimator.
hyperparameters = dict(
    eta="0.2",
    gamma="4",
    max_depth="5",
    min_child_weight="6",
    num_round="50",
    objective="reg:squarederror",
    subsample="0.7",
)

### Start the Training Job

The training job is configured with the SageMaker Python SDK `Estimator`, which is then fit on
the training data produced by the Processing Job that was run earlier.

In [5]:
# Generic SageMaker Estimator wrapping the XGBoost container.
# The cluster is 12 x ml.m5.24xlarge; volume_size is the storage volume
# requested per instance (in GB, per the SageMaker SDK docs).
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type="ml.m5.24xlarge",
    instance_count=12,
    volume_size=200,
    input_mode="FastFile",
    hyperparameters=hyperparameters,
)

In [6]:
# `time` and `sagemaker` are already imported in the first cell of the
# notebook, so they are not re-imported here.

# Earlier timing note: 12.85 minutes - m5.24xlarge (6 nodes) - 100GB
# NOTE(review): the estimator is configured with instance_count=12, not 6 —
# confirm which cluster size this timing refers to.

# Launch the training job and block until it finishes. If the job fails, fit()
# raises (e.g. UnexpectedStatusException) and the lines below are not reached.
estimator.fit({"train": train_input})

# Estimator.fit() returns None outside a PipelineSession, so the job's results
# are read from the job handle instead of from fit()'s return value.
training_job_results = estimator.latest_training_job.describe()
print(
    training_job_results["TrainingJobName"],
    training_job_results["TrainingJobStatus"],
)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-03-10-06-57-33-386


2023-03-10 06:57:33 Starting - Starting the training job...
2023-03-10 06:57:51 Starting - Preparing the instances for training.........
2023-03-10 06:59:10 Downloading - Downloading input data...
2023-03-10 06:59:46 Training - Training image download completed. Training in progress....[36m[2023-03-10 07:00:17.915 ip-10-2-242-58.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[36m[2023-03-10 07:00:17.972 ip-10-2-242-58.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[36m[2023-03-10:07:00:18:INFO] Imported framework sagemaker_xgboost_container.training[0m
[36m[2023-03-10:07:00:18:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[36mReturning the value itself[0m
[36m[2023-03-10:07:00:18:INFO] No GPUs detected (normal if no gpus installed)[0m
[36m[2023-03-10:07:00:18:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[36m[2023-03-10:07:00:18:INFO] Determined 0 GPU(s) available on th

UnexpectedStatusException: Error for Training job sagemaker-xgboost-2023-03-10-06-57-33-386: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.8/site-packages/sagemaker_xgboost_container/data_utils.py", line 410, in _get_parquet_dmatrix_file_mode
    table = pq.read_table(files_path)
  File "/miniconda3/lib/python3.8/site-packages/pyarrow/parquet.py", line 1594, in read_table
    return dataset.read(columns=columns, use_threads=use_threads,
  File "/miniconda3/lib/python3.8/site-packages/pyarrow/parquet.py", line 1473, in read
    table = self._dataset.to_table(
  File "pyarrow/_dataset.pyx", line 399, in pyarrow._dataset.Dataset.to_table
  File "pyarrow/_dataset.pyx", line 1994, in pyarrow._dataset.Scanner.to_table
  File "pyarrow/error.pxi", line 122, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow/error.pxi", line 99, in pyarrow.lib.check_status
OSError: Could not open parquet input source '/opt/ml/input/data/train/15-': Invalid: Parquet file size is 0 bytes

During handling of the above exception, another exception occurred:

Traceba