In [1]:
import numpy as np
import pandas as pd
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

In [2]:
# Specify your bucket name
bucket_name = 'estellaliuml'

# Key prefixes inside the bucket, one per dataset split (each ends with '/').
training_folder = 'model/training/'
validation_folder = 'model/validation/'
test_folder = 'model/test/'

# Full S3 URIs: where the trained model artifact is written, and where each
# dataset split is read from.
s3_model_output_location = f's3://{bucket_name}/model/model'
s3_training_file_location = f's3://{bucket_name}/{training_folder}'
s3_validation_file_location = f's3://{bucket_name}/{validation_folder}'
s3_test_file_location = f's3://{bucket_name}/{test_folder}'

In [3]:
# Sanity check: show every S3 URI assembled above, one per line.
print(
    s3_model_output_location,
    s3_training_file_location,
    s3_validation_file_location,
    s3_test_file_location,
    sep='\n',
)

s3://estellaliuml/model/model
s3://estellaliuml/model/training/
s3://estellaliuml/model/validation/
s3://estellaliuml/model/test/


In [4]:
def write_to_s3(filename, bucket, key):
    """Upload a local file to an S3 object.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Object key to write inside ``bucket``.

    Returns:
        The result of the underlying ``upload_fileobj`` call.
    """
    # Binary mode so the bytes are streamed to S3 unchanged.
    with open(filename, 'rb') as stream:
        s3_resource = boto3.Session().resource('s3')
        destination = s3_resource.Bucket(bucket).Object(key)
        return destination.upload_fileobj(stream)

In [5]:
# Managed spot training settings.
use_spot_instances = True
max_run = 3600  # in seconds

job_name = 'xgboost-v1'

if use_spot_instances:
    max_wait = 7200  # in seconds
    # NOTE(review): checkpoints are written under 'bikerental/' while every
    # other location in this notebook lives under 'model/' - confirm that this
    # prefix is intended.
    checkpoint_s3_uri = f's3://{bucket_name}/bikerental/checkpoints/{job_name}'
else:
    # Without spot instances there is no wait limit and no checkpoint location.
    max_wait = None
    checkpoint_s3_uri = None

print(f'Checkpoint uri: {checkpoint_s3_uri}')

Checkpoint uri: s3://estellaliuml/bikerental/checkpoints/xgboost-v1


In [6]:
# SageMaker session; later cells read its region (boto_region_name) and hand it
# to the Estimator.
sess = sagemaker.Session()

In [7]:
# Execution role of this notebook environment; it is passed to the Estimator
# when the training job is configured.
role = get_execution_role()

In [8]:
# NOTE(review): the printed role ARN includes the AWS account ID - consider
# clearing this cell's output before sharing or committing the notebook.
print(role)

arn:aws:iam::107541254819:role/service-role/AmazonSageMaker-ExecutionRole-20210808T115899


In [9]:
# Look up the URI of the built-in XGBoost training image for the session's
# region, pinned to a specific version.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.2-2",
)

print(f'Using XGBoost Container {container}')

Using XGBoost Container 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-2


In [10]:
# Configure the training job: which image to run, on what hardware, where the
# model artifact goes, and the spot-training limits defined earlier.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=sess,
    base_job_name=job_name,
    # Training hardware.
    instance_count=1,
    instance_type='ml.m5.xlarge',
    # Destination for the trained model artifact.
    output_path=s3_model_output_location,
    # Spot-instance settings (max_wait / checkpoint_s3_uri are None when spot
    # training is disabled).
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

In [11]:
# XGBoost hyperparameters for the training job.
estimator.set_hyperparameters(**{
    "max_depth": 5,
    "objective": "reg:squarederror",
    "eta": 0.1,
    "num_round": 150,
})

In [12]:
# Display the hyperparameters currently set on the estimator.
estimator.hyperparameters()

{'max_depth': 5, 'objective': 'reg:squarederror', 'eta': 0.1, 'num_round': 150}

In [13]:
# Both channels are CSV files addressed by S3 prefix; only the location differs,
# so build the two input configs with the same settings.
training_input_config, validation_input_config = [
    sagemaker.session.TrainingInput(
        s3_data=s3_location,
        content_type='csv',
        s3_data_type='S3Prefix',
    )
    for s3_location in (s3_training_file_location, s3_validation_file_location)
]

# Channel name -> input config, as passed to estimator.fit().
data_channels = dict(train=training_input_config,
                     validation=validation_input_config)

In [14]:
# Show the request configuration generated for each input channel.
print(
    training_input_config.config,
    validation_input_config.config,
    sep='\n',
)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://estellaliuml/model/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://estellaliuml/model/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


In [15]:
# Launch the training job on the 'train' and 'validation' channels; the job's
# log output is streamed into this cell.
estimator.fit(data_channels)

2021-08-26 13:29:35 Starting - Starting the training job...
2021-08-26 13:29:37 Starting - Launching requested ML instancesProfilerReport-1629984575: InProgress
......
2021-08-26 13:30:54 Starting - Preparing the instances for training.........
2021-08-26 13:32:31 Downloading - Downloading input data...
2021-08-26 13:32:51 Training - Downloading the training image..[34m[2021-08-26 13:33:11.503 ip-10-2-96-101.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-08-26:13:33:11:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-08-26:13:33:11:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2021-08-26:13:33:11:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-08-26:13:33:11:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2021-08-26:13:33:11:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-08-26:13:33:11:INFO] Deter

[34m[108]#011train-rmse:9.35254#011validation-rmse:9.35401[0m
[34m[109]#011train-rmse:9.34938#011validation-rmse:9.35078[0m
[34m[110]#011train-rmse:9.34882#011validation-rmse:9.35013[0m
[34m[111]#011train-rmse:9.34620#011validation-rmse:9.34771[0m
[34m[112]#011train-rmse:9.34368#011validation-rmse:9.34512[0m
[34m[113]#011train-rmse:9.34283#011validation-rmse:9.34412[0m
[34m[114]#011train-rmse:9.33851#011validation-rmse:9.34007[0m
[34m[115]#011train-rmse:9.33566#011validation-rmse:9.33740[0m
[34m[116]#011train-rmse:9.33249#011validation-rmse:9.33421[0m
[34m[117]#011train-rmse:9.32887#011validation-rmse:9.33061[0m
[34m[118]#011train-rmse:9.32503#011validation-rmse:9.32683[0m
[34m[119]#011train-rmse:9.32062#011validation-rmse:9.32245[0m
[34m[120]#011train-rmse:9.31668#011validation-rmse:9.31860[0m
[34m[121]#011train-rmse:9.31287#011validation-rmse:9.31470[0m
[34m[122]#011train-rmse:9.30678#011validation-rmse:9.30873[0m
[34m[123]#011train-rmse:9.30419#011vali

In [16]:
# Host the trained model behind a real-time endpoint named after the job.
predictor = estimator.deploy(
    endpoint_name=job_name,
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

-------------!