# XGBoost demand regression on SageMaker: training and batch transform

In [2]:

import sys
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', 2000)

import sklearn.model_selection

import joblib
import pickle
import tempfile


import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

# SageMaker session used by the jobs launched further down.
sagemaker_session = sagemaker.Session()
# IAM execution role, resolved through the helper imported at the top of the notebook.
role = get_execution_role()


# Name of the S3 bucket that the data URIs and job output path are built from.
bucket = "sagemaker-us-west-2-479862333671"


In [3]:
# Resolve the built-in XGBoost 1.0-1 image for the current region.
# `sagemaker.image_uris.retrieve` is the SDK v2 replacement for the deprecated
# `get_image_uri` (which now only emits a rename warning and forwards here).
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='1.0-1',
)

# Training and validation channels, both read as CSV from the project bucket
# (the `*_no_header.csv` objects under `data/`).
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}'.format(bucket, 'data/train_no_header.csv'),
    content_type='csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}'.format(bucket, 'data/val_no_header.csv'),
    content_type='csv',
)

# Display the channel configuration of both inputs as the cell result.
(vars(s3_input_train), vars(s3_input_validation))

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


({'config': {'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://sagemaker-us-west-2-479862333671/data/train.csv',
     'S3DataDistributionType': 'FullyReplicated'}},
   'ContentType': 'csv'}},
 {'config': {'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://sagemaker-us-west-2-479862333671/data/val.csv',
     'S3DataDistributionType': 'FullyReplicated'}},
   'ContentType': 'csv'}})

In [4]:
# S3 key prefix used when building the estimator's output path.
# NOTE(review): 'perdictions' looks like a misspelling of 'predictions'; left
# unchanged because existing artifacts may already live under this key — confirm.
prefix = 'perdictions'

In [5]:
# Where the training job uploads its output: s3://<bucket>/<prefix>/output
training_output_uri = 's3://{}/{}/output'.format(bucket, prefix)

# Generic estimator wrapping the XGBoost container on a single ml.m4.2xlarge instance.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.2xlarge',
    output_path=training_output_uri,
    sagemaker_session=sagemaker_session,
)


In [6]:
# Hyperparameters for the training job, gathered in one mapping and
# forwarded to the estimator unchanged.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'num_round': 20,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [None]:
# Map each channel name to its S3 input, then launch the training job.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb.fit(training_channels)

2022-01-21 19:49:43 Starting - Starting the training job...
2022-01-21 19:50:08 Starting - Launching requested ML instancesProfilerReport-1642794583: InProgress
......
2022-01-21 19:51:08 Starting - Preparing the instances for training.........
2022-01-21 19:52:31 Downloading - Downloading input data........................
2022-01-21 19:56:32 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value rmse to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m

In [None]:
# Build a batch transformer from the fitted estimator (one ml.c5.4xlarge instance).
xgb_transformer = xgb.transformer(
    instance_count=1,
    instance_type='ml.c5.4xlarge',
)

In [None]:
# S3 URI of the CSV that the batch transform job will score.
test_key = 'data/test.csv'
test_location = 's3://' + bucket + '/' + test_key

In [None]:
# Start the batch transform on the test CSV, splitting the payload line by line.
xgb_transformer.transform(
    data=test_location,
    content_type='text/csv',
    split_type='Line',
)

In [None]:
# Wait on the transform job started above before reading its output.
xgb_transformer.wait()

In [None]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

In [None]:
# Load the downloaded transform output; the file is read without a header row.
predictions_path = os.path.join(data_dir, 'test.csv.out')
Y_pred = pd.read_csv(predictions_path, header=None)

In [None]:
# Read the 'demand' column of data/val.csv straight from S3 as the ground truth.
# NOTE(review): the predictions in this notebook are produced from data/test.csv,
# but the labels here come from data/val.csv — confirm both files contain the
# same rows in the same order, otherwise the error metric below is meaningless.
Y_test = pd.read_csv('s3://{}/{}'.format(bucket, 'data/val.csv'))['demand']

In [None]:
# Display the ground-truth values as the cell output.
Y_test

In [None]:
# Display the predictions as the cell output.
Y_pred

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Mean squared error between the ground truth and the predictions.
mean_squared_error(y_true=Y_test, y_pred=Y_pred)