# New Section

In [1]:

import sys
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', 2000)

import sklearn.model_selection

import joblib
import pickle
import tempfile


import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()


# S3 bucket name
bucket = 'sagemaker-us-west-2-479862333671'


In [2]:
container = get_image_uri(boto3.Session().region_name, 'xgboost', '1.0-1')

s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}'.format(bucket, 'data/train.csv'), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}'.format(bucket, 'data/val.csv'), content_type='csv')

(vars(s3_input_train), vars(s3_input_validation))

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


({'config': {'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://sagemaker-us-west-2-479862333671/data/train.csv',
     'S3DataDistributionType': 'FullyReplicated'}},
   'ContentType': 'csv'}},
 {'config': {'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://sagemaker-us-west-2-479862333671/data/val.csv',
     'S3DataDistributionType': 'FullyReplicated'}},
   'ContentType': 'csv'}})

In [3]:
prefix = 'perdictions'

In [4]:
xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    instance_count=1, 
                                    instance_type='ml.m4.2xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sagemaker_session)


In [5]:
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='reg:squarederror',
                        eval_metric='rmse',
                        num_round=20)

In [6]:
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2022-01-16 11:06:35 Starting - Starting the training job...
2022-01-16 11:06:58 Starting - Launching requested ML instancesProfilerReport-1642331195: InProgress
...
2022-01-16 11:07:32 Starting - Preparing the instances for training.........
2022-01-16 11:09:01 Downloading - Downloading input data............
2022-01-16 11:10:59 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value rmse to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Deter

In [7]:
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.c5.4xlarge')

In [24]:
test_location = 's3://{}/{}'.format(bucket, 'data/test.csv')

In [32]:
test = pd.read_csv('s3://{}/{}'.format(bucket, 'data/test.csv'))
test

Unnamed: 0,1437,3,1,0.1,0.2,1914,30,4,4.1,2,0.3,0.4,0.5,8.38,25,17,4.2,5,4.3,0.0,0.0.1,0.0.2,1.0,0.0.3,0.0.4,0.0.5,0.0.6,0.0.7,0.0.8
0,1437,3,1,0,0,1915,30,4,4,2,0,0,0,8.38,26,17,4,5,4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1437,3,1,0,0,1916,30,4,4,2,0,0,0,8.38,27,17,4,5,4,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1437,3,1,0,0,1917,30,4,4,2,0,0,0,8.38,28,17,4,5,4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1437,3,1,0,0,1918,30,4,4,2,0,0,0,8.38,29,17,4,5,5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1438,3,1,0,0,1914,30,4,4,2,0,0,0,3.97,25,17,4,5,4,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
823224,1432,2,0,9,2,1940,30,4,4,2,0,0,0,2.98,21,20,5,5,3,0.0,0.0,0.0,0.0,0.0,1.0,0.5,1.0,1.0,1.0
823225,1433,2,0,9,2,1940,30,4,4,2,0,0,0,2.48,21,20,5,5,3,3.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
823226,1434,2,0,9,2,1940,30,4,4,2,0,0,0,3.98,21,20,5,5,3,5.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0
823227,1435,2,0,9,2,1940,30,4,4,2,0,0,0,1.28,21,20,5,5,3,0.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0


In [28]:
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

...........................
[34m[2022-01-16:11:58:56:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-01-16:11:58:56:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-01-16:11:58:56:INFO] nginx config: [0m
[35m[2022-01-16:11:58:56:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2022-01-16:11:58:56:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2022-01-16:11:58:56:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {

UnexpectedStatusException: Error for Transform job sagemaker-xgboost-2022-01-16-11-54-40-593: Failed. Reason: ClientError: See job logs for more information

In [None]:
xgb_transformer.wait()

In [None]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

In [None]:
Y_pred = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)

In [None]:
Y_test = pd.read_csv('s3://{}/{}'.format(bucket, 'data/val.csv'))['demand']

In [None]:
Y_test

In [None]:
Y_pred

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
mean_squared_error(Y_test, Y_pred)