# XGBoost Model Training

Now that the data has been processed, we are ready to use it to train our model.

In [1]:

import sys
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', 2000)

import sklearn.model_selection

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

# SageMaker session and execution role shared by every job in this notebook.
sagemaker_session = sagemaker.Session()
role = get_execution_role()


# Name of the S3 bucket used for the input data and the training output.
bucket = 'sagemaker-us-west-2-479862333671'


In [2]:
# Resolve the built-in XGBoost 1.0-1 container image for the current region.
# `get_image_uri` is deprecated in sagemaker>=2 and only emits a rename warning,
# so its replacement, `sagemaker.image_uris.retrieve`, is called directly.
container = sagemaker.image_uris.retrieve(framework='xgboost',
                                          region=boto3.Session().region_name,
                                          version='1.0-1')

# Training and validation inputs: CSV files stored under data/ in the bucket.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}'.format(bucket, 'data/train_no_header.csv'), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}'.format(bucket, 'data/val_no_header.csv'), content_type='csv')

# Display both input configurations to confirm the S3 URIs and content type.
(vars(s3_input_train), vars(s3_input_validation))

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


({'config': {'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://sagemaker-us-west-2-479862333671/data/train_no_header.csv',
     'S3DataDistributionType': 'FullyReplicated'}},
   'ContentType': 'csv'}},
 {'config': {'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://sagemaker-us-west-2-479862333671/data/val_no_header.csv',
     'S3DataDistributionType': 'FullyReplicated'}},
   'ContentType': 'csv'}})

In [3]:
# Key prefix under which the training output is written in the bucket.
# NOTE(review): 'perdictions' looks like a misspelling of 'predictions'; renaming it
# would change the S3 output location, so confirm before fixing.
prefix = 'perdictions'

In [4]:
# Estimator that runs the XGBoost container on a single ml.m4.2xlarge instance
# and writes its output under s3://<bucket>/<prefix>/output.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.2xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sagemaker_session,
)


In [5]:
# Hyperparameters for the training job, grouped by purpose.
xgb.set_hyperparameters(
    # Learning task: squared-error regression, evaluated with RMSE.
    objective='reg:squarederror',
    eval_metric='rmse',
    num_round=20,
    # Tree-growth and sampling settings.
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    # Logging flag passed through to the container.
    silent=0,
)

In [6]:
# Run the training job, passing the two S3 inputs as the 'train' and 'validation' channels.
xgb.fit(inputs={
    'train': s3_input_train,
    'validation': s3_input_validation,
})

2022-01-22 10:50:26 Starting - Starting the training job...
2022-01-22 10:50:52 Starting - Preparing the instances for trainingProfilerReport-1642848625: InProgress
......
2022-01-22 10:51:56 Downloading - Downloading input data.....................
2022-01-22 10:55:26 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value rmse to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[

In [7]:
# Create a batch transformer from the trained estimator, configured for one ml.c5.4xlarge instance.
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.c5.4xlarge')

In [8]:
# S3 URI of the test file (data/test_no_header.csv) that the transformer will score.
test_location = 's3://{}/{}'.format(bucket, 'data/test_no_header.csv')

In [9]:
# Submit the batch transform job for the test file, sent as CSV and split line by line.
xgb_transformer.transform(
    data=test_location,
    content_type='text/csv',
    split_type='Line',
)

...........................[34m[2022-01-22:11:30:13:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-01-22:11:30:13:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-01-22:11:30:13:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }
    location /

In [10]:
# Wait for the batch transform job before reading its output.
xgb_transformer.wait()

[34m[2022-01-22:11:30:13:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-01-22:11:30:13:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-01-22:11:30:13:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[35m[2022-01-22:11:30:13:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2022-01-22:11:30:13:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2022-01-22:11:30:13:INFO] nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/stderr;[0m
[35mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 808

In [11]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

Note: AWS CLI version 2, the latest major version of the AWS CLI, is now stable and recommended for general use. For more information, see the AWS CLI version 2 installation instructions at: https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html

usage: aws [options] <command> <subcommand> [<subcommand> ...] [parameters]
To see help text, you can run:

  aws help
  aws <command> help
  aws <command> <subcommand> help
aws: error: the following arguments are required: paths


In [15]:
# Location of the predictions: the transformer's output path.
data_dir = xgb_transformer.output_path

In [16]:
# Load the predictions; header=None because the output file is read without a header row.
# NOTE(review): os.path.join is applied to what is presumably an S3 URI, which relies on
# '/' being the OS path separator — confirm this notebook only runs on POSIX hosts.
Y_pred = pd.read_csv(os.path.join(data_dir, 'test_no_header.csv.out'), header=None)

In [17]:
# Preview the loaded predictions.
Y_pred

Unnamed: 0,0
0,0.471162
1,0.517790
2,0.536669
3,0.517790
4,0.471162
...,...
823225,1.077206
823226,0.504309
823227,1.342117
823228,1.404573


In [21]:
# Actual values: the 'demand' column of data/val.csv.
# NOTE(review): the predictions were produced from data/test_no_header.csv, while the
# labels are read from data/val.csv — presumably the same rows in the same order; confirm.
Y_test = pd.read_csv('s3://{}/{}'.format(bucket, 'data/val.csv'))['demand']

In [22]:
# Preview the actual demand values.
Y_test

0         0
1         0
2         0
3         2
4         0
         ..
823225    1
823226    1
823227    0
823228    1
823229    5
Name: demand, Length: 823230, dtype: int64

In [29]:
# Put the actual values and the predictions side by side, aligned on the row index.
Y = pd.concat([Y_test, Y_pred], axis='columns')

In [31]:
# Descriptive labels for the two columns of the combined frame.
col_names = ['actual', 'prediction']

# Relabel the columns in place (first: actual values, second: predictions).
Y.columns = col_names

In [32]:
# Preview actual values next to predictions.
Y

Unnamed: 0,actual,prediction
0,0,0.471162
1,0,0.517790
2,0,0.536669
3,2,0.517790
4,0,0.471162
...,...,...
823225,1,1.077206
823226,1,0.504309
823227,0,1.342117
823228,1,1.404573


In [33]:
from sklearn.metrics import mean_squared_error

In [34]:
# Mean squared error of the predictions against the actual demand values.
mean_squared_error(y_true=Y_test, y_pred=Y_pred)

6.131172588601535