In [40]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

In [41]:
import sagemaker 
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

In [42]:
session = sagemaker.Session()

In [43]:
boston = load_boston()

In [44]:
X_bos = pd.DataFrame(boston.data, columns = boston.feature_names)
Y_bos = pd.DataFrame(boston.target)

In [45]:
X_train, X_test, Y_train, Y_test = train_test_split(X_bos, Y_bos, test_size=0.30)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.30)

In [46]:
data_dir = './boston_data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

X_test.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)
pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [47]:
prefix = 'boston-ML'


test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

In [48]:
container = get_image_uri(session.boto_region_name, 'xgboost')

	get_image_uri(region, 'xgboost', '0.90-2').


In [49]:
role = get_execution_role()

In [50]:
xgb = sagemaker.estimator.Estimator(container, role, train_instance_count=1, train_instance_type='ml.m4.xlarge', output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix), sagemaker_session=session)

In [51]:
xgb.set_hyperparameters(max_depth=5,eta=0.2,gamma=4,min_child_weight=6,subsample=0.8,objective='reg:linear',early_stopping_rounds=10,num_round=200)

In [52]:
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')
xgb.fit({'train': s3_input_train, 'validation':s3_input_validation})

2020-05-09 20:44:10 Starting - Starting the training job...
2020-05-09 20:44:12 Starting - Launching requested ML instances......
2020-05-09 20:45:12 Starting - Preparing the instances for training...
2020-05-09 20:46:04 Downloading - Downloading input data...
2020-05-09 20:46:23 Training - Downloading the training image.[34mArguments: train[0m
[34m[2020-05-09:20:46:44:INFO] Running standalone xgboost training.[0m
[34m[2020-05-09:20:46:44:INFO] File size need to be processed in the node: 0.03mb. Available memory size in the node: 8477.84mb[0m
[34m[2020-05-09:20:46:44:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:46:44] S3DistributionType set as FullyReplicated[0m
[34m[20:46:44] 247x13 matrix with 3211 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-05-09:20:46:44:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:46:44] S3DistributionType set as FullyReplicated[0m
[34m[20:46:44] 107x13 matrix with 1391 


2020-05-09 20:46:56 Uploading - Uploading generated training model
2020-05-09 20:46:56 Completed - Training job completed
Training seconds: 52
Billable seconds: 52


In [53]:
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [54]:
xgb_transformer.wait()

....................[34mArguments: serve[0m
[34m[2020-05-09 20:50:26 +0000] [1] [INFO] Starting gunicorn 19.7.1[0m
[34m[2020-05-09 20:50:26 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2020-05-09 20:50:26 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2020-05-09 20:50:26 +0000] [38] [INFO] Booting worker with pid: 38[0m
[34m[2020-05-09 20:50:26 +0000] [39] [INFO] Booting worker with pid: 39[0m
[34m[2020-05-09 20:50:27 +0000] [40] [INFO] Booting worker with pid: 40[0m
[34m[2020-05-09:20:50:27:INFO] Model loaded successfully for worker : 38[0m
[34m[2020-05-09:20:50:27:INFO] Model loaded successfully for worker : 39[0m
[34m[2020-05-09 20:50:27 +0000] [41] [INFO] Booting worker with pid: 41[0m
[34m[2020-05-09:20:50:27:INFO] Model loaded successfully for worker : 40[0m
[34m[2020-05-09:20:50:27:INFO] Model loaded successfully for worker : 41[0m

[34m[2020-05-09:20:50:45:INFO] Sniff delimiter as ','[0m
[34m[2020-05-09:20:50:45:INFO] Determined del

In [55]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

Completed 2.1 KiB/2.1 KiB (36.6 KiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-2-735391510666/xgboost-2020-05-09-20-47-22-331/test.csv.out to boston_data/test.csv.out


In [56]:
Y_pred = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
error = np.sqrt(mse(Y_pred,Y_test))
print(error)

3.833756654124735


In [57]:
xgb_predictor = xgb.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')



-------------!

In [58]:
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

In [59]:
Y_pred = xgb_predictor.predict(X_test.values).decode('utf-8')

In [60]:
Y_pred = np.fromstring(Y_pred, sep=',')

In [61]:
Y_pred = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
error = np.sqrt(mse(Y_pred,Y_test))
print(error)

3.833756654124735


In [62]:
xgb_predictor.endpoint

'xgboost-2020-05-09-20-44-10-379'