In [1]:
# import libraries
import sagemaker
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer


Get execution role and region. It also creates an XGBoost container

In [2]:
role = get_execution_role()
my_region = boto3.session.Session().region_name
prefix = 'sagemaker/DEMO-xgboost-dm'

print('Role:   '+role+'\n'+'Region: '+my_region+'\n'+'')

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "latest")

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + xgboost_container + " container for your SageMaker endpoint.")

Role:   arn:aws:iam::852800376493:role/service-role/AmazonSageMaker-ExecutionRole-20220726T135083
Region: us-east-1

Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


Create a s3 bucket for your data. You would need your access key and secret key for this step. Omitted here

In [4]:
# List files contained within a bucket
access_key = 'AKIA4NDW5B2WWAALDZZV'
access_secret = 'W27EgxVCl1HFLoa4Ql8uD+5hQL0Xb/TCrdEGg7Lt'
bucket_name = 'sagemaker-tutorial-jt-intro-rev03'

client_s3 = boto3.resource(
    's3',
    aws_access_key_id = access_key, 
    aws_secret_access_key = access_secret
    )
try:
    if  my_region == 'us-east-1':
      client_s3.create_bucket(Bucket=bucket_name)
    else: 
      client_s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)
    
#client_s3.list_buckets()['Buckets']

S3 bucket created successfully


At this point you can go back to aws console, go to s3 and you should be able to see the bucket you just created
Next we download the data:


In [5]:
try:
  urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
  print('Success: downloaded bank_clean.csv.')
except Exception as e:
  print('Data load error: ',e)

Success: downloaded bank_clean.csv.


We now read the data into the data frame

In [6]:
try:
  model_data = pd.read_csv('./bank_clean.csv',index_col=0)
  print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: Data loaded into dataframe.


Let split the data into 70% training and 30% test. Then check size of dataset 

In [7]:
train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


Let us inspect the table

In [10]:
pd.set_option("display.max_columns", None) # display all columns
model_data.head(5)

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,marital_unknown,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,default_no,default_unknown,default_yes,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,contact_cellular,contact_telephone,month_apr,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0


Move ground truth to first column, convert back to csv and save as train.csv to local folder

In [11]:
pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)


Upload file train.csv to folder sagemaker/DEMO-xgboost-dm/train/train.csv

In [12]:
boto3.Session().resource('s3',aws_access_key_id = access_key, aws_secret_access_key = access_secret).Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')


Setup location to get training data for SageMaker XGBoost algorithm

In [13]:
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

Create a session

In [14]:
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(xgboost_container,role, instance_count=1, instance_type='ml.m4.xlarge',output_path='s3://{}/{}/output'.format(bucket_name, prefix),sagemaker_session=sess)


Set XGBoost hyperparementers

In [15]:
xgb.set_hyperparameters(max_depth=5,eta=0.2,gamma=4,min_child_weight=6,subsample=0.8,silent=0,objective='binary:logistic',num_round=100)

Train the model

In [16]:
xgb.fit({'train': s3_input_train})


2022-07-26 18:27:37 Starting - Starting the training job...
2022-07-26 18:28:04 Starting - Preparing the instances for trainingProfilerReport-1658860057: InProgress
.........
2022-07-26 18:29:36 Downloading - Downloading input data......
2022-07-26 18:30:30 Training - Downloading the training image......
2022-07-26 18:31:38 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2022-07-26:18:31:26:INFO] Running standalone xgboost training.[0m
[34m[2022-07-26:18:31:26:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-07-26:18:31:26:INFO] File size need to be processed in the node: 3.38mb. Available memory size in the node: 8436.91mb[0m
[34m[2022-07-26:18:31:26:INFO] Determined delimiter of CSV input is ','[0m
[34m[18:31:26] S3DistributionType set as FullyReplicated[0m
[34m[18:31:26] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[18:31:26] src/tree/updater_prune

Model deployment to an endpoint

In [17]:
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

-------!

Run prediction

In [18]:
from sagemaker.serializers import CSVSerializer

test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array
xgb_predictor.serializer = CSVSerializer() # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

(12357,)


Evaluation of model performance

In [19]:
cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 89.5%

Predicted      No Purchase    Purchase
Observed
No Purchase    90% (10769)    37% (167)
Purchase        10% (1133)     63% (288) 



Terminate resources. Then also delete the notebook instance to avoid extra costs!!

In [20]:
xgb_predictor.delete_endpoint(delete_endpoint_config=True)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'KN225MDJR4GDGYHQ',
   'HostId': 'kjnIhSc52dtP1zrML6l//z6PxbTEUkhygFaVuG5eB6a9dQcxnSBndIxPsW04RLGnYzhnoQo+fKY=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'kjnIhSc52dtP1zrML6l//z6PxbTEUkhygFaVuG5eB6a9dQcxnSBndIxPsW04RLGnYzhnoQo+fKY=',
    'x-amz-request-id': 'KN225MDJR4GDGYHQ',
    'date': 'Tue, 26 Jul 2022 18:49:22 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'aws-programmatic-access-test-object'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-07-26-18-27-37-313/profiler-output/system/incremental/2022072618/1658860200.algo-1.json'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-07-26-18-27-37-313/rule-output/ProfilerReport-1658860057/profiler-output/profiler-reports/MaxInitializationTime.json'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-07-26-18-27-37-313/rule