In [21]:
import sys
import pandas as pd
import awswrangler as wr
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import image_uris 
from sagemaker.session import s3_input, Session
import urllib
import os
import numpy as np
from sagemaker.predictor import csv_serializer

In [22]:
import credentials as cr

In [23]:
import importlib
importlib.reload(cr)

<module 'credentials' from '/Users/gaby/Documents/GitHub/sm/credentials.py'>

In [24]:
#import sagemaker as sm
#wr.__version__
#sm.__version__

In [25]:
bucket_name = 'aws-sm-bucket' 
my_region = boto3.session.Session().region_name 
print(my_region)

us-east-2


In [26]:
prefix = 'xgboost-as-a-built-in-algo'
output_path ='s3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://aws-sm-bucket/xgboost-as-a-built-in-algo/output


### Dataset + split train , val and test

In [27]:
urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
model_data = pd.read_csv('./bank_clean.csv',index_col=0)
model_data = pd.concat([model_data['y_yes'], model_data.drop(['y_no', 'y_yes'], axis=1)], axis=1)

size_train = .4
size_test = .3

train_data, val_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), 
                                           [int(size_train * len(model_data)), int((1-size_test) * len(model_data))])

print(model_data.shape, train_data.shape, val_data.shape, test_data.shape)



(41188, 60) (16475, 60) (12356, 60) (12357, 60)


## Save data in S3

In [28]:
ds = dict (zip(['train_data', 'val_data', 'test_data']  , [train_data, val_data, test_data]))

for name in ds:
    df= ds[name]
    print(name)
    df.to_csv(str(name)+'.csv', index=False, header=False)
    boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, f'{name}/{name}.csv')).upload_file(f'{name}.csv')

s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train_data'.format(bucket_name, prefix), content_type='csv')
s3_input_val = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/val_data'.format(bucket_name, prefix), content_type='csv')

train_data
val_data
test_data


## Load AWS container with prebuilt algorithm

In [29]:
container = image_uris.retrieve(region = boto3.Session().region_name, framework='xgboost', version='1.0-1')

In [30]:
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":50
        }

In [68]:
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role= cr.role,
                                          instance_count=1, 
                                          instance_type='ml.m4.xlarge', 
                                          volume_size=3, # 5 GB 
                                          output_path=output_path,
                                          use_spot_instances=True,
                                          max_run=300,
                                          max_wait=600)

In [69]:
%%time
estimator.fit({'train': s3_input_train,'validation': s3_input_val})#

2021-03-18 10:29:05 Starting - Starting the training job..
2021-03-18 10:29:29 Starting - Launching requested ML instancesProfilerReport-1616063340: InProgress
....
2021-03-18 10:30:49 Starting - Preparing the instances for training..
2021-03-18 10:31:30 Downloading - Downloading input data..
2021-03-18 10:31:50 Training - Downloading the training image..
2021-03-18 10:32:40 Uploading - Uploading generated training model
2021-03-18 10:32:40 Completed - Training job completed
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input 

## Print jobs

In [70]:
#https://us-east-2.console.aws.amazon.com/sagemaker/home?region=us-east-2#/jobs
client = boto3.client('sagemaker')
jobs = client.list_training_jobs()
#!aws sagemaker list-training-jobs
#jobs_all = [x['TrainingJobArn'] for x in jobs['TrainingJobSummaries'] ]
jobs_comp = [x['TrainingJobArn']  for x in jobs['TrainingJobSummaries'] if x['TrainingJobStatus']=='Completed']
jobs_comp

['arn:aws:sagemaker:us-east-2:342342686540:training-job/sagemaker-xgboost-2021-03-18-10-29-00-410',
 'arn:aws:sagemaker:us-east-2:342342686540:training-job/sagemaker-xgboost-2021-03-18-10-26-05-864',
 'arn:aws:sagemaker:us-east-2:342342686540:training-job/sagemaker-xgboost-2021-03-18-09-57-40-076',
 'arn:aws:sagemaker:us-east-2:342342686540:training-job/sagemaker-xgboost-2021-03-17-15-09-12-896',
 'arn:aws:sagemaker:us-east-2:342342686540:training-job/sagemaker-xgboost-2021-03-17-14-07-09-191',
 'arn:aws:sagemaker:us-east-2:342342686540:training-job/sagemaker-xgboost-2021-03-11-16-44-48-956',
 'arn:aws:sagemaker:us-east-2:342342686540:training-job/sagemaker-xgboost-2021-03-11-15-46-34-426',
 'arn:aws:sagemaker:us-east-2:342342686540:training-job/sagemaker-xgboost-2021-03-11-13-13-25-842',
 'arn:aws:sagemaker:us-east-2:342342686540:training-job/sagemaker-xgboost-2021-03-09-20-37-08-345',
 'arn:aws:sagemaker:us-east-2:342342686540:training-job/sagemaker-xgboost-2021-03-09-20-30-26-410']

## Deploy model

In [71]:
%%time
xgb_predictor = estimator.deploy(initial_instance_count=1, 
                                 instance_type = 'ml.t2.medium', 
                                 serializer = sagemaker.serializers.CSVSerializer())
#https://us-east-2.console.aws.amazon.com/sagemaker/home?region=us-east-2#/endpoints

----------------!CPU times: user 445 ms, sys: 65.7 ms, total: 511 ms
Wall time: 9min 41s


In [72]:
endpoints = client.list_endpoints(SortBy='Status')['Endpoints']
endpoints

[{'EndpointName': 'sagemaker-xgboost-2021-03-18-10-33-13-014',
  'EndpointArn': 'arn:aws:sagemaker:us-east-2:342342686540:endpoint/sagemaker-xgboost-2021-03-18-10-33-13-014',
  'CreationTime': datetime.datetime(2021, 3, 18, 11, 33, 25, 90000, tzinfo=tzlocal()),
  'LastModifiedTime': datetime.datetime(2021, 3, 18, 11, 42, 36, 233000, tzinfo=tzlocal()),
  'EndpointStatus': 'InService'},
 {'EndpointName': 'sagemaker-xgboost-2021-03-18-10-02-29-356',
  'EndpointArn': 'arn:aws:sagemaker:us-east-2:342342686540:endpoint/sagemaker-xgboost-2021-03-18-10-02-29-356',
  'CreationTime': datetime.datetime(2021, 3, 18, 11, 2, 41, 337000, tzinfo=tzlocal()),
  'LastModifiedTime': datetime.datetime(2021, 3, 18, 11, 10, 49, 822000, tzinfo=tzlocal()),
  'EndpointStatus': 'InService'}]

In [73]:
#print all your endpoints
for i in range( len(endpoints) ):
    endpoint_name = endpoints[i]['EndpointName']
    print(endpoint_name)

sagemaker-xgboost-2021-03-18-10-33-13-014
sagemaker-xgboost-2021-03-18-10-02-29-356


In [74]:
endpoint_name = 'sagemaker-xgboost-2021-03-18-10-33-13-014'

## Test endpoint 

In [75]:
runtime_client = boto3.client('sagemaker-runtime')

In [80]:
test_data = pd.read_csv('test_data.csv')
payload = test_data.copy()
payload = payload.drop(payload.columns[0], axis=1).values
with open('payload_data.csv', 'r') as f:
    payload = f.read().strip()

__invoque endpoint__

In [104]:
   
response = runtime_client.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType='text/csv', 
                                   Body= payload )
result = response['Body'].read()

In [107]:

predictions_array = np.fromstring(result, sep=',') # and turn the prediction into an array

predictions_array

array([0.02122305, 0.04085948, 0.31341541, ..., 0.03422592, 0.03033714,
       0.03065895])

In [108]:
cm = pd.crosstab(index=test_data[test_data.columns[0]], 
                 columns=np.round(predictions_array), 
                 rownames=['Observed'], 
                 colnames=['Predicted'])
print(cm)
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))

Predicted    0.0  1.0
Observed             
0          10783  152
1           1135  286

Overall Classification Rate: 89.6%

Predicted      No Purchase    Purchase
Observed
No Purchase    90% (10783)    35% (152)
Purchase        10% (1135)     65% (286) 



## Delete Endpoint

In [117]:
#if you want to delete a list of endpoints keeping the last "keep_ast" ones.
print(len(endpoints))
keep_last = 1
endpoints = client.list_endpoints(SortBy='Status')['Endpoints']
for i in range(keep_last, len(endpoints) ):# exclude last training and delete
    name = endpoints[i]['EndpointName']
    print(name)
    sagemaker.Session().delete_endpoint(name)

2
sagemaker-xgboost-2021-03-18-10-02-29-356


In [118]:
endpoints = client.list_endpoints(SortBy='Status')['Endpoints']
endpoints

[{'EndpointName': 'sagemaker-xgboost-2021-03-18-10-33-13-014',
  'EndpointArn': 'arn:aws:sagemaker:us-east-2:342342686540:endpoint/sagemaker-xgboost-2021-03-18-10-33-13-014',
  'CreationTime': datetime.datetime(2021, 3, 18, 11, 33, 25, 90000, tzinfo=tzlocal()),
  'LastModifiedTime': datetime.datetime(2021, 3, 18, 11, 42, 36, 233000, tzinfo=tzlocal()),
  'EndpointStatus': 'InService'}]

## Delete S3 objects

In [None]:
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
print(bucket_to_delete.objects.all())
#bucket_to_delete.objects.all().delete()