In [12]:
#importing the required libraries
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri ,image_uris
from sagemaker.session import s3_input, Session,TrainingInput
import os
import numpy as np
import pandas as pd
import urllib

In [2]:
#assigning the bucket name and getting region of the notebook
bucket_name = 'bank-ml-demo'
region = boto3.session.Session().region_name
print(region)

us-east-1


In [3]:
#creating a s3 bucket for data and model storage
s3 = boto3.resource('s3')
if region == 'us-east-1':
    s3.create_bucket(Bucket=bucket_name)
    
    

In [3]:
# set an output path where the trained model will be saved
prefix = 'xgboost-as-a-built-in-algo'
output_path ='s3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://bank-ml-demo/xgboost-as-a-built-in-algo/output


In [4]:
#retrieving the data files from web and saving them to local files in Sagemaker...

try:
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ',e)

try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [6]:
### Train Test split
train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [7]:
#re-structuring the dataset file with first column as output variable and rest as the features....
train_data = pd.concat([train_data['y_yes'],train_data.drop(columns=['y_yes','y_no'],axis=1)],axis=1)
test_data = pd.concat([test_data['y_yes'],test_data.drop(columns=['y_yes','y_no'],axis=1)],axis=1)

In [8]:
#saving the re-structure files locally....
train_data.to_csv('train_data.csv',index=False,header=False)
test_data.to_csv('test_data.csv',index=False,header=False)
train_data.shape, test_data.shape

((28831, 60), (12357, 60))

In [9]:
#uploading the file to the s3 bucket
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train_data.csv')
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test_data.csv')

In [11]:
#creating s3_input objects
s3_train_input = TrainingInput(s3_data="s3://{}/{}/train/".format(bucket_name,prefix),content_type='csv')
s3_test_input = TrainingInput(s3_data="s3://{}/{}/test/".format(bucket_name,prefix),content_type='csv')

In [14]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
container = get_image_uri(boto3.Session().region_name,
                          'xgboost', 
                          repo_version='1.0-1')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [15]:
# initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":50
        }

In [16]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          train_volume_size=5, # 5 GB 
                                          output_path=output_path,
                                          train_use_spot_instances=True,
                                          train_max_run=300,
                                          train_max_wait=600)
     

train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [17]:
estimator.fit({'train': s3_train_input,'validation': s3_test_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-06-21-13-55-48-928


2023-06-21 13:55:49 Starting - Starting the training job...
2023-06-21 13:56:05 Starting - Preparing the instances for training......
2023-06-21 13:56:58 Downloading - Downloading input data...
2023-06-21 13:57:39 Training - Training image download completed. Training in progress....
2023-06-21 13:58:20 Uploading - Uploading generated training model
2023-06-21 13:58:20 Completed - Training job completed
[34m[2023-06-21 13:58:03.165 ip-10-2-103-171.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[3

In [18]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-06-21-14-03-33-941
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-06-21-14-03-33-941
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-06-21-14-03-33-941


-------!

In [20]:
# from sagemaker.predictor import csv_serializer
from sagemaker.serializers import CSVSerializer
test_data_array = test_data.drop(['y_yes'], axis=1).values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = CSVSerializer()# set the serializer type

In [21]:
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')
preds = np.fromstring(predictions[1:],sep=',')
preds_ = [1   if x>=0.5 else 0 for x in preds]

In [22]:
from sklearn.metrics import classification_report
print(classification_report(test_data['y_yes'],preds_))

              precision    recall  f1-score   support

           0       0.91      0.99      0.94     10936
           1       0.66      0.21      0.32      1421

    accuracy                           0.90     12357
   macro avg       0.78      0.60      0.63     12357
weighted avg       0.88      0.90      0.87     12357



In [108]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-06-21-14-03-33-941


[{'ResponseMetadata': {'RequestId': '3PC4ZA47R9N43TX0',
   'HostId': 'EbOSAQmZzPiiX//qVbba5Nfp+XSuJkxyS5LDaKpRqyEnyUltetYdN2XbNl2eivizh7XzODKJ3R8=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'EbOSAQmZzPiiX//qVbba5Nfp+XSuJkxyS5LDaKpRqyEnyUltetYdN2XbNl2eivizh7XzODKJ3R8=',
    'x-amz-request-id': '3PC4ZA47R9N43TX0',
    'date': 'Wed, 21 Jun 2023 16:11:14 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-06-21-13-55-48-928/profiler-output/system/incremental/2023062113/1687355880.algo-1.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-06-21-13-55-48-928/debug-output/index/000000000/000000000020_worker_0.json'},
   {'Key': 'xgboost-as-a-built-in-algo/train/train.csv'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-06-21-13-55-48-928/debug-

In [85]:
#lambda handler for the numpy array as the input but the lambda function does not support numpy imports
import os
import io
import boto3
import json
import csv

# grab environment variables
ENDPOINT_NAME = "sagemaker-xgboost-2023-06-21-14-03-33-941"
runtime= boto3.client('runtime.sagemaker')

def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv,arr,delimiter=",",fmt="%g")
    return csv.getvalue().decode().rstrip()
    
def lambda_handler(event):

    input_data = event['data']
    input_data = np2csv(input_data)
    
    response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                       ContentType='text/csv',
                                       Body=input_data)
    print(response)
    result = response['Body'].read().decode()
    print(result)
    probs = result.split(',')
    print(probs)
    predictions = [1 if float(i)>=0.5 else 0 for  i in probs]
    
    return predictions

In [86]:
lambda_handler({'data':test_data_array[0:3]})

{'ResponseMetadata': {'RequestId': '95e9e7ee-253c-443d-9552-07b647c81a14', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '95e9e7ee-253c-443d-9552-07b647c81a14', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Wed, 21 Jun 2023 14:51:48 GMT', 'content-type': 'text/csv; charset=utf-8', 'content-length': '60'}, 'RetryAttempts': 0}, 'ContentType': 'text/csv; charset=utf-8', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7f71e6b18bb0>}
0.05214285850524902,0.056601911783218384,0.05096195265650749
['0.05214285850524902', '0.056601911783218384', '0.05096195265650749']


[0, 0, 0]

In [107]:
#lamdbda function for the AWS lambda with contex type text/csv
"""input : { 
  "data": "29,2,999,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,\
  0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0"\
}
"""
import os
import io
import boto3
import json
import csv

# grab environment variables
ENDPOINT_NAME = "sagemaker-xgboost-2023-06-21-14-03-33-941" #os.environ['ENDPOINT_NAME']
runtime= boto3.client('runtime.sagemaker')

def lambda_handler(event):
    print("Received event: " + json.dumps(event, indent=2))
    
    data = json.loads(json.dumps(event))
    payload = data['data']
    print(payload)
    
    response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                       ContentType='text/csv',
                                       Body=payload)
    print(response)
    result = json.loads(response['Body'].read().decode())
    print(result)
    if result >=0.5:
        return 1
    return 0
    

In [101]:
 ex1 = ','.join(str(i) for i in test_data_array[0])

In [102]:
lambda_handler({"data":ex1})

Received event: {
  "data": "29,2,999,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0"
}
29,2,999,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
{'ResponseMetadata': {'RequestId': 'aafd2df4-bb31-4ab7-b49e-daa57b92490e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'aafd2df4-bb31-4ab7-b49e-daa57b92490e', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Wed, 21 Jun 2023 15:31:57 GMT', 'content-type': 'text/csv; charset=utf-8', 'content-length': '19'}, 'RetryAttempts': 0}, 'ContentType': 'text/csv; charset=utf-8', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7f71e6cbd990>}
0.05214285850524902


0