### Importing Important Libraries

#### Steps To Be Followed
1. Importing necessary Libraries
2. Creating S3 bucket 
3. Mapping train And Test Data in S3
4. Mapping The path of the models in S3

In [3]:
!pip install sagemaker

Collecting sagemaker
  Downloading sagemaker-2.194.0.tar.gz (913 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m913.3/913.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting attrs<24,>=23.1.0 (from sagemaker)
  Using cached attrs-23.1.0-py3-none-any.whl (61 kB)
Collecting smdebug_rulesconfig==1.0.1 (from sagemaker)
  Downloading smdebug_rulesconfig-1.0.1-py2.py3-none-any.whl (20 kB)
Collecting pathos (from sagemaker)
  Obtaining dependency information for pathos from https://files.pythonhosted.org/packages/d8/08/ac94fa6f9eefe32963b8a54e573dab0dbc0d3df24fd34924bd9ce7eab7c4/pathos-0.3.1-py3-none-any.whl.metadata
  Downloading pathos-0.3.1-py3-none-any.whl.metadata (11 kB)
Collecting schema (from sagemaker)
  Downloading schema-0.7.5-py2.py3-none-any.whl (17 kB)
Collecting tblib==1.7.0 (from sagemaker)
  Downloading tblib-1.7.0-py2.py3-none-any.whl (12 kB)
Collecting ppft>=1.7.6.7 (from

In [4]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri 
#alternative
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.session import s3_input, Session

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/architsharma/Library/Application Support/sagemaker/config.yaml


In [None]:
bucket_name = 'bankapparchit' # bucket name
my_region = boto3.session.Session().region_name # Check the region of the instance
print(my_region)

us-east-1


In [None]:
#creating a bucket
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


In [None]:
# set an output path where the trained model will be saved
prefix = 'xgboost-as-a-built-in-algo'
output_path ='s3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://bankappsuraaj/xgboost-as-a-built-in-algo/output


#### Downloading The Dataset And Storing in S3

In [None]:
import pandas as pd
import urllib
try:
    #the bank data is in one hot encoded format already
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ',e)

try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [None]:
### Train Test split

import numpy as np
train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [None]:
### Saving Train And Test Into Buckets
## We start with Train Data
import os
#format in which data is stored is label and then independent columns
pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], 
                                                axis=1)], 
                                                axis=1).to_csv('train.csv', index=False, header=False)
#uploading train.csv in the bucket
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')

#now storing the training csv into a variable 
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [None]:
# Test Data Into Buckets
pd.concat([test_data['y_yes'], test_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('test.csv', index=False, header=False)

boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')

s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

### Building Models Xgboot- Inbuilt Algorithm

In [None]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
container = image_uris.retrieve('xgboost',boto3.Session().region_name,'1.5-1')


In [None]:
# initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":50
        }

In [None]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=output_path)
                    



In [None]:
estimator.fit({'train': s3_input_train,'validation': s3_input_test})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-03-11-11-00-27-426


2023-03-11 11:00:27 Starting - Starting the training job...
2023-03-11 11:00:55 Starting - Preparing the instances for training......
2023-03-11 11:01:41 Downloading - Downloading input data...
2023-03-11 11:02:21 Training - Training image download completed. Training in progress...[34m[2023-03-11 11:02:42.836 ip-10-0-100-46.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-03-11 11:02:42.896 ip-10-0-100-46.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-03-11:11:02:43:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-03-11:11:02:43:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2023-03-11:11:02:43:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-03-11:11:02:43:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2023-03-11:11:02:43:INFO] Determined 0 GPU(s) available on the ins

### Deploy Machine Learning Model As Endpoints

In [None]:
from sagemaker.serializers import CSVSerializer
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge',serializer=CSVSerializer())

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-03-11-11-38-10-005
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-03-11-11-38-10-005
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-03-11-11-38-10-005


-------!

#### Prediction of the Test Data

In [None]:
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array

print(test_data_array.shape)

#xgb_predictor.content_type = 'text/csv' # set the data type for an inference

#xgb_predictor.serializer = CSVSerializer() # set the serializer type

predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!



(12357, 59)


In [None]:
print(predictions)

0.05214285850524902
0.056601911783218384
0.05096195265650749
0.30592477321624756
0.10553096234798431
0.078081876039505
0.02262614667415619
0.03498779982328415
0.02811497449874878
0.034304987639188766
0.11726024001836777
0.05544304847717285
0.037178657948970795
0.060526397079229355
0.04172196611762047
0.032530296593904495
0.07993681728839874
0.029354767873883247
0.10640976577997208
0.08665816485881805
0.04581623896956444
0.056621942669153214
0.03803135082125664
0.08899843692779541
0.07463277131319046
0.06287781149148941
0.07477721571922302
0.037889495491981506
0.05363423004746437
0.03714437410235405
0.11883419007062912
0.014051004312932491
0.10135763138532639
0.036241237074136734
0.04939017817378044
0.04110652580857277
0.30315452814102173
0.03515808284282684
0.3916339576244354
0.08269158750772476
0.05523224547505379
0.05635124072432518
0.04269713535904884
0.07391157746315002
0.05223361775279045
0.10406903922557831
0.10215269774198532
0.08263440430164337
0.13864034414291382
0.09056770056

In [None]:
predictions_array = np.fromstring(predictions[1:], sep='\n') # and turn the prediction into an array
print(predictions_array.shape)

(12357,)


In [None]:
predictions_array

array([0.05214286, 0.05660191, 0.05096195, ..., 0.03436061, 0.02942475,
       0.03715819])

In [None]:
import sklearn.metrics

cutoff=0.5
print(sklearn.metrics.confusion_matrix(test_data['y_yes'],np.round(predictions_array)))
print(sklearn.metrics.classification_report(test_data['y_yes'],np.round(predictions_array)))

[[10785   151]
 [ 1124   297]]
              precision    recall  f1-score   support

           0       0.91      0.99      0.94     10936
           1       0.66      0.21      0.32      1421

    accuracy                           0.90     12357
   macro avg       0.78      0.60      0.63     12357
weighted avg       0.88      0.90      0.87     12357



In [None]:
xgb_predictor.endpoint_name

'sagemaker-xgboost-2023-03-11-11-38-10-005'

#### Deleting The Endpoints

In [None]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '2FF829102DC6DFD1',
   'HostId': 'mYPqeWyx3REoLIsQu2MVorzKLrlxES2n6Dcdr3PycVf1VkRCxicEewoPP8IxRguc5MGksLnjynY=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'mYPqeWyx3REoLIsQu2MVorzKLrlxES2n6Dcdr3PycVf1VkRCxicEewoPP8IxRguc5MGksLnjynY=',
    'x-amz-request-id': '2FF829102DC6DFD1',
    'date': 'Sat, 29 Aug 2020 10:21:27 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/train/train.csv'},
   {'Key': 'xgboost-as-a-built-in-algo/test/test.csv'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2020-08-29-09-49-29-015/output/model.tar.gz'}]}]