### Importing Libraries

In [2]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request

In [3]:
from sagemaker import get_execution_role

In [4]:
import numpy as np

In [5]:
import pandas as pd

In [6]:
import matplotlib.pyplot as plt

In [7]:
from IPython.display import Image

In [8]:
from IPython.display import display

In [9]:
from time import gmtime, strftime

In [12]:
from sagemaker.serializers import CSVSerializer

### Defining the IAM Role, Region, and Container

In [13]:
# Execution role returned by the SageMaker SDK; it is handed to the Estimator further down.
role = get_execution_role()

In [14]:
# Key prefix inside the bucket under which the training data and model output are stored.
prefix = 'sagemaker/DEMO-xgboost-dm'

In [16]:
# Region of the default boto3 session; it selects the container image and the bucket location.
my_region = boto3.Session().region_name

In [17]:
# Look up the URI of the built-in XGBoost image for this region.
# NOTE(review): "latest" is an unpinned tag — consider pinning an explicit version; confirm
# that the hyperparameters and response parsing below still apply before changing it.
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=my_region,
    version="latest",
)

In [18]:
# Report which region and container image the rest of the notebook will use.
success_message = (
    "Success - the MySageMakerInstance is in the " + my_region
    + " region.You will use the " + xgboost_container
    + " container for your SageMaker endpoint."
)
print(success_message)

Success - the MySageMakerInstance is in the us-west-2 region.You will use the 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


### Creating S3 Bucket

In [19]:
# Name of the S3 bucket that holds the training data and the model output.
bucket_name = 'girishankar-sagemaker' 

In [20]:
# boto3 S3 resource handle, used below to create the bucket.
s3 = boto3.resource('s3')

In [21]:
# Create the bucket in the notebook's region.
# The success message is printed after either branch, so it is also shown for us-east-1
# (previously it was nested inside the `else` branch and skipped for that region).
try:
    if my_region == 'us-east-1':
        # us-east-1 is created without an explicit CreateBucketConfiguration.
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    print('S3 bucket created successfully')
except Exception as e:
    # Best-effort: report the failure and let the notebook continue.
    print('S3 error: ',e)

S3 bucket created successfully


### Loading Data

In [22]:
# Source of the cleaned bank-marketing dataset.
data_url = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"

# Step 1: download the CSV into the working directory.
try:
    urllib.request.urlretrieve(data_url, "bank_clean.csv")
except Exception as e:
    print('Data load error: ',e)
else:
    print('Success: downloaded bank_clean.csv.')

# Step 2: read the file into a DataFrame, using the first column as the index.
try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
except Exception as e:
    print('Data load error: ',e)
else:
    print('Success: Data loaded into dataframe.')

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


### Splitting the Data into Training and Test Sets

In [23]:
# Shuffle once with a fixed seed so the split is reproducible, then cut at 70 %.
# Positional slicing with .iloc yields the same two frames as np.split did, without
# routing a DataFrame through NumPy (which emits a pandas deprecation warning).
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_size = int(0.7 * len(shuffled_data))
train_data = shuffled_data.iloc[:train_size]
test_data = shuffled_data.iloc[train_size:]
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


  return bound(*args, **kwds)


### Preparing and Uploading the Training Data

In [24]:
# Put the target column first, followed by the features, and write the result
# without a header row or index column.
train_features = train_data.drop(columns=['y_no', 'y_yes'])
train_csv_frame = pd.concat([train_data['y_yes'], train_features], axis=1)
train_csv_frame.to_csv('train.csv', index=False, header=False)

In [25]:
# S3 object keys always use "/" as the separator, so build the key explicitly rather than
# with os.path.join, whose separator depends on the operating system.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

In [26]:
# Training channel pointing at the uploaded CSV folder.
train_s3_uri = f's3://{bucket_name}/{prefix}/train'
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=train_s3_uri,
    content_type='csv',
)

### Creating the SageMaker Session and Estimator

In [27]:
# SageMaker session object passed to the Estimator below.
sess = sagemaker.Session()

In [28]:
# Estimator describing the training job: which image to run, under which role,
# on what hardware, and where to write the model output.
model_output_uri = f's3://{bucket_name}/{prefix}/output'
xgb = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=model_output_uri,
    sagemaker_session=sess,
)

In [29]:
# Hyperparameters forwarded to the XGBoost training job.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

### Training the Model

In [31]:
# Launch the training job with the uploaded CSV as the 'train' channel.
training_channels = {'train': s3_input_train}
xgb.fit(inputs=training_channels)

INFO:sagemaker:Creating training-job with name: xgboost-2026-02-10-15-01-50-652


2026-02-10 15:01:53 Starting - Starting the training job...
2026-02-10 15:02:07 Starting - Preparing the instances for training...
2026-02-10 15:02:31 Downloading - Downloading input data......
2026-02-10 15:03:52 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2026-02-10:15:04:06:INFO] Running standalone xgboost training.[0m
[34m[2026-02-10:15:04:06:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2026-02-10:15:04:06:INFO] File size need to be processed in the node: 3.38mb. Available memory size in the node: 8523.98mb[0m
[34m[2026-02-10:15:04:06:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:04:06] S3DistributionType set as FullyReplicated[0m
[34m[15:04:06] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[15:04:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 14 pruned nodes, max_depth=5[0m
[34m

### Deploying the Model

In [33]:
# Host the trained model behind a single-instance real-time endpoint.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

INFO:sagemaker:Creating model with name: xgboost-2026-02-10-15-06-50-876
INFO:sagemaker:Creating endpoint-config with name xgboost-2026-02-10-15-06-50-876
INFO:sagemaker:Creating endpoint with name xgboost-2026-02-10-15-06-50-876


------!

### Predicting Data

In [35]:
# Feature matrix for inference: both label columns removed, converted to a plain array.
test_features = test_data.drop(columns=['y_no', 'y_yes'])
test_data_array = test_features.values

In [36]:
# Send request payloads to the endpoint as CSV.
xgb_predictor.serializer = CSVSerializer() # set the serializer type

In [37]:
# Send the whole test matrix to the endpoint, then decode the response as UTF-8 text.
raw_response = xgb_predictor.predict(test_data_array)
predictions = raw_response.decode('utf-8')

In [38]:
# Parse the comma-separated response text into a float array.
# NOTE(review): predictions[1:] discards the first character of the response — presumably a
# leading delimiter emitted by the container; confirm against the raw response text.
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array

In [39]:
# Sanity check: the number of predictions should equal the number of test rows.
print(predictions_array.shape)

(12357,)


In [40]:
# Round each predicted score to the nearest integer and cross-tabulate against the observed labels.
predicted_labels = np.round(predictions_array)
cm = pd.crosstab(
    index=test_data['y_yes'],
    columns=predicted_labels,
    rownames=['Observed'],
    colnames=['Predicted'],
)

In [41]:
# Confusion-matrix cells, addressed as cm.iloc[observed, predicted].
tn = cm.iloc[0, 0]
fn = cm.iloc[1, 0]
tp = cm.iloc[1, 1]
fp = cm.iloc[0, 1]

# Share of correctly classified rows, as a percentage.
p = (tp + tn) / (tp + tn + fp + fn) * 100

In [45]:
# Column totals: how many test rows were predicted "No Purchase" / "Purchase".
predicted_no_total = tn + fn
predicted_yes_total = tp + fp

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")

# Each percentage is a cell count relative to the total of its predicted-class column.
no_purchase_row = "{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(
    "No Purchase", tn/predicted_no_total*100, tn, fp/predicted_yes_total*100, fp)
print(no_purchase_row)

purchase_row = "{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(
    "Purchase", fn/predicted_no_total*100, fn, tp/predicted_yes_total*100, tp)
print(purchase_row)


Overall Classification Rate: 89.5%

Predicted      No Purchase    Purchase
Observed
No Purchase    90% (10769)    37% (167)
Purchase        10% (1133)     63% (288) 



## Cleaning Up

In [46]:
# Tear down the hosted endpoint together with its endpoint configuration.
xgb_predictor.delete_endpoint(delete_endpoint_config=True) ## Deleting Endpoint

INFO:sagemaker:Deleting endpoint configuration with name: xgboost-2026-02-10-15-06-50-876
INFO:sagemaker:Deleting endpoint with name: xgboost-2026-02-10-15-06-50-876


In [None]:
# Remove every object stored in the bucket (uploaded training data and model output).
# Only the objects are deleted here; no call in this cell deletes the bucket itself.
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete() ## Deleting all objects (training artifacts) in the S3 bucket