# Configuring Notebook

In [1]:
import configparser, urllib.request, os, time
import boto3, sagemaker, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sagemaker.inputs import TrainingInput
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.serializers import CSVSerializer
from sagemaker.session import Session


  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Reading Config File

In [6]:
# Local credentials file. It holds secrets, so it must stay listed in .gitignore.
CFG_FILE = "aws_config.cfg"
CFG_SECTION = "default"
REQUIRED_KEYS = ("aws_access_key_id", "aws_secret_access_key")

# Read credentials & region from local profile -------------------------------------------------
cp = configparser.ConfigParser()
# ConfigParser.read() returns the list of files it managed to parse; empty means "not found".
if not cp.read(CFG_FILE):
    raise FileNotFoundError(
        f"Missing {CFG_FILE}. Expected keys: aws_access_key_id, aws_secret_access_key, region"
    )

# Fail with an actionable message instead of a bare KeyError('default') when the
# file exists but is malformed. The exception type stays KeyError.
if not cp.has_section(CFG_SECTION):
    raise KeyError(f"{CFG_FILE} has no [{CFG_SECTION}] section")

profile = cp[CFG_SECTION]
missing_keys = [key for key in REQUIRED_KEYS if key not in profile]
if missing_keys:
    raise KeyError(
        f"{CFG_FILE} [{CFG_SECTION}] is missing required keys: {', '.join(missing_keys)}"
    )

aws_access_key_id = profile["aws_access_key_id"]
aws_secret_access_key = profile["aws_secret_access_key"]
# The region is optional in the file and falls back to us-east-1.
region = profile.get("region", "us-east-1")
print("Region:", region)
# Never print the secret access key: cell outputs are saved inside the notebook file.

Region: us-west-2


# Let's set up Boto3

In [8]:
# Pin the client to the region read from the config file. The LocationConstraint
# sent below must agree with the regional endpoint the request goes to; otherwise
# S3 rejects the call with IllegalLocationConstraintException.
s3_client = boto3.client("s3", region_name=region)

BUCKET = "mlops-dsml-june27"


try:
    if region == "us-east-1":
        # us-east-1 is the default location and must be created without a
        # CreateBucketConfiguration.
        s3_client.create_bucket(Bucket=BUCKET)
    else:
        s3_client.create_bucket(
            Bucket=BUCKET, CreateBucketConfiguration={"LocationConstraint": region}
        )
    print("Created bucket", BUCKET)
except s3_client.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the cell is fine: the bucket already belongs to this account.
    print("Using existing bucket", BUCKET)

Using existing bucket mlops-dsml-june27


## Some Boto Operations for interacting/playing with S3

In [9]:
# Ask S3 for the buckets visible to the current credentials and print one line per bucket.
response = s3_client.list_buckets()

print("Available S3 Buckets:")
for bucket in response["Buckets"]:
    bucket_name, created_at = bucket["Name"], bucket["CreationDate"]
    print(f"  - {bucket_name} (Created: {created_at})")

Available S3 Buckets:
  - mlops-dsml-june27 (Created: 2025-06-27 02:11:29+00:00)


In [13]:
local_file_path = "empty.txt"
s3_key = "s3_empty_v2.txt"  # object key the file will have inside BUCKET

# Append mode creates the file when it is absent and leaves existing content untouched.
with open(local_file_path, "a"):
    pass

s3_client.upload_file(Filename=local_file_path, Bucket=BUCKET, Key=s3_key)
print(f"Successfully uploaded {local_file_path} to s3://{BUCKET}/{s3_key}")

Successfully uploaded empty.txt to s3://mlops-dsml-june27/s3_empty_v2.txt


In [11]:
# Fetch the object stored under `s3_key` back from the bucket into a local file.
# NOTE: this reuses the names `local_file_path` and `s3_key` set by the upload cell.
local_file_path = "downloaded_from_s3.txt"
s3_client.download_file(Bucket=BUCKET, Key=s3_key, Filename=local_file_path)
print(f"Successfully downloaded s3://{BUCKET}/{s3_key} to {local_file_path}")

Successfully downloaded s3://mlops-dsml-june27/s3_empty.txt to downloaded_from_s3.txt


In [14]:
# A single list_objects_v2 call returns at most 1,000 keys, which is plenty for this demo bucket.
response = s3_client.list_objects_v2(Bucket=BUCKET)

print(f"Contents of s3://{BUCKET}:")
# The "Contents" key is absent from the response when the bucket holds no objects,
# so default to an empty list instead of raising KeyError.
objects = response.get("Contents", [])
if not objects:
    print("  (bucket is empty)")
for obj in objects:
    print(f"  - {obj['Key']} (Size: {obj['Size']} bytes, Modified: {obj['LastModified']})")

Contents of s3://mlops-dsml-june27:
  - empty.txt (Size: 0 bytes, Modified: 2025-06-27 02:32:51+00:00)
  - s3_empty.txt (Size: 0 bytes, Modified: 2025-06-27 02:29:57+00:00)
  - s3_empty_v2.txt (Size: 0 bytes, Modified: 2025-06-27 02:32:22+00:00)


# Downloading Dataset

In [15]:
DATA_URL = (
    "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-"
    "sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
)

# Define local file path
LOCAL_CSV = "bank_clean.csv"

if not os.path.exists(LOCAL_CSV):
    # Download to a temporary name and rename only on success. An interrupted
    # transfer would otherwise leave a truncated LOCAL_CSV that the existence
    # check above treats as a complete dataset on the next run.
    partial_path = LOCAL_CSV + ".part"
    try:
        urllib.request.urlretrieve(DATA_URL, partial_path)
        os.replace(partial_path, LOCAL_CSV)
    finally:
        # After a successful rename the partial file no longer exists; this only
        # cleans up after a failed download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Downloaded", LOCAL_CSV)
else:
    print("Dataset already present →", LOCAL_CSV)

# The first CSV column is used as the row index.
df = pd.read_csv(LOCAL_CSV, index_col=0)
print("Shape:", df.shape)
display(df.head(3))

Downloaded bank_clean.csv
Shape: (41188, 61)


Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


## Uploading data to S3, since that is where the model will read it from in production

In [17]:
TRAIN_FRACTION = 0.7  # share of rows kept for training; the rest becomes the test split
SPLIT_SEED = 1729  # fixed seed so the shuffled split is reproducible

train_df, test_df = train_test_split(
    df,
    train_size=TRAIN_FRACTION,
    random_state=SPLIT_SEED,
    shuffle=True,
)

# Alternative NumPy-based split, kept for reference only (not executed):
# import numpy as np
# train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])
# print(train_data.shape, test_data.shape)

In [19]:
# Preview the feature columns only: both one-hot label columns are removed.
train_df.drop(columns=["y_no", "y_yes"]).head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
28868,31,1,999,1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
10545,29,1,999,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
25907,53,2,999,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
1289,60,1,999,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
23064,55,1,999,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [20]:
# Preview with the redundant `y_no` column removed; `y_yes` remains as the last column.
train_df.drop(columns=["y_no"]).head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_yes
28868,31,1,999,1,1,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
10545,29,1,999,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
25907,53,2,999,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
1289,60,1,999,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
23064,55,1,999,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [21]:
# Preview the layout written to the training CSV: label first, then the features.
feature_preview = train_df.drop(columns=["y_no", "y_yes"])
pd.concat([train_df["y_yes"], feature_preview], axis=1).head()

Unnamed: 0,y_yes,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
28868,0,31,1,999,1,1,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
10545,0,29,1,999,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
25907,0,53,2,999,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
1289,0,60,1,999,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0
23064,0,55,1,999,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [24]:
# Key prefix inside BUCKET under which the train/, test/ and output/ locations are built.
PREFIX = "xgboost-bank"

In [23]:
# Build the training payload with the label (`y_yes`) in the first column,
# followed by every feature column in its original order.
train_feature_columns = [
    column for column in train_df.columns if column not in ("y_no", "y_yes")
]
train_payload = train_df[["y_yes", *train_feature_columns]]

# Written without header and without the index column.
train_csv_path = "train.csv"
train_payload.to_csv(train_csv_path, header=False, index=False)

s3_train_key = f"{PREFIX}/train/{train_csv_path}"
print("S3 train key: ", s3_train_key)

s3_client.upload_file(Filename=train_csv_path, Bucket=BUCKET, Key=s3_train_key)
print(f"Uploaded training data to s3://{BUCKET}/{s3_train_key}")


S3 train key:  xgboost-bank/train/train.csv
Uploaded training data to s3://mlops-dsml-june27/xgboost-bank/train/train.csv


In [25]:
# Same layout as the training file: `y_yes` first, then the features, no header, no index.
test_feature_columns = [
    column for column in test_df.columns if column not in ("y_no", "y_yes")
]
test_payload = test_df[["y_yes", *test_feature_columns]]

test_csv_path = "test.csv"
test_payload.to_csv(test_csv_path, header=False, index=False)

s3_test_key = f"{PREFIX}/test/{test_csv_path}"
s3_client.upload_file(Filename=test_csv_path, Bucket=BUCKET, Key=s3_test_key)
print(f"Uploaded test data to s3://{BUCKET}/{s3_test_key}")

Uploaded test data to s3://mlops-dsml-june27/xgboost-bank/test/test.csv


# Configuring and setting up the SageMaker XGBoost container

In [26]:
# Show the S3 prefix that the training channel will read from.
train_prefix_uri = f"s3://{BUCKET}/{PREFIX}/train/"
train_prefix_uri

's3://mlops-dsml-june27/xgboost-bank/train/'

In [27]:
# Create TrainingInput objects -------------------------------------------------
# Both channels point at CSV data under the bucket prefix.
# NOTE(review): the validation channel reads the test/ prefix, i.e. the same
# split that is scored later in this notebook.
CSV_CONTENT_TYPE = "text/csv"

s3_train = TrainingInput(
    s3_data=f"s3://{BUCKET}/{PREFIX}/train/",
    content_type=CSV_CONTENT_TYPE,
)
s3_val = TrainingInput(
    s3_data=f"s3://{BUCKET}/{PREFIX}/test/",
    content_type=CSV_CONTENT_TYPE,
)

# Training Model

In [29]:
# Resolve the XGBoost 1.5-1 image URI for the region of the default boto3 session.
container = image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="1.5-1",
)

# initialize hyperparameters
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round=50,
)

# Model artifacts produced by the training job are written under this S3 prefix.
training_output_uri = f"s3://{BUCKET}/{PREFIX}/output"

estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    volume_size=5,  # 5 GB
    output_path=training_output_uri,
    hyperparameters=hyperparameters,
)

In [30]:
# Map channel names to their S3 inputs and block until the training job finishes.
training_channels = {"train": s3_train, "validation": s3_val}

print("Starting training …")
estimator.fit(inputs=training_channels, wait=True)
print("Training job completed →", estimator.latest_training_job.name)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-06-27-03-03-18-051


Starting training …
2025-06-27 03:03:19 Starting - Starting the training job...
2025-06-27 03:03:53 Downloading - Downloading input data...
2025-06-27 03:04:08 Downloading - Downloading the training image...
2025-06-27 03:04:46 Training - Training image download completed. Training in progress....
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-06-27 03:05:10.333 ip-10-0-105-32.us-west-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-06-27 03:05:10.354 ip-10-0-105-32.us-west-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-06-27:03:05:10:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-06-27:03:05:10:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-06-27:03:05:10:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-27:03:05:10:INFO] Running XGBoost Sagemaker in

In [31]:
# 6. Deploy Realtime Endpoint

print("Deploying endpoint … this may take a few minutes")

# One instance serves the model; requests are serialized as CSV.
endpoint_options = dict(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    serializer=CSVSerializer(),
)
predictor = estimator.deploy(**endpoint_options)

endpoint_name = predictor.endpoint_name
print("Endpoint active →", endpoint_name)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-06-27-03-06-33-413


Deploying endpoint … this may take a few minutes


INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-06-27-03-06-33-413
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-06-27-03-06-33-413


-----!Endpoint active → sagemaker-xgboost-2025-06-27-03-06-33-413


In [39]:
# Peek at the first five held-out rows (the label columns are still present here).
test_df.head(n=5)

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
40949,54,3,999,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
9332,56,2,999,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
32286,32,2,999,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
3925,46,3,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
9406,35,2,999,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0


In [36]:
# Send the feature matrix (label columns removed) to the endpoint and preview the
# first few predicted probabilities.
X_test = test_df.drop(["y_no", "y_yes"], axis=1).values
probs_text = predictor.predict(X_test)
# Decode and split into lines BEFORE slicing. Slicing the raw bytes (e.g. the
# first 50) cuts the last previewed probability off in the middle of the number.
probs_text.decode("utf-8").strip().split("\n")[:3]

['0.24062314629554749', '0.04597575217485428', '0.10725669']

In [37]:
# 7. Evaluate on the Test Set

DECISION_THRESHOLD = 0.5  # probabilities above this value count as the positive class

# Feature matrix without the two one-hot label columns.
X_test = test_df.drop(columns=["y_no", "y_yes"]).to_numpy()

# The endpoint answers with bytes holding one probability per line.
raw_response = predictor.predict(X_test)
probs_text = raw_response.decode("utf-8").strip().split("\n")
probs = np.asarray(probs_text, dtype=float)

y_true = test_df["y_yes"].values
y_pred = probs > DECISION_THRESHOLD

print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
print(
    "\nClassification Report:\n", classification_report(y_true, y_pred, digits=4)
)

Confusion Matrix:
 [[10834   198]
 [ 1024   301]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9136    0.9821    0.9466     11032
           1     0.6032    0.2272    0.3300      1325

    accuracy                         0.9011     12357
   macro avg     0.7584    0.6046    0.6383     12357
weighted avg     0.8804    0.9011    0.8805     12357



In [40]:
# Tear down the real-time endpoint together with its endpoint configuration so
# the hosting instance stops running.
# NOTE(review): the model resource created by `estimator.deploy` is not removed
# by this call — confirm whether it should be cleaned up as well.
predictor.delete_endpoint(delete_endpoint_config=True)
print("Endpoint deleted.")

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2025-06-27-03-06-33-413
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2025-06-27-03-06-33-413


Endpoint deleted.


In [None]:
# --- Batch Transform (predict entire test.csv as a file) --------------------
# transformer = estimator.transformer(
#     instance_count=1,
#     instance_type="ml.m5.xlarge",
#     strategy="SingleRecord",  # one row per request
#     assemble_with="Line",  # same CSV line format
# )

# transformer.transform(
#     data=f"s3://{BUCKET}/{PREFIX}/test/test.csv",
#     content_type="text/csv",
#     split_type="Line",
#     wait=True,
# )

# print("Batch outputs @", transformer.output_path)