<a href="https://colab.research.google.com/github/friedelj/ML540/blob/main/JFriedel_USD540_Assignment_4_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

JFriedel           USD540         Assignment 4.1     9June2025                  

In [None]:
!pip3 install -U sagemaker

In [None]:
import os
import boto3
import sagemaker

# One SageMaker session supplies the region and the default bucket used below.
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

# Execution role that is later handed to the estimator.
role = sagemaker.get_execution_role()

# S3 key prefix under which this notebook stores its files.
prefix = "DEMO-breast-cancer-prediction-xgboost-highlevel"

In [None]:
import pandas as pd
import numpy as np

# S3 client used for the download here and for fetching transform results later.
s3 = boto3.client("s3")

# Fetch wdbc.csv from the regional SageMaker example-files bucket.
filename = "wdbc.csv"
s3.download_file(
    Bucket=f"sagemaker-example-files-prod-{region}",
    Key="datasets/tabular/breast_cancer/wdbc.csv",
    Filename=filename,
)

# The file is read without a header row; column names are assigned explicitly.
data = pd.read_csv(filename, header=None)

# Column names extracted from wdbc.names: an id, the diagnosis label, then ten
# measurements, each in a `_mean`, `_se` and `_worst` variant (in that order).
data.columns = ["id", "diagnosis"] + [
    f"{measurement}_{variant}"
    for variant in ("mean", "se", "worst")
    for measurement in (
        "radius",
        "texture",
        "perimeter",
        "area",
        "smoothness",
        "compactness",
        "concavity",
        "concave points",
        "symmetry",
        "fractal_dimension",
    )
]

# Keep a local copy that includes the header row.
data.to_csv("data.csv", index=False)

data.sample(8)

In [None]:
# Encode the label as 1 for "M" and 0 for anything else.
# The guard keeps the cell idempotent: once the column is numeric, comparing it
# with "M" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(data["diagnosis"]):
    data["diagnosis"] = (data["diagnosis"] == "M").astype(int)
data.sample(8)

In [None]:
# Split the rows into three sets: training (~80%), validation (~10%) and
# batch inference (~10%). A fixed seed keeps the split identical after a
# kernel restart, so downstream results can be reproduced.
SPLIT_SEED = 42
rand_split = np.random.default_rng(SPLIT_SEED).random(len(data))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
batch_list = rand_split >= 0.9

# Training and validation sets keep the label but not the id.
data_train = data[train_list].drop(columns=["id"])
data_val = data[val_list].drop(columns=["id"])
# The batch set has no label; one variant keeps the id, the other drops it too.
data_batch = data[batch_list].drop(columns=["diagnosis"])
data_batch_noID = data_batch.drop(columns=["id"])

In [None]:
# Each split is written as a header-less, index-less CSV and uploaded under
# its own key prefix in the session's default bucket.

# Training set
train_file = "train_data.csv"
data_train.to_csv(train_file, header=False, index=False)
sess.upload_data(train_file, key_prefix=f"{prefix}/train")

# Validation set
validation_file = "validation_data.csv"
data_val.to_csv(validation_file, header=False, index=False)
sess.upload_data(validation_file, key_prefix=f"{prefix}/validation")

# Batch set, with the id column
batch_file = "batch_data.csv"
data_batch.to_csv(batch_file, header=False, index=False)
sess.upload_data(batch_file, key_prefix=f"{prefix}/batch")

# Batch set, without the id column (same key prefix as above)
batch_file_noID = "batch_data_noID.csv"
data_batch_noID.to_csv(batch_file_noID, header=False, index=False)
sess.upload_data(batch_file_noID, key_prefix=f"{prefix}/batch")

In [None]:
%%time
from time import gmtime, strftime

# Timestamped job name; it is also the last component of the S3 output location.
job_name = f"xgb-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"
output_location = f"s3://{bucket}/{prefix}/output/{job_name}"

# Container image for the built-in XGBoost algorithm.
image = sagemaker.image_uris.retrieve(
    framework="xgboost",
    version="1.7-1",
    region=boto3.Session().region_name,
)

sm_estimator = sagemaker.estimator.Estimator(
    image_uri=image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=50,
    input_mode="File",
    output_path=output_location,
    sagemaker_session=sess,
)

xgb_hyperparameters = {
    "objective": "binary:logistic",
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "verbosity": 0,
    "num_round": 100,
}
sm_estimator.set_hyperparameters(**xgb_hyperparameters)

# Both channels are CSV data read from an S3 prefix and fully replicated.
train_data, validation_data = (
    sagemaker.inputs.TrainingInput(
        s3_data=f"s3://{bucket}/{prefix}/{channel}",
        distribution="FullyReplicated",
        content_type="text/csv",
        s3_data_type="S3Prefix",
    )
    for channel in ("train", "validation")
)
data_channels = {"train": train_data, "validation": validation_data}

# Calling fit() on the estimator starts the training job.
sm_estimator.fit(inputs=data_channels, job_name=job_name, logs=True)

In [None]:
# Batch transformer built from the trained estimator: one ml.m5.xlarge instance.
sm_transformer = sm_estimator.transformer(instance_count=1, instance_type="ml.m5.xlarge")

In [None]:
%%time

sm_transformer = sm_estimator.transformer(instance_count=1, instance_type="ml.m5.xlarge")

# Run a transform job on the batch file that has no ID column.
input_location = f"s3://{bucket}/{prefix}/batch/{batch_file_noID}"
sm_transformer.transform(
    data=input_location,
    content_type="text/csv",
    split_type="Line",
)
sm_transformer.wait()

In [None]:
import re

def get_csv_output_from_s3(s3uri, batch_file):
    """Download a batch-transform output file from S3 and load it as a DataFrame.

    The object ``<s3uri>/<batch_file>.out`` is downloaded into the current
    working directory as ``<batch_file>.out`` using the module-level ``s3``
    client, then parsed as a comma-separated file without a header row.

    Args:
        s3uri: S3 location of the form ``s3://bucket/prefix`` that holds the
            output file. A trailing slash is tolerated.
        batch_file: Name of the input file; ``.out`` is appended to it to form
            the output file name.

    Returns:
        pandas.DataFrame: Contents of the downloaded file.

    Raises:
        ValueError: If ``s3uri`` is not an ``s3://bucket/...`` URI.
    """
    file_name = "{}.out".format(batch_file)
    # Strip a trailing slash so the object key never contains an empty "//" segment.
    full_uri = "{}/{}".format(str(s3uri).rstrip("/"), file_name)
    match = re.match(r"s3://([^/]+)/(.*)", full_uri)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.group().
        raise ValueError("Expected an S3 URI like 's3://bucket/prefix', got {!r}".format(s3uri))
    output_bucket, output_prefix = match.group(1), match.group(2)
    s3.download_file(output_bucket, output_prefix, file_name)
    return pd.read_csv(file_name, sep=",", header=None)

In [None]:
# Load the predictions written for the ID-less batch file.
output_df = get_csv_output_from_s3(
    s3uri=sm_transformer.output_path,
    batch_file=batch_file_noID,
)
output_df.head(n=8)

In [None]:
# The IO-joining feature needs accept / assemble_with on the transformer in
# addition to content_type / split_type on the transform call.
sm_transformer.accept = "text/csv"
sm_transformer.assemble_with = "Line"

# This job reads the batch file that still has the ID column; the input
# filter "$[1:]" removes it before the record is passed on.
input_location = f"s3://{bucket}/{prefix}/batch/{batch_file}"
sm_transformer.transform(
    data=input_location,
    content_type="text/csv",
    split_type="Line",
    join_source="Input",
    input_filter="$[1:]",
)
sm_transformer.wait()

In [None]:
# Load the joined output produced for the batch file that includes the ID column.
output_df = get_csv_output_from_s3(
    s3uri=sm_transformer.output_path,
    batch_file=batch_file,
)
output_df.head(n=8)

In [None]:
# Same joined transform as before, plus an output filter: "$[0,-1]" keeps only
# the first and last columns of each joined record.
sm_transformer.transform(
    data=input_location,
    content_type="text/csv",
    split_type="Line",
    join_source="Input",
    input_filter="$[1:]",
    output_filter="$[0,-1]",
)
sm_transformer.wait()

In [None]:
# Load the filtered output of the latest transform job.
output_df = get_csv_output_from_s3(
    s3uri=sm_transformer.output_path,
    batch_file=batch_file,
)
output_df.head(n=8)

In [None]:
# Inspect Training Job Details
# `info` was displayed without ever being assigned (NameError on a fresh
# kernel). Fetch the description of the training job started above through
# the session's low-level SageMaker client.
info = sess.sagemaker_client.describe_training_job(TrainingJobName=job_name)
info

In [None]:
# Create Endpoint Configuration

# Build the endpoint config name from the current time so that endpoint
# configs can be searched by creation time.
endpoint_config_name = 'lab4-1-endpoint-config' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

instance_type = 'ml.m5.xlarge'

# The SageMaker model to host: the one registered when the batch transformer
# was created from the estimator.
model_name = sm_transformer.model_name

# create_endpoint_config is an operation of the low-level SageMaker client,
# not of the `sagemaker` SDK module, so call it on the session's client.
endpoint_config_response = sess.sagemaker_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name, # You will specify this name in a CreateEndpoint request.
    # List of ProductionVariant objects, one for each model that you want to host at this endpoint.
    ProductionVariants=[
        {
            "VariantName": "variant1", # The name of the production variant.
            "ModelName": model_name,
            "InstanceType": instance_type, # Specify the compute instance type.
            "InitialInstanceCount": 1 # Number of instances to launch initially.
        }
    ]
)

print(f"Created EndpointConfig: {endpoint_config_response['EndpointConfigArn']}")

In [None]:
# Deploy our model to real-time endpoint

endpoint_name = 'lab4-1-endpoint' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# create_endpoint is an operation of the low-level SageMaker client; the
# `sagemaker` SDK module itself has no such function.
create_endpoint_response = sess.sagemaker_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

In [None]:
# Wait for endpoint to spin up
from time import sleep

# Poll the endpoint status until it is no longer "Creating".
# describe_endpoint is an operation of the low-level SageMaker client, not of
# the `sagemaker` SDK module.
while True:
    print("Getting Job Status")
    res = sess.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
    state = res["EndpointStatus"]

    if state == "InService":
        print("Endpoint in Service")
        break
    elif state == "Creating":
        print("Endpoint still creating...")
        sleep(30)  # wait 30 seconds between polls
    else:
        # Any other status is treated as a failure; show what the service reported.
        print("Endpoint Creation Error - Check Sagemaker Console")
        print("Endpoint status: {} {}".format(state, res.get("FailureReason", "")))
        break

In [None]:
# Invoke Endpoint

sagemaker_runtime = boto3.client("sagemaker-runtime", region_name=region)

# Serialize the ID-less batch set to CSV lines and send only the first one.
csv_records = data_batch_noID.to_csv(header=None, index=False).strip('\n').split('\n')
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',
    Body=csv_records[0],
)
print(response['Body'].read().decode('utf-8'))

In [None]:
# Examine Response Body
# Displays the full invoke_endpoint response dict; its 'Body' stream was
# already read by the print() in the previous cell.
response

In [None]:
# Delete Endpoint
# delete_endpoint is an operation of the low-level SageMaker client, not of
# the `sagemaker` SDK module.
sess.sagemaker_client.delete_endpoint(EndpointName=endpoint_name)

# S3

In [None]:
# Create S3 Bucket
import boto3
import sagemaker

# Region comes from a default boto3 session.
session = boto3.session.Session()
region = session.region_name

# The bucket is the default bucket of a SageMaker session.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# S3 client bound to that region.
s3 = boto3.Session().client("s3", region_name=region)

In [None]:
# Starts as False; the bucket check in the following cell sets it to True on success.
setup_s3_bucket_passed = False
print(f"Default bucket: {bucket}")

In [None]:
# Verify S3_BUCKET Bucket Creation
from botocore.client import ClientError

response = None

try:
    # head_bucket raises ClientError when the request for the bucket fails.
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e:
    # Make a failed re-run overwrite an earlier success.
    setup_s3_bucket_passed = False
    # Report the region: `response` is still None whenever head_bucket raised.
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(bucket, region, e))

In [None]:
# Persist the flag with IPython's %store so it can be restored later via `%store -r`.
%store setup_s3_bucket_passed

In [None]:
# With no arguments, %store lists the variables currently held in the store.
%store

In [None]:
# Check Pre-Requisites from an earlier notebook
%store -r setup_dependencies_passed
try:
    setup_dependencies_passed
except NameError:
    print("+++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.")
    print("+++++++++++++++++++++++++++++++")
print(setup_dependencies_passed)

In [None]:
%store -r setup_s3_bucket_passed
try:
    setup_s3_bucket_passed
except NameError:
    print("+++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.")
    print("+++++++++++++++++++++++++++++++")
print(setup_s3_bucket_passed)

In [None]:
# Print a framed error for each setup flag that is not truthy.
banner = "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
if not setup_dependencies_passed:
    print(
        banner,
        "[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.",
        banner,
        sep="\n",
    )
if not setup_s3_bucket_passed:
    print(
        banner,
        "[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.",
        banner,
        sep="\n",
    )

import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default bucket, and the execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region of the default boto3 session and the caller's AWS account id (from STS).
region = boto3.Session().region_name
caller_identity = boto3.client("sts").get_caller_identity()
account_id = caller_identity.get("Account")

# Low-level SageMaker client for that region.
sm = boto3.Session().client("sagemaker", region_name=region)

In [None]:
# Set S3 Destination Location (Our Private S3 Bucket)
s3_private_path_csv = f"s3://{bucket}/assmt2-ds/csv"
print(s3_private_path_csv)

In [None]:
# Persist the destination path with IPython's %store.
%store s3_private_path_csv

In [None]:
# Upload the local file dataset.csv to the private S3 location set above.
# NOTE(review): the source here is a local file, not a public S3 bucket, and no
# visible cell creates "dataset.csv" (only wdbc.csv and data.csv are written) —
# confirm the file exists in the working directory before running.
!aws s3 cp "dataset.csv" $s3_private_path_csv/

In [None]:
# Show the private S3 location whose contents are listed in the next cell.
print(s3_private_path_csv)

In [None]:
# List the objects under the private S3 location using the AWS CLI.
!aws s3 ls $s3_private_path_csv/

In [None]:
# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a clickable link to the assmt2-ds folder of the account's SageMaker bucket.
display(
    HTML(
        '<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/sagemaker-{}-{}/assmt2-ds/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            region, account_id, region
        )
    )
)

In [None]:
from sagemaker.session import Session

# Build a boto3 session pinned to the default region.
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# Clients created from that session for the SageMaker API and its runtime API.
sagemaker_client = boto_session.client("sagemaker", region_name=region)
sagemaker_runtime = boto_session.client("sagemaker-runtime", region_name=region)

# SageMaker SDK session wired to the session and clients above.
cancer_session = Session(
    sagemaker_runtime_client=sagemaker_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=boto_session,
)

In [None]:
# Use a bucket of your choosing
# Here the default bucket of `cancer_session` is used.
default_s3_bucket_name = cancer_session.default_bucket()
# Key prefix for this demo; note this rebinds the notebook-level `prefix` set earlier.
prefix = "sagemaker-cancer-demo"

print(default_s3_bucket_name)

In [None]:
from sagemaker import get_execution_role

# You can modify the following to use a role of your choosing. See the documentation for how to create this.
# get_execution_role() is the SageMaker SDK helper imported above; its result is printed for reference.
role = get_execution_role()
print(role)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io

# S3 client for the current region (no visible cell uses it afterwards).
s3_client = boto3.client("s3", region_name=region)

### Create a Model Group (Boto3)

# To create a Model Group by using Boto3, call the create_model_package_group API operation and specify a name and
# description as parameters. The following example shows how to create a Model Group. The response from the
# create_model_package_group call is the Amazon Resource Name (ARN) of the new Model Group.

# First, import the required packages and set up the SageMaker AI Boto3 client.

In [None]:
import time
import os
from sagemaker import get_execution_role, session
import boto3

# Execution role, as reported by the SageMaker SDK helper.
role = get_execution_role()

# Low-level SageMaker client for the default region; used for the Model Group calls.
region = boto3.Session().region_name
sm_client = boto3.client("sagemaker", region_name=region)

# Now create the Model Group.

In [None]:
import time
# The current Unix time (rounded) is appended to the group name.
model_package_group_name = f"Joes-breast-cancer-detector{round(time.time())}"
model_package_group_input_dict = dict(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageGroupDescription="Assmt4 using Breast Cancer Example",
)

# The response contains the ARN of the newly created Model Group.
create_model_package_group_response = sm_client.create_model_package_group(
    **model_package_group_input_dict
)
group_arn = create_model_package_group_response["ModelPackageGroupArn"]
print(f"ModelPackageGroup Arn : {group_arn}")

## Response Syntax

## Response Structure
# (dict) –

# ModelPackageGroupName (string) –

# The name of the model group.

# ModelPackageGroupArn (string) –

# The Amazon Resource Name (ARN) of the model group.

# ModelPackageGroupDescription (string) –

# A description of the model group.

# CreationTime (datetime) –

# The time that the model group was created.

# CreatedBy (dict) –

# Information about the user who created or modified an experiment, trial, trial component, lineage group, project, or model card.

# UserProfileArn (string) –

# The Amazon Resource Name (ARN) of the user’s profile.

# UserProfileName (string) –

# The name of the user’s profile.

# DomainId (string) –

# The domain associated with the user.

# IamIdentity (dict) –

# The IAM Identity details associated with the user. These details are associated with model package groups, model packages, and project entities only.

# Arn (string) –

# The Amazon Resource Name (ARN) of the IAM identity.

# PrincipalId (string) –

# The ID of the principal that assumes the IAM identity.

# SourceIdentity (string) –

# The person or application which assumes the IAM identity.

# ModelPackageGroupStatus (string) –

# The status of the model group.

In [None]:
from datetime import datetime
# Illustrative example of the response structure described above. The values
# are hand-written placeholders, not the output of an API call; the dict is
# only displayed as the cell's result and is not assigned to a variable.
{
    'ModelPackageGroupName': 'Joes-breast-cancer-detector',
    'ModelPackageGroupArn': 'ModelPackageGroupArn',
    'ModelPackageGroupDescription': 'string',
    'CreationTime': datetime(2015, 1, 1),
    'CreatedBy': {
        'UserProfileArn': 'string1',
        'UserProfileName': 'JoeF',
        'DomainId': 'string3',
        'IamIdentity': {
            'Arn': 'string4',
            'PrincipalId': 'string5',
            'SourceIdentity': 'string6'
        }
    },
    'ModelPackageGroupStatus': 'InProgress'
}

# Give your Model Group an informative name about what this group does, e.g. xgboost-breast-cancer-detection, and give
# it a brief but informative description of what this group does in a bit more detail (best practice is under ~250
# chars).

In [None]:
import boto3

sagemaker_client = boto3.client("sagemaker")

# A single list call is made; no NextToken is followed, so only the groups
# returned by that one response are printed.
response = sagemaker_client.list_model_package_groups()
group_summaries = response["ModelPackageGroupSummaryList"]

print("Available model package groups:")
for group in group_summaries:
    print(group["ModelPackageGroupName"])

In [None]:
import boto3

sagemaker_client = boto3.client('sagemaker')

# Describe the group created above. The name carries a timestamp suffix that
# changes on every run, so use the variable instead of a name copied from a
# previous run.
response = sagemaker_client.describe_model_package_group(
    ModelPackageGroupName=model_package_group_name
)

print(response)

## Part 2: Set Up Model Package

# The Model Package will contain specific details about our current model. Our Model Package should document model
# deployment information (instance image, model data source i.e. our binary artifact, data source, any pre-processor
# or post-processor scripts, etc.). After we learn more about Model monitoring, we can also include model quality,
# model data quality, model bias and model explainability reports here too!

In [None]:
from sagemaker import image_uris

# Inference image for the Model Package. Use the notebook's region and the
# same XGBoost container version (1.7-1) the model was trained with, rather
# than a hard-coded region and a different version.
image_uri = image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.7-1'
)

In [None]:
# Specify the model source.
# ModelDataUrl has to be the S3 URI of the trained model artifact. The previous
# value was an S3 console path to a CSV data file, which is neither an S3 URI
# nor a model. The estimator trained earlier records its artifact location.
model_url = sm_estimator.model_data

modelpackage_inference_specification = {
    "InferenceSpecification": {
        "Containers": [
            {
                "Image": image_uri,
                "ModelDataUrl": model_url,
            }
        ],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv"],
    }
}

# Alternatively, you can specify the model source like this:
# modelpackage_inference_specification["InferenceSpecification"]["Containers"][0]["ModelDataUrl"]=model_url

create_model_package_input_dict = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageDescription": "Model to detect breast cancer",
    "ModelApprovalStatus": "PendingManualApproval",
}
create_model_package_input_dict.update(modelpackage_inference_specification)

# Register the package in the Model Group. The request dict was previously
# built but never sent, so the group stayed empty.
create_model_package_response = sm_client.create_model_package(**create_model_package_input_dict)
print("ModelPackage Version ARN : {}".format(create_model_package_response["ModelPackageArn"]))

# Review the API documentation here and fill out what you can inside the Inference Specification section. It’s
# important to document what is useful to you and your team, but keep documentation brief so it is still readable.

### describe_model_package

In [None]:
import boto3

region = boto3.Session().region_name
client = boto3.client('sagemaker', region_name=region)

# List the packages of the group created in this notebook. The literal
# 'Joes-breast-cancer-detector' lacks the timestamp suffix appended at
# creation time, so it never matched that group.
response = client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
)
print(response)

# S3 Bucket Data: s3/buckets/sagemaker-us-east-1-244989531891?prefix=assmt2-ds/csv/breastcancer.csv

### Part 3: Write the Model Card

# Finally we have our Model Card. The Model Card will contain qualitative details about our current model. The Model
# Card can contain a lot of information. At a minimum, it should contain details of what the model algorithm is, how
# the model was trained, what hyperparameters were used to train the model, what the input features for the model are,
# who the model owner is (you), what problem the model is trying to solve, intended uses of the model, evaluation
# details of the model, and so on.

###                                 See PDF