# Breast Cancer Prediction

Predict Breast Cancer using SageMaker's linear learner algorithm

In [3]:
import os
import boto3
import re
import sagemaker

try:
    # get_execution_role() will only work within Sagemaker studio or notebook instance
    role = sagemaker.get_execution_role()
except ValueError:
    # Will need to get the role ARN by initializing a a new IAM session and get the role by their name
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20220817T160055')['Role']['Arn']
    print("Role ARN successfully extracted")


region = boto3.Session().region_name

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket and prefix
bucket = sagemaker.Session().default_bucket()

prefix = (
    "sagemaker/aws-sagemaker-training/DEMO-breast-cancer-prediction"  # place to upload training files within the bucket
)

Couldn't call 'get_role' to get Role ARN from role name francisco-admin to get Role path.


Role ARN successfully extracted


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac

## Data donwload and prep

In [5]:
s3 = boto3.client("s3")

filename = "data-samples\wdbc.csv"
s3.download_file("sagemaker-sample-files", "datasets/tabular/breast_cancer/wdbc.csv", filename)
data = pd.read_csv(filename, header=None)

# specify columns extracted from wbdc.names
data.columns = [
    "id",
    "diagnosis",
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concave points_mean",
    "symmetry_mean",
    "fractal_dimension_mean",
    "radius_se",
    "texture_se",
    "perimeter_se",
    "area_se",
    "smoothness_se",
    "compactness_se",
    "concavity_se",
    "concave points_se",
    "symmetry_se",
    "fractal_dimension_se",
    "radius_worst",
    "texture_worst",
    "perimeter_worst",
    "area_worst",
    "smoothness_worst",
    "compactness_worst",
    "concavity_worst",
    "concave points_worst",
    "symmetry_worst",
    "fractal_dimension_worst",
]

# save the data
data.to_csv("data-samples\data.csv", sep=",", index=False)

# print the shape of the data file
print(data.shape)

# show the top few rows
display(data.head())

# describe the data object
display(data.describe())

# we will also summarize the categorical field diganosis
display(data.diagnosis.value_counts())

(569, 32)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


B    357
M    212
Name: diagnosis, dtype: int64

In [7]:
# Split data into train, validation and test (80/10/10)
rand_split = np.random.rand(len(data))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

data_train = data[train_list]
data_val = data[val_list]
data_test = data[test_list]

train_y = ((data_train.iloc[:, 1] == "M") + 0).to_numpy()
train_X = data_train.iloc[:, 2:].to_numpy()

val_y = ((data_val.iloc[:, 1] == "M") + 0).to_numpy()
val_X = data_val.iloc[:, 2:].to_numpy()

test_y = ((data_test.iloc[:, 1] == "M") + 0).to_numpy()
test_X = data_test.iloc[:, 2:].to_numpy()

In [8]:
# Conver data to recordIO format
train_file = "linear_train.data"
f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype("float32"), train_y.astype("float32"))
f.seek(0)
print(os.path.join(prefix, "train", train_file).replace("\\","/"))
boto3.Session().resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", train_file).replace("\\","/")).upload_fileobj(f)

sagemaker/aws-sagemaker-training/DEMO-breast-cancer-prediction/train/linear_train.data


In [9]:
validation_file = "linear_validation.data"
f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_X.astype("float32"), val_y.astype("float32"))
f.seek(0)
print(os.path.join(prefix, "validation", validation_file).replace("\\","/"))
boto3.Session().resource("s3").Bucket(bucket).Object(os.path.join(prefix, "validation", validation_file).replace("\\","/")).upload_fileobj(f)

sagemaker/aws-sagemaker-training/DEMO-breast-cancer-prediction/validation/linear_validation.data


## Train

In [10]:
from sagemaker import image_uris

container = image_uris.retrieve(region=boto3.Session().region_name, framework="linear-learner")

In [11]:
# Define training job parameters
linear_job = "DEMO-linear-cancer-prediction" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

print("Job name is:", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.c4.2xlarge", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
    ],
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/".format(bucket, prefix)},
    "HyperParameters": {
        "feature_dim": "30",
        "mini_batch_size": "100",
        "predictor_type": "regressor",
        "epochs": "10",
        "num_models": "16",
        "loss": "absolute_loss",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
}

Job name is: DEMO-linear-cancer-prediction2023-01-22-18-42-19


In [12]:
# Kick off training job
%%time

region = boto3.Session().region_name
sm = boto3.client("sagemaker")

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName=linear_job)["TrainingJobStatus"]
print(status)
sm.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=linear_job)
if status == "Failed":
    message = sm.describe_training_job(TrainingJobName=linear_job)["FailureReason"]
    print("Training failed with the following error: {}".format(message))
    raise Exception("Training job failed")

InProgress
CPU times: total: 62.5 ms
Wall time: 4min


## Model hosting

In [13]:
# Create model definition in Sagemaker so it can be used for hosting, and create model with such definition
linear_hosting_container = {
    "Image": container,
    "ModelDataUrl": sm.describe_training_job(TrainingJobName=linear_job)["ModelArtifacts"][
        "S3ModelArtifacts"
    ],
}

create_model_response = sm.create_model(
    ModelName=linear_job, ExecutionRoleArn=role, PrimaryContainer=linear_hosting_container
)

print(create_model_response["ModelArn"])

arn:aws:sagemaker:eu-west-2:842780680566:model/demo-linear-cancer-prediction2023-01-22-18-42-19


In [14]:
# Configure endpoint definition using model defined in previous step

linear_endpoint_config = "DEMO-linear-endpoint-config-cancer-" + time.strftime(
    "%Y-%m-%d-%H-%M-%S", time.gmtime()
)
print(linear_endpoint_config)

create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[
        {
            "InstanceType": "ml.m4.xlarge",
            "InitialInstanceCount": 1,
            "ModelName": linear_job,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

DEMO-linear-endpoint-config-cancer-2023-01-22-18-52-41
Endpoint Config Arn: arn:aws:sagemaker:eu-west-2:842780680566:endpoint-config/demo-linear-endpoint-config-cancer-2023-01-22-18-52-41


In [16]:
%%time
# Create endpoint using endpoint configuration previously defined (right now it is by name)
linear_endpoint = "DEMO-linear-endpoint-cancer-" + time.strftime("%Y%m%d%H%M", time.gmtime())
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint, EndpointConfigName=linear_endpoint_config
)
print(create_endpoint_response["EndpointArn"])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Status: " + status)

sm.get_waiter("endpoint_in_service").wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

if status != "InService":
    raise Exception("Endpoint creation did not succeed")

DEMO-linear-endpoint-cancer-202301221854
arn:aws:sagemaker:eu-west-2:842780680566:endpoint/demo-linear-endpoint-cancer-202301221854
Status: Creating
Arn: arn:aws:sagemaker:eu-west-2:842780680566:endpoint/demo-linear-endpoint-cancer-202301221854
Status: InService
CPU times: total: 15.6 ms
Wall time: 4min 1s


## Predict on test data (using endpoint created)

In [17]:
# function to convert an array to a csv, currently our data is in an numpy array
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=",", fmt="%g")
    return csv.getvalue().decode().rstrip()

In [20]:
# Example of how it looks like (test data)
test_X

array([[9.504e+00, 1.244e+01, 6.034e+01, ..., 6.227e-02, 2.450e-01,
        7.773e-02],
       [1.665e+01, 2.138e+01, 1.100e+02, ..., 2.095e-01, 3.613e-01,
        9.564e-02],
       [1.927e+01, 2.647e+01, 1.279e+02, ..., 1.785e-01, 3.672e-01,
        1.123e-01],
       ...,
       [1.522e+01, 3.062e+01, 1.034e+02, ..., 2.356e-01, 4.089e-01,
        1.409e-01],
       [2.156e+01, 2.239e+01, 1.420e+02, ..., 2.216e-01, 2.060e-01,
        7.115e-02],
       [2.013e+01, 2.825e+01, 1.312e+02, ..., 1.628e-01, 2.572e-01,
        6.637e-02]])

In [18]:
payload = np2csv(test_X)

In [21]:
# resulting test data after conversion
payload

'9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,0.06905,0.2773,0.9768,1.909,15.7,0.009606,0.01432,0.01985,0.01421,0.02027,0.002968,10.23,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773\n16.65,21.38,110,904.6,0.1121,0.1457,0.1525,0.0917,0.1995,0.0633,0.8068,0.9017,5.455,102.6,0.006048,0.01882,0.02741,0.0113,0.01468,0.002801,26.46,31.56,177,2215,0.1805,0.3578,0.4695,0.2095,0.3613,0.09564\n19.27,26.47,127.9,1162,0.09401,0.1719,0.1657,0.07593,0.1853,0.06261,0.5558,0.6062,3.528,68.17,0.005015,0.03318,0.03497,0.009643,0.01543,0.003896,24.15,30.9,161.4,1813,0.1509,0.659,0.6091,0.1785,0.3672,0.1123\n13.64,16.34,87.21,571.8,0.07685,0.06059,0.01857,0.01723,0.1353,0.05953,0.1872,0.9234,1.449,14.55,0.004477,0.01177,0.01079,0.007956,0.01325,0.002551,14.67,23.19,96.08,656.7,0.1089,0.1582,0.105,0.08586,0.2346,0.08025\n14.71,21.59,95.55,656.9,0.1137,0.1365,0.1293,0.08123,0.2027,0.06758,0.4226,1.15,2.735,40.09,0.003659,0.02855,0.02572,0.01272,0.01817,0.004108,17.87,30.7,115

In [22]:
# Lets invoke the endpoint and get predictions
runtime = boto3.client("runtime.sagemaker") # use client to request/interact with deployed resources (endpoints)
response = runtime.invoke_endpoint(
    EndpointName=linear_endpoint, ContentType="text/csv", Body=payload
)

In [23]:
# How the response looks like
response

{'ResponseMetadata': {'RequestId': 'adaa9566-4399-4e26-93be-dce132a7e9f1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'adaa9566-4399-4e26-93be-dce132a7e9f1',
   'x-amzn-invoked-production-variant': 'AllTraffic',
   'date': 'Sun, 22 Jan 2023 19:02:40 GMT',
   'content-type': 'application/json',
   'content-length': '1882'},
  'RetryAttempts': 0},
 'ContentType': 'application/json',
 'InvokedProductionVariant': 'AllTraffic',
 'Body': <botocore.response.StreamingBody at 0x284aef5b640>}

In [24]:
# We are interested in the Body key, but we need to decode the response
result = json.loads(response["Body"].read().decode())

In [25]:
result

{'predictions': [{'score': -0.2331850528717041},
  {'score': 1.0932550430297852},
  {'score': 1.0871968269348145},
  {'score': -0.06412887573242188},
  {'score': 0.6143367290496826},
  {'score': -0.12577486038208008},
  {'score': 0.8813326358795166},
  {'score': 0.7630813121795654},
  {'score': 0.10799646377563477},
  {'score': -0.40035784244537354},
  {'score': -0.05119895935058594},
  {'score': -0.07844233512878418},
  {'score': 0.041510820388793945},
  {'score': -0.014040827751159668},
  {'score': 0.5170663595199585},
  {'score': -0.02531719207763672},
  {'score': -0.13097620010375977},
  {'score': 0.8052535057067871},
  {'score': 0.5421257019042969},
  {'score': -0.2455536127090454},
  {'score': 0.06883561611175537},
  {'score': 0.053117990493774414},
  {'score': -0.028899312019348145},
  {'score': 0.12138700485229492},
  {'score': 0.0872201919555664},
  {'score': -0.5071595907211304},
  {'score': -0.2868075370788574},
  {'score': -0.2068941593170166},
  {'score': -0.41307020187377

In [26]:
test_pred = np.array([r["score"] for r in result["predictions"]])

In [28]:
test_pred

array([-0.23318505,  1.09325504,  1.08719683, -0.06412888,  0.61433673,
       -0.12577486,  0.88133264,  0.76308131,  0.10799646, -0.40035784,
       -0.05119896, -0.07844234,  0.04151082, -0.01404083,  0.51706636,
       -0.02531719, -0.1309762 ,  0.80525351,  0.5421257 , -0.24555361,
        0.06883562,  0.05311799, -0.02889931,  0.121387  ,  0.08722019,
       -0.50715959, -0.28680754, -0.20689416, -0.4130702 , -0.10879183,
       -0.23924589,  1.12382221,  0.82244921,  0.01518154,  0.88503957,
        0.02028441, -0.00832939,  0.00830579,  0.02797639,  0.11340928,
        0.02898455,  0.22266173, -0.2330035 ,  0.04445839,  0.01909447,
        0.76816487,  0.11423612,  0.28065157,  0.21183276,  0.0723331 ,
       -0.00970531,  0.10916531,  0.04738808, -0.1073916 , -0.24419856,
        1.20330739,  1.29258513,  0.94459391])

In [29]:
# Lets evaluate the model
test_mae_linear = np.mean(np.abs(test_y - test_pred))
test_mae_baseline = np.mean(
    np.abs(test_y - np.median(train_y))
)  ## training median as baseline predictor

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear, 3))

Test MAE Baseline : 0.259
Test MAE Linear: 0.152


In [30]:
test_pred_class = (test_pred > 0.5) + 0
test_pred_baseline = np.repeat(np.median(train_y), len(test_y))

prediction_accuracy = np.mean((test_y == test_pred_class)) * 100
baseline_accuracy = np.mean((test_y == test_pred_baseline)) * 100

print("Prediction Accuracy:", round(prediction_accuracy, 1), "%")
print("Baseline Accuracy:", round(baseline_accuracy, 1), "%")

Prediction Accuracy: 100.0 %
Baseline Accuracy: 74.1 %


## Delete endpoint

In [31]:
sm.delete_endpoint(EndpointName=linear_endpoint)

{'ResponseMetadata': {'RequestId': '0a01461b-6b1f-474c-963c-ce57c1ada7fa',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0a01461b-6b1f-474c-963c-ce57c1ada7fa',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sun, 22 Jan 2023 19:08:40 GMT'},
  'RetryAttempts': 0}}