## Use k-NN to predict whether or not a patient has breast cancer

Goals:
1. Create a k-NN model.
2. Down-select the features.
3. Use the protobuf format for the training and test data.


In [2]:
# Import Libraries

import boto3
import sagemaker
import sagemaker.amazon.common as smac
import io
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [3]:
# SageMaker session; in the visible cells it is only used to look up the
# default bucket.
sm = sagemaker.Session()

bucket = sm.default_bucket()
# S3 key prefix under which the raw data, the train/test sets and the model
# output of this notebook are stored.
prefix = 'breast-cancer'
# Base name of the data objects in S3 and stem of the local CSV file.
key = "breast-cancer"

In [23]:
# Download raw data from s3 bucket
# Fetches the object "{prefix}/{key}" from `bucket` and saves it in the
# working directory as "{key}.csv".
# NOTE(review): the object key used here has no ".csv" suffix, while the
# commented-out S3 path in the following cell ends in "breast-cancer.csv".
# Confirm which key actually exists in the bucket.
s3 = boto3.client("s3")
s3.download_file(bucket, f"{prefix}/{key}", f"{key}.csv")

In [4]:

# Alternative: read the CSV over the network straight from S3, e.g.
#   pd.read_csv('s3://sagemaker-us-east-1-258532878709/breast-cancer/breast-cancer.csv')

# Read the copy stored locally on the SageMaker instance.
bc_df = pd.read_csv('breast-cancer.csv')

# Select the model inputs by name rather than by position, so the selection
# cannot silently change if the column order of the CSV changes.
FEATURE_COLUMNS = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']
features = bc_df[FEATURE_COLUMNS]
# Raw diagnosis labels; they are encoded to integers in a later cell.
target = bc_df['diagnosis']


In [40]:
features 

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean
0,17.99,10.38,122.80,1001.0
1,20.57,17.77,132.90,1326.0
2,19.69,21.25,130.00,1203.0
3,11.42,20.38,77.58,386.1
4,20.29,14.34,135.10,1297.0
...,...,...,...,...
564,21.56,22.39,142.00,1479.0
565,20.13,28.25,131.20,1261.0
566,16.60,28.08,108.30,858.1
567,20.60,29.33,140.10,1265.0


In [6]:
# Feature Engineering/Pre-processing

# Encode the diagnosis labels as integers. LabelEncoder orders its classes,
# so 'B' maps to 0 and 'M' maps to 1.
le = preprocessing.LabelEncoder()
le.fit(['M','B'])

# Encode from the raw dataframe column instead of re-encoding `target` in
# place. Overwriting `target` with its own encoded values made the cell fail
# when run a second time, because the integers are not known labels.
target = le.transform(bc_df['diagnosis'])
       
       

In [7]:
# Hold out 33% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=42,
)

In [31]:
y_test

array([0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0])

In [8]:
features.shape

(569, 4)

In [37]:
# Create protobuf for training
# Serialize the training features and labels into an in-memory buffer.

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, X_train.to_numpy(), y_train)
# Rewind so the upload in the next cell reads the buffer from the start.
buf.seek(0)

0

In [38]:
# Load to s3

# S3 object keys always use "/" as the separator. Build the key once with an
# f-string (os.path.join would use the separator of the host OS) and reuse it
# for both the upload and the URI passed to the training job, so the two
# cannot drift apart.
train_key = f"{prefix}/train/{key}"
boto3.resource("s3").Bucket(bucket).Object(train_key).upload_fileobj(buf)
s3_train_data = f"s3://{bucket}/{train_key}"
print(f"uploaded training data location: {s3_train_data}")



uploaded training data location: s3://sagemaker-us-east-1-258532878709/breast-cancer/train/breast-cancer


In [39]:
# Create protobuf for testing and load it to s3

# Serialize the held-out features and labels into a fresh in-memory buffer.
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, X_test.to_numpy(), y_test)
# Rewind so the upload reads the buffer from the start.
buf.seek(0)

# S3 object keys always use "/" as the separator, so build the key once with
# an f-string (os.path.join would use the separator of the host OS) and reuse
# it for both the upload and the URI.
test_key = f"{prefix}/test/{key}"
boto3.resource("s3").Bucket(bucket).Object(test_key).upload_fileobj(buf)
s3_test_data = f"s3://{bucket}/{test_key}"
# This is the test set, so say so (the message used to claim "training").
print(f"uploaded test data location: {s3_test_data}")


uploaded training data location: s3://sagemaker-us-east-1-258532878709/breast-cancer/test/breast-cancer


In [41]:
import matplotlib.pyplot as plt

import sagemaker
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

from sagemaker.amazon.amazon_estimator import get_image_uri


def trained_estimator_from_hyperparams(s3_train_data, hyperparams, output_path, s3_test_data=None):
    """
    Create a k-NN Estimator from the given hyperparams and fit it.

    Parameters
    ----------
    s3_train_data : str
        S3 location of the training data; passed to ``fit`` as the "train" channel.
    hyperparams : dict
        Hyperparameters forwarded to ``Estimator.set_hyperparameters``.
    output_path : str
        S3 location handed to the Estimator as ``output_path``.
    s3_test_data : str, optional
        S3 location of the test data. When given, it is passed to ``fit`` as
        the "test" channel; when None, only the "train" channel is used.

    Returns
    -------
    sagemaker.estimator.Estimator
        The fitted estimator. Nothing is deployed here; call ``deploy`` on the
        returned estimator to create an endpoint.
    """
    # Look up the built-in k-NN container image for the current region.
    # sagemaker.image_uris.retrieve is the SDK v2 replacement for the
    # deprecated get_image_uri helper, which only emitted a rename warning.
    image_uri = sagemaker.image_uris.retrieve(
        framework="knn", region=boto3.Session().region_name
    )

    # set up the estimator
    knn = sagemaker.estimator.Estimator(
        image_uri,
        get_execution_role(),
        instance_count=1,
        instance_type="ml.m5.2xlarge",
        output_path=output_path,
        sagemaker_session=sagemaker.Session(),
    )
    knn.set_hyperparameters(**hyperparams)

    # train a model. fit_input contains the locations of the train and
    # (optionally) test data
    fit_input = {"train": s3_train_data}
    if s3_test_data is not None:
        fit_input["test"] = s3_test_data
    knn.fit(fit_input)
    return knn

In [42]:
# feature_dim has to equal the number of feature columns, so derive it from
# the training data instead of hard-coding 4; it then stays correct if the
# feature selection changes.
# NOTE(review): k=2 is even, so a two-class neighbour vote can tie — consider
# an odd k.
hyperparams = {
    "feature_dim": X_train.shape[1],
    "k": 2,
    "sample_size": 100,
    "predictor_type": "classifier",
}
output_path = f"s3://{bucket}/{prefix}/default_example/output"
# Passing the test set as well makes it available to the training job as the
# "test" channel.
knn_estimator = trained_estimator_from_hyperparams(
    s3_train_data, hyperparams, output_path, s3_test_data=s3_test_data
)

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2022-09-13 19:02:28 Starting - Starting the training job...
2022-09-13 19:02:54 Starting - Preparing the instances for trainingProfilerReport-1663095748: InProgress
......
2022-09-13 19:03:55 Downloading - Downloading input data...
2022-09-13 19:04:14 Training - Downloading the training image..................
2022-09-13 19:07:26 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[09/13/2022 19:07:28 INFO 140118698051392] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'_kvstore': 'dist_async', '_log_level': 'info', '_num_gpus': 'auto', '_num_kv_servers': '1', '_tuning_objective_metric': '', '_faiss_index_nprobe': '5', 'epochs': '1', 'feature_dim': 'auto', 'faiss_index_ivf_nlists': 'auto', 'index_metric': 'L2', 'index_type': 'faiss.Flat', 'mini_batch_size': '5000', '_enable_profiler':

## Create Endpoints

In [43]:
import time

instance_type = "ml.m4.xlarge"
# NOTE(review): model_name is not referenced by any other visible cell.
model_name = "knn_%s" % instance_type
# Derive the endpoint name from instance_type instead of repeating the
# instance type as a literal, so the name stays accurate when instance_type
# is changed. Dots are swapped for hyphens in both parts; the timestamp
# suffix makes the name unique for every run.
endpoint_suffix = str(time.time()).replace(".", "-")
endpoint_name = "knn-%s-%s" % (instance_type.replace(".", "-"), endpoint_suffix)
print("setting up the endpoint..")

setting up the endpoint..


In [44]:
# Stand up a single-instance real-time endpoint for the trained model.
predictor = knn_estimator.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
)

# Requests are sent as CSV; responses are parsed as JSON.
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()



-------------!

## Make Inferences

In [49]:
# Score only the first row of the held-out features.
single_pred = X_test.iloc[0].to_numpy()
result = predictor.predict(
    single_pred,
    initial_args={"ContentType": "text/csv"},
)
result

In [57]:
# Test on the entire held-out set: every row of X_test goes to the endpoint
# in a single request.

all_results = predictor.predict(X_test.to_numpy(), initial_args={"ContentType": "text/csv"})


In [58]:
all_results

{'predictions': [{'predicted_label': 0.0},
  {'predicted_label': 1.0},
  {'predicted_label': 1.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 1.0},
  {'predicted_label': 1.0},
  {'predicted_label': 1.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 1.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 1.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 1.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 1.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 1.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'predicted_label': 0.0},
  {'p

In [59]:
# Pull the predicted label out of every entry of the endpoint response.
cur_predictions = np.array(
    [entry["predicted_label"] for entry in all_results["predictions"]]
)

In [8]:
# Given these features
X_test

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean
204,12.47,18.60,81.09,481.9
70,18.94,21.31,123.60,1130.0
131,15.46,19.48,101.70,748.9
431,12.40,17.68,81.47,467.8
540,11.54,14.44,74.65,402.9
...,...,...,...,...
141,16.11,18.05,105.10,813.0
498,18.49,17.52,121.30,1068.0
7,13.71,20.83,90.20,577.9
541,14.47,24.99,95.81,656.4


In [61]:
# these are the predicted results
cur_predictions

array([0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1.,
       0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
       0.])

## Evaluate

In [65]:
# The element-wise comparison is only meaningful when there is exactly one
# prediction per test label, so check that first instead of letting a
# mismatch surface as a confusing error (or a wrong score) further down.
assert len(cur_predictions) == len(y_test), "prediction/label count mismatch"

# Count the matches with NumPy rather than the builtin sum(), which iterates
# the array element by element in Python.
num_correct = int(np.count_nonzero(cur_predictions == y_test))
accuracy = num_correct / len(y_test)

In [66]:
num_correct

166

In [67]:
accuracy 

0.8829787234042553

In [68]:
def delete_endpoint(predictor):
    """
    Delete the SageMaker endpoint behind ``predictor`` (best effort).

    Parameters
    ----------
    predictor
        Object whose ``endpoint_name`` attribute names the endpoint to delete.

    Prints a confirmation on success. If the delete call fails, the actual
    error is printed instead of being raised, so a cleanup cell can be re-run
    safely. KeyboardInterrupt and SystemExit are not swallowed.
    """
    endpoint = predictor.endpoint_name
    try:
        boto3.client("sagemaker").delete_endpoint(EndpointName=endpoint)
    except Exception as err:
        # Report the real cause: a failure is not necessarily "already
        # deleted" (it may be a permissions, network or credentials problem).
        print(f"Could not delete {endpoint}: {err}")
    else:
        print(f"Deleted {endpoint}")


delete_endpoint(predictor)

Deleted knn-ml-m4-xlarge-1663096097-5858493


References:

