In [205]:
import io
import json
import sagemaker.amazon.common as smac
import pandas as pd
import boto3
import sagemaker
import os
import numpy as np
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from time import gmtime, strftime
from sklearn.metrics import confusion_matrix

In [206]:
bucket = sagemaker.Session().default_bucket()
prefix = "angel-investor/knn"
key = "recordio-knn-data"

In [207]:
buf = io.BytesIO()

In [208]:
!ls data

companies.csv  test_features.csv  train_features.csv  validate_features.csv
startup.xls    testing.csv	  training.csv	      validate_labels.csv
startup.xlsx   test_labels.csv	  train_labels.csv    validation.csv


# Training K-Nearest Neighbor Model

## Formatting training data for modeling

In [209]:
train_features = pd.read_csv("data/train_features.csv").to_numpy()
print("train features share = ", train_features.shape)
train_features

train features share =  (14604, 984)


array([[1.0000000e+06, 1.0000000e+00, 2.0130000e+03, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [2.5065701e+07, 2.0000000e+00, 2.0080000e+03, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [2.5000000e+05, 1.0000000e+00, 2.0060000e+03, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       ...,
       [2.0000000e+05, 1.0000000e+00, 2.0090000e+03, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.0000000e+06, 1.0000000e+00, 2.0050000e+03, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [2.0000000e+06, 1.0000000e+00, 2.0130000e+03, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00]])

In [210]:
train_labels = pd.read_csv("data/train_labels.csv").to_numpy()
train_labels = np.reshape(train_labels, -1)
print("train labeles share = ", train_labels.shape)
train_labels

train labeles share =  (14604,)


array([0, 0, 0, ..., 0, 0, 0])

In [211]:
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, train_features, train_labels)
buf.seek(0)

0

In [212]:
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", key)).upload_fileobj(buf)
s3_train_data = f"s3://{bucket}/{prefix}/train/{key}"
print(f"uploaded training data location: {s3_train_data}")

uploaded training data location: s3://sagemaker-us-east-1-215019911230/angel-investor/knn/train/recordio-knn-data


## Formatting test data for modeling

In [213]:
test_features = pd.read_csv("data/test_features.csv").to_numpy()
test_labels = pd.read_csv("data/test_labels.csv").to_numpy()
test_labels = np.reshape(test_labels, -1)

In [214]:
print(f"test_features shape = {test_features.shape}")
print(f"test_labels shape = {test_labels.shape}")

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, test_features, test_labels)
buf.seek(0)

boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "test", key)).upload_fileobj(buf)
s3_test_data = f"s3://{bucket}/{prefix}/test/{key}"
print(f"uploaded test data location: {s3_test_data}")

test_features shape = (4380, 984)
test_labels shape = (4380,)
uploaded test data location: s3://sagemaker-us-east-1-215019911230/angel-investor/knn/test/recordio-knn-data


## Training

Useful reference from [this notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/k_nearest_neighbors_covtype/k_nearest_neighbors_covtype.ipynb) for code below

In [215]:
import matplotlib.pyplot as plt

import sagemaker
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

from sagemaker.amazon.amazon_estimator import get_image_uri


In [216]:
output_path = "s3://sagemaker-us-east-1-215019911230/angel-investor/knn"
hyperparams = {"feature_dim": 984, "k": 10, "sample_size": 1000, "predictor_type": "classifier"}

In [217]:
knn = sagemaker.estimator.Estimator(
        get_image_uri(boto3.Session().region_name, "knn"),
        get_execution_role(),
        instance_count=10,
        instance_type="ml.m5.2xlarge",
        output_path=output_path,
        sagemaker_session=sagemaker.Session(),
    )

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


In [218]:
fit_input = {"train": s3_train_data}
knn.set_hyperparameters(**hyperparams)
knn.fit(fit_input)

2021-08-04 09:11:40 Starting - Starting the training job...
2021-08-04 09:11:42 Starting - Launching requested ML instancesProfilerReport-1628068299: InProgress
...
2021-08-04 09:12:38 Starting - Preparing the instances for training.........
2021-08-04 09:14:07 Downloading - Downloading input data
2021-08-04 09:14:07 Training - Downloading the training image............
2021-08-04 09:16:11 Training - Training image download completed. Training in progress.[35mDocker entrypoint called with argument(s): train[0m
[35mRunning default environment configuration script[0m
[33mDocker entrypoint called with argument(s): train[0m
[33mRunning default environment configuration script[0m
[33m[08/04/2021 09:15:59 INFO 140556697474880] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'_kvstore': 'dist_async', '_log_level': 'info', '_num_gpus': 'auto', '_num_kv_servers': '1', '_tuning_objective_metric': '', '_faiss_index_nprobe

[33m[08/04/2021 09:16:03 INFO 140556697474880] Final configuration: {'_kvstore': 'dist_async', '_log_level': 'info', '_num_gpus': 'auto', '_num_kv_servers': '1', '_tuning_objective_metric': '', '_faiss_index_nprobe': '5', 'epochs': '1', 'feature_dim': '984', 'faiss_index_ivf_nlists': 'auto', 'index_metric': 'L2', 'index_type': 'faiss.Flat', 'mini_batch_size': '5000', '_enable_profiler': 'false', 'predictor_type': 'classifier', 'sample_size': '1000', 'k': '10'}[0m
[35m[08/04/2021 09:16:05 INFO 140490656757568] Final configuration: {'_kvstore': 'dist_async', '_log_level': 'info', '_num_gpus': 'auto', '_num_kv_servers': '1', '_tuning_objective_metric': '', '_faiss_index_nprobe': '5', 'epochs': '1', 'feature_dim': '984', 'faiss_index_ivf_nlists': 'auto', 'index_metric': 'L2', 'index_type': 'faiss.Flat', 'mini_batch_size': '5000', '_enable_profiler': 'false', 'predictor_type': 'classifier', 'sample_size': '1000', 'k': '10'}[0m
[32mDocker entrypoint called with argument(s): train[0m
[

[33m[08/04/2021 09:16:11 INFO 140250168227648] Environment: {'ENVROOT': '/opt/amazon', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'HOSTNAME': 'ip-10-0-134-102.ec2.internal', 'TRAINING_JOB_NAME': 'knn-2021-08-04-09-11-39-857', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-east-1:215019911230:training-job/knn-2021-08-04-09-11-39-857', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/7705f946-3de4-4d50-b34e-ba1e895cbe87', 'CANONICAL_ENVROOT': '/opt/amazon', 'PYTHONUNBUFFERED': 'TRUE', 'NVIDIA_VISIBLE_DEVICES': 'void', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python3.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'PWD': '/', 'LANG': 'en_US.utf8', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/

[36m[08/04/2021 09:16:17 ERROR 140202387842880] nvidia-smi: failed to run (127): b'/bin/sh: nvidia-smi: command not found'/[0m
[36m[08/04/2021 09:16:17 INFO 140202387842880] Using per-worker sample size = 100 (Available virtual memory = 31098032128 bytes, GPU free memory = 0 bytes, number of workers = 10). If an out-of-memory error occurs, choose a larger instance type, use dimension reduction, decrease sample_size, and/or decrease mini_batch_size.[0m
[36m[08/04/2021 09:16:17 INFO 140202387842880] Starting cluster...[0m
[36m[08/04/2021 09:16:17 INFO 140200234141440] concurrency model: async[0m
[36m[08/04/2021 09:16:17 INFO 140202387842880] ...Cluster started[0m
[36m[08/04/2021 09:16:17 INFO 140200234141440] masquerade (NAT) address: None[0m
[36m[08/04/2021 09:16:17 INFO 140200234141440] passive ports: None[0m
[36m[08/04/2021 09:16:17 INFO 140200234141440] >>> starting FTP server on 0.0.0.0:8999, pid=1 <<<[0m
[36m[08/04/2021 09:16:17 INFO 140202387842880] Verifying conn

[36m[2021-08-04 09:16:18.306] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 1, "duration": 349, "num_examples": 3, "num_bytes": 115663680}[0m
[36m[08/04/2021 09:16:18 INFO 140202387842880] #progress_metric: host=algo-3, completed 100.0 % of epochs[0m
[36m#metrics {"StartTime": 1628068577.9566226, "EndTime": 1628068578.3112347, "Dimensions": {"Algorithm": "AWS/KNN", "Host": "algo-3", "Operation": "training", "epoch": 0, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 14604.0, "count": 1, "min": 14604, "max": 14604}, "Total Batches Seen": {"sum": 3.0, "count": 1, "min": 3, "max": 3}, "Max Records Seen Between Resets": {"sum": 14604.0, "count": 1, "min": 14604, "max": 14604}, "Max Batches Seen Between Resets": {"sum": 3.0, "count": 1, "min": 3, "max": 3}, "Reset Count": {"sum": 1.0, "count": 1, "min": 1, "max": 1}, "Number of Records Since Last Reset": {"sum": 14604.0, "count": 1, "min": 14604, "max": 14604}, "Number of 

## Inference Using Batch Transform

In [219]:
knn_transformer = knn.transformer(instance_count = 10, 
                                  instance_type = 'ml.m4.xlarge',
                                  strategy="MultiRecord",
                                  assemble_with="Line",
                                  output_path = "s3://sagemaker-us-east-1-215019911230/angel-investor/knn/test/"
                                 )

In [220]:

s3_test_data = "s3://sagemaker-us-east-1-215019911230/angel-investor/train/test_features.csv"
knn_transformer.transform(s3_test_data, content_type='text/csv', split_type="Line")
knn_transformer.wait()

...............................................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[33mDocker entrypoint called with argument(s): serve[0m
[33mRunning default environment configuration script[0m
[33m[08/04/2021 09:24:38 INFO 139725277591360] Memory profiler is not enabled by the environment variable ENABLE_PROFILER.[0m
[34m[08/04/2021 09:24:41 INFO 139778299914048] Memory profiler is not enabled by the environment variable ENABLE_PROFILER.[0m
[35mDocker entrypoint called with argument(s): serve[0m
[35mRunning default environment configuration script[0m
[35m[08/04/2021 09:24:39 INFO 140077679007552] Memory profiler is not enabled by the environment variable ENABLE_PROFILER.[0m
[32mDocker entrypoint called with argument(s): serve[0m
[32mRunning default environment configuration script[0m
[32m[08/04/2021 09:24:43 INFO 139791607211840] Memory profiler is not enabled by the environment variabl

[32m[08/04/2021 09:24:46 INFO 139778299914048] nvidia-smi: took 0.032 seconds to run.[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] nvidia-smi identified 0 GPUs.[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] ...model loaded.[0m
[32m#metrics {"StartTime": 1628069086.82639, "EndTime": 1628069086.8328147, "Dimensions": {"Algorithm": "KNNModel", "Host": "UNKNOWN", "Operation": "scoring"}, "Metrics": {"execution_parameters.count": {"sum": 1.0, "count": 1, "min": 1, "max": 1}}}
[0m
[36m[08/04/2021 09:24:47 INFO 140319186814784] loaded entry point class algorithm.serve.server_config:config_api[0m
[36m[08/04/2021 09:24:47 INFO 140319186814784] loading entry points[0m
[36m[08/04/2021 09:24:47 INFO 140319186814784] loaded request iterator application/json[0m
[36m[08/04/2021 09:24:47 INFO 140319186814784] loaded request iterator application/jsonlines[0m
[36m[08/04/2021 09:24:47 INFO 140319186814784] loaded request iterator application/x-recordio-protobuf[0m
[36m[08/04/2

[36m2021-08-04T09:24:46.838:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34m[08/04/2021 09:24:50 INFO 139862182373184] loaded entry point class algorithm:model[0m
[34m[08/04/2021 09:24:50 INFO 139862182373184] Number of server workers: 1[0m
[34m[2021-08-04 09:24:50 +0000] [1] [INFO] Starting gunicorn 20.0.4[0m
[34m[2021-08-04 09:24:50 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-08-04 09:24:50 +0000] [1] [INFO] Using worker: sync[0m
[34m[2021-08-04 09:24:50 +0000] [58] [INFO] Booting worker with pid: 58[0m
[34m[08/04/2021 09:24:50 INFO 139862182373184] loading model...[0m
[34m[08/04/2021 09:24:50 INFO 139862182373184] nvidia-smi: took 0.032 seconds to run.[0m
[34m[08/04/2021 09:24:50 INFO 139862182373184] nvidia-smi identified 0 GPUs.[0m
[34m[08/04/2021 09:24:50 INFO 139862182373184] ...model loaded.[0m
[34m#metrics {"StartTime": 1628069090.4192462, "EndTime": 1628069090.4250422, "Dimensions

[35m2021-08-04T09:24:50.431:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34m2021-08-04T09:24:51.246:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[32m2021-08-04T09:24:52.017:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[32m2021-08-04T09:24:52.017:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[36m2021-08-04T09:24:49.878:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[35m2021-08-04T09:24:51.716:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[32mDocker entrypoint called with argument(s): serve[0m
[32mDocker entrypoint called with argument(s): serve[0m
[32mRunning default environment configuration script[0m
[32mRunning default environment configuration script[0m
[35mDocker entrypoint called 

[32m2021-08-04T09:24:44.344:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] loaded entry point class algorithm.serve.server_config:config_api[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] loading entry points[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] loaded entry point class algorithm.serve.server_config:config_api[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] loading entry points[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] loaded request iterator application/json[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] loaded request iterator application/jsonlines[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] loaded request iterator application/x-recordio-protobuf[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] loaded request iterator text/csv[0m
[32m[08/04/2021 09:24:46 INFO 139778299914048] loaded response encoder application/json[0m
[32m[08/04/2021 0

[33m2021-08-04T09:24:48.854:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[33m2021-08-04T09:24:48.854:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[36m[08/04/2021 09:24:48 INFO 140319186814784] loaded entry point class algorithm:model[0m
[36m[08/04/2021 09:24:48 INFO 140319186814784] Number of server workers: 1[0m
[36m[2021-08-04 09:24:48 +0000] [1] [INFO] Starting gunicorn 20.0.4[0m
[36m[2021-08-04 09:24:48 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[36m[2021-08-04 09:24:48 +0000] [1] [INFO] Using worker: sync[0m
[36m[2021-08-04 09:24:48 +0000] [57] [INFO] Booting worker with pid: 57[0m
[36m[08/04/2021 09:24:48 INFO 140319186814784] loading model...[0m
[36m[08/04/2021 09:24:48 INFO 140319186814784] nvidia-smi: took 0.033 seconds to run.[0m
[36m[08/04/2021 09:24:48 INFO 140319186814784] nvidia-smi identified 0 GPUs.[0m
[36m[08/04/2021 09:24:48 INFO 14031

[35m[08/04/2021 09:24:51 INFO 139978059433792] loaded entry point class algorithm.serve.server_config:config_api[0m
[35m[08/04/2021 09:24:51 INFO 139978059433792] loading entry points[0m
[35m[08/04/2021 09:24:51 INFO 139978059433792] loaded request iterator application/json[0m
[35m[08/04/2021 09:24:51 INFO 139978059433792] loaded request iterator application/jsonlines[0m
[35m[08/04/2021 09:24:51 INFO 139978059433792] loaded request iterator application/x-recordio-protobuf[0m
[35m[08/04/2021 09:24:51 INFO 139978059433792] loaded request iterator text/csv[0m
[35m[08/04/2021 09:24:51 INFO 139978059433792] loaded response encoder application/json[0m
[35m[08/04/2021 09:24:51 INFO 139978059433792] loaded response encoder application/jsonlines[0m
[35m[08/04/2021 09:24:51 INFO 139978059433792] loaded response encoder application/x-recordio-protobuf[0m
[35m[08/04/2021 09:24:51 INFO 139978059433792] loaded entry point class algorithm:model[0m
[35m[08/04/2021 09:24:51 INFO 13

In [221]:
!aws s3 cp s3://sagemaker-us-east-1-215019911230/angel-investor/knn/test/test_features.csv.out .

Completed 102.7 KiB/102.7 KiB (1.4 MiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-1-215019911230/angel-investor/knn/test/test_features.csv.out to ./test_features.csv.out


In [222]:
!head test_features.csv.out

{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":1.0}
{"predicted_label":0.0}
{"predicted_label":0.0}


In [223]:
! mv test_features.csv.json test_features.csv.txt

mv: cannot stat ‘test_features.csv.json’: No such file or directory


In [224]:
!head test_features.csv.txt

{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}
{"predicted_label":0.0}


In [225]:
predictions = pd.read_csv('test_features.csv.txt', header=None)

predictions

Unnamed: 0,0
0,"{""predicted_label"":0.0}"
1,"{""predicted_label"":0.0}"
2,"{""predicted_label"":0.0}"
3,"{""predicted_label"":0.0}"
4,"{""predicted_label"":0.0}"
...,...
4376,"{""predicted_label"":0.0}"
4377,"{""predicted_label"":0.0}"
4378,"{""predicted_label"":0.0}"
4379,"{""predicted_label"":0.0}"


In [226]:
y_pred = []
for i in range(0, len(predictions.index)):
    temp_y = eval(predictions[0][i]).get('predicted_label')
    y_pred.append(temp_y)
    
len(y_pred)

4381

In [227]:
test_labels = pd.read_csv("data/test_labels.csv", header=None)
test_labels = list(test_labels.iloc[:,0])
len(test_labels)

4381

In [228]:
confusion_matrix(test_labels, y_pred)

array([[3969,    6],
       [ 405,    1]])

## Hyperparameter Tuning

See [this notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/hyperparameter_tuning/xgboost_direct_marketing/hpo_xgboost_direct_marketing_sagemaker_python_sdk.ipynb) for example code on hyper parameter tuning


## Check Auto Pilot Experiment called "angel-investor" for tips on processing and modeling to encode into the process, [see this link for location of this Auto Pilot Experiment](https://d-n9iky4cokb5h.studio.us-east-1.sagemaker.aws/jupyter/default/lab?)