In [33]:
import boto3
from datetime import datetime
import io
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sagemaker
import sagemaker.amazon.common as smac
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import tarfile

In [34]:
!pip install --upgrade pip --quiet
!pip install mxnet --quiet
import mxnet as mx

#### Prepare the Data

In [14]:
df = pd.read_csv('ufo_fullset.csv', 
                 usecols=['shape', 'duration', 'witnesses', 'weather', 'latitude', 
                          'longitude', 'physicalEvidence', 'contact', 'researchOutcome'])
df['shape'] = df['shape'].fillna(df['shape'].value_counts().index[0])
df['physicalEvidence'] = df['physicalEvidence'].replace({'Y': 1, 'N': 0})
df['contact'] = df['contact'].replace({'Y': 1, 'N': 0})
df = pd.get_dummies(df, columns=['weather', 'shape'])
df['researchOutcome'] = df['researchOutcome'].replace({'unexplained': 0, 'explained': 1, 'probable': 2})
df.columns = [c.replace('weather_', '').replace('shape_', '').replace(' ', '_') for c in df.columns]

#### Split the data into train, validate and test sets

In [18]:
np.random.seed(0)
rand_split = np.random.rand(len(df))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

data_train = df[train_list]
data_val = df[val_list]
data_test = df[test_list]

train_X = data_train.drop(columns='researchOutcome').values
train_y = data_train['researchOutcome'].values

val_X = data_val.drop(columns='researchOutcome').values
val_y = data_val['researchOutcome'].values

test_X = data_test.drop(columns='researchOutcome').values
test_y = data_test['researchOutcome'].values

#### Send data to S3 for training

In [20]:
bucket = 'acg-ml-certification-df'
train_file = 'ufo_sightings_train_recordIO_protobuf.data'

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'), train_y.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object('algorithms_lab/linearlearner_train/{}'.format(train_file)).upload_fileobj(f)
training_recordIO_protobuf_location = 's3://{}/algorithms_lab/linearlearner_train/{}'.format(bucket, train_file)
print('The Pipe mode recordIO protobuf training data: {}'.format(training_recordIO_protobuf_location))

The Pipe mode recordIO protobuf training data: s3://acg-ml-certification-df/algorithms_lab/linearlearner_train/ufo_sightings_train_recordIO_protobuf.data


In [21]:
validation_file = 'ufo_sightings_validatioin_recordIO_protobuf.data'

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'), val_y.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object('algorithms_lab/linearlearner_validation/{}'.format(validation_file)).upload_fileobj(f)
validate_recordIO_protobuf_location = 's3://{}/algorithms_lab/linearlearner_validation/{}'.format(bucket, validation_file)
print(f'The Pipe mode recordIO protobuf validation data: {validate_recordIO_protobuf_location}'

The Pipe mode recordIO protobuf validation data: s3://acg-ml-certification-df/algorithms_lab/linearlearner_validation/ufo_sightings_validatioin_recordIO_protobuf.data


## Train the initial linear learner

In [39]:
container = sagemaker.image_uris.retrieve(boto3.Session().region_name, 'linear-learner', "1")
job_name = f'ufo-linear-learner-job-{datetime.now().strftime("%Y%m%d%H%M%S")}'
output_location = f's3://{bucket}/algorithms_lab/linearlearner_output'

role = sagemaker.get_execution_role()
sess = sagemaker.Session()

linear = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.c4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sess,
                                       input_mode='Pipe')

linear.set_hyperparameters(feature_dim=train_X.shape[1],
                           predictor_type='multiclass_classifier',
                           num_classes=3)

linear.fit(inputs={'train': training_recordIO_protobuf_location,
                   'validation': validate_recordIO_protobuf_location}, job_name=job_name)

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2021-05-29 16:36:21 Starting - Starting the training job...
2021-05-29 16:36:48 Starting - Launching requested ML instancesProfilerReport-1622306181: InProgress
.........
2021-05-29 16:38:08 Starting - Preparing the instances for training......
2021-05-29 16:39:20 Downloading - Downloading input data...
2021-05-29 16:39:49 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/29/2021 16:40:07 INFO 139924883953472] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bia

[34m[2021-05-29 16:40:20.862] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 11, "duration": 1296, "num_examples": 15, "num_bytes": 1904760}[0m
[34m#metrics {"StartTime": 1622306420.8628457, "EndTime": 1622306420.8629253, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 4, "model": 0}, "Metrics": {"train_multiclass_cross_entropy_objective": {"sum": 0.19411804853166853, "count": 1, "min": 0.19411804853166853, "max": 0.19411804853166853}}}
[0m
[34m#metrics {"StartTime": 1622306420.863018, "EndTime": 1622306420.8630326, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 4, "model": 1}, "Metrics": {"train_multiclass_cross_entropy_objective": {"sum": 0.21248661368233818, "count": 1, "min": 0.21248661368233818, "max": 0.21248661368233818}}}
[0m
[34m#metrics {"StartTime": 1622306420.8630733, "EndTime": 1622306420.8630843, "Dimensions": {"Algorithm": "Lin

[34m[2021-05-29 16:40:30.411] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 19, "duration": 1276, "num_examples": 15, "num_bytes": 1904760}[0m
[34m#metrics {"StartTime": 1622306430.4114375, "EndTime": 1622306430.4115293, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 8, "model": 0}, "Metrics": {"train_multiclass_cross_entropy_objective": {"sum": 0.1959419206891741, "count": 1, "min": 0.1959419206891741, "max": 0.1959419206891741}}}
[0m
[34m#metrics {"StartTime": 1622306430.4116406, "EndTime": 1622306430.4116633, "Dimensions": {"Algorithm": "Linear Learner", "Host": "algo-1", "Operation": "training", "epoch": 8, "model": 1}, "Metrics": {"train_multiclass_cross_entropy_objective": {"sum": 0.19240924399239676, "count": 1, "min": 0.19240924399239676, "max": 0.19240924399239676}}}
[0m
[34m#metrics {"StartTime": 1622306430.411718, "EndTime": 1622306430.4117367, "Dimensions": {"Algorithm": "Linear


2021-05-29 16:40:51 Uploading - Uploading generated training model
2021-05-29 16:40:51 Completed - Training job completed
Training seconds: 91
Billable seconds: 91


#### Deploy initial model and evaluate

In [40]:
predictor = linear.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
predictor.serializer = sagemaker.serializers.CSVSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

---------------!

In [55]:
def check(x, y_true):
    y_pred = np.array([p['predicted_label'] for p in predictor.predict(x)['predictions']])
    print(f"Accuracy:  {accuracy_score(y_true, y_pred):.4f}")
    print(f"Recall:    {recall_score(y_true, y_pred, average='macro'):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='macro'):.4f}")
    print(f"f1:        {f1_score(y_true, y_pred, average='macro'):.4f}")

In [58]:
print('** Test Results **')
check(test_X, test_y)

** Test Results **
Accuracy:  0.9499
Recall:    0.9365
Precision: 0.9127
f1:        0.9241


In [59]:
print('** Validation Results **')
check(val_X, val_y)

** Validation Results **
Accuracy:  0.9397
Recall:    0.9271
Precision: 0.8962
f1:        0.9108


In [60]:
print('** Train Results **')
check(train_X, train_y)

** Train Results **
Accuracy:  0.9459
Recall:    0.9296
Precision: 0.9009
f1:        0.9144


In [61]:
predictor.delete_endpoint()

## Create a Hyperparameter Tuning Job