# Sagemaker AutoML

In [40]:
import os
import pandas as pd
import boto3
import io
import datetime
from os.path import join
from time import sleep

import xeek
import xeek.features as features

## Data upload

For this process, we must:

* Pull in our data
* Format it to a specific (relatively standard) format
* Upload to S3

In [2]:
df_train = pd.read_csv(xeek.raw_train_filepath, sep=";")
df_test = pd.read_csv(xeek.raw_test_filepath, sep=";")

In [3]:
df_train

Unnamed: 0,WELL,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,GROUP,FORMATION,CALI,RSHA,RMED,...,ROP,DTS,DCAL,DRHO,MUDWEIGHT,RMIC,ROPA,RXO,FORCE_2020_LITHOFACIES_LITHOLOGY,FORCE_2020_LITHOFACIES_CONFIDENCE
0,15/9-13,494.5280,437641.96875,6470972.5,-469.501831,NORDLAND GP.,,19.480835,,1.611410,...,34.636410,,,-0.574928,,,,,65000,1.0
1,15/9-13,494.6800,437641.96875,6470972.5,-469.653809,NORDLAND GP.,,19.468800,,1.618070,...,34.636410,,,-0.570188,,,,,65000,1.0
2,15/9-13,494.8320,437641.96875,6470972.5,-469.805786,NORDLAND GP.,,19.468800,,1.626459,...,34.779556,,,-0.574245,,,,,65000,1.0
3,15/9-13,494.9840,437641.96875,6470972.5,-469.957794,NORDLAND GP.,,19.459282,,1.621594,...,39.965164,,,-0.586315,,,,,65000,1.0
4,15/9-13,495.1360,437641.96875,6470972.5,-470.109772,NORDLAND GP.,,19.453100,,1.602679,...,57.483765,,,-0.597914,,,,,65000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170506,7/1-2 S,3169.3124,,,,VESTLAND GP.,Bryne Fm.,8.423170,,,...,27.674368,,,-0.001763,,,26.673708,,30000,2.0
1170507,7/1-2 S,3169.4644,,,,VESTLAND GP.,Bryne Fm.,8.379244,,,...,28.024338,,,-0.007600,,,26.840818,,65030,2.0
1170508,7/1-2 S,3169.6164,,,,VESTLAND GP.,Bryne Fm.,8.350248,,,...,28.091282,,,-0.018297,,,27.007942,,65030,2.0
1170509,7/1-2 S,3169.7684,,,,VESTLAND GP.,Bryne Fm.,8.313779,,,...,28.019775,,,-0.011438,,,27.175179,,65030,2.0


In [4]:
(df_train
 .drop(["FORCE_2020_LITHOFACIES_CONFIDENCE"], axis=1)
 .to_csv(
     join(xeek.processed_data_dir, "train_dataset_formatted.csv"), 
     index=False, header=True))

And now we upload:

In [5]:
s3 = boto3.resource("s3")
bucket_name = os.environ["S3_BUCKET"]
bucket = s3.Bucket(bucket_name)
filename = "facies_train.csv"

target_key = "automl/input/{}".format(filename)
bucket.upload_file(join(xeek.processed_data_dir, "train_dataset_formatted.csv"), target_key)

## AutoML Job

### Config

We define our config. Most importantly we have to ensure this is a classification job.

In [12]:
input_data_config = [{
    'DataSource': {
        'S3DataSource': {
            'S3DataType': 'S3Prefix',
            'S3Uri': 's3://{}/{}'.format(bucket_name, target_key)
        }
    },
    'TargetAttributeName': 'FORCE_2020_LITHOFACIES_LITHOLOGY'
}]

auto_ml_objective = {
    'MetricName': 'F1macro'
}

output_data_config = {
    'S3OutputPath': 's3://{}/automl/output/'.format(bucket_name)
}

problem_type = "MulticlassClassification"

### Create our job

In [13]:
now = datetime.datetime.now()
auto_ml_job_name = 'automl-job-{}'.format(int(datetime.datetime.now().timestamp()))
print(auto_ml_job_name)

automl-job-1602734264


In [16]:
sm = boto3.client("sagemaker",
                 region_name=os.environ['S3_REGION'])
sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      ProblemType=problem_type,
                      AutoMLJobObjective=auto_ml_objective,
                      RoleArn=os.environ['SM_ROLE'])

{'AutoMLJobArn': 'arn:aws:sagemaker:ap-southeast-2:949012111517:automl-job/automl-job-1602734264',
 'ResponseMetadata': {'RequestId': 'a961321b-271a-4c4e-8653-92b97b734177',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a961321b-271a-4c4e-8653-92b97b734177',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '97',
   'date': 'Thu, 15 Oct 2020 03:57:52 GMT'},
  'RetryAttempts': 0}}

### Describe and track our job

Learned experience: set a time limit for this process.

In [44]:
while True:
    description = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    print("{} -- {}".format(description['AutoMLJobStatus'], description['AutoMLJobSecondaryStatus']))
    sleep(30)

Failed -- Failed


KeyboardInterrupt: 

### Evaluate our job

We pull data from our AutoML job and evaluate.

In [45]:
candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                                                SortBy='FinalObjectiveMetricValue'
                                               )['Candidates']
for candidate in candidates:
    print("{} -- {}".format(
        candidate['FinalAutoMLJobObjectiveMetric']['Value'],
        candidate['CandidateName']))

0.8920999765396118 -- tuning-job-1-e3eacd63e426404abe-034-ee63f814
0.8858399987220764 -- tuning-job-1-e3eacd63e426404abe-049-43ed7e22
0.8845900297164917 -- tuning-job-1-e3eacd63e426404abe-069-ec7546ab
0.8841500282287598 -- tuning-job-1-e3eacd63e426404abe-061-9b1cfa28
0.8778899908065796 -- tuning-job-1-e3eacd63e426404abe-029-4e16103c
0.8743399977684021 -- tuning-job-1-e3eacd63e426404abe-055-5e23c003
0.8573499917984009 -- tuning-job-1-e3eacd63e426404abe-043-0cf3ce8b
0.8563500046730042 -- tuning-job-1-e3eacd63e426404abe-057-79b941ba
0.850350022315979 -- tuning-job-1-e3eacd63e426404abe-045-b06f1d3e
0.8365600109100342 -- tuning-job-1-e3eacd63e426404abe-016-7ea96177


### Deploy our best job

In [47]:
model_name = "facies-model-{}".format(int(now.timestamp()))
endpoint_config_name = "facies-endpoint-config-{}".format(int(now.timestamp()))
endpoint_name = "facies-endpoint-{}".format(int(now.timestamp()))
variant_name = "facies-varient-{}".format(int(now.timestamp()))

In [49]:
model_arn = sm.create_model(Containers=candidates[0]['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=os.environ['SM_ROLE'])

In [50]:
ep_config = sm.create_endpoint_config(EndpointConfigName = endpoint_config_name,
                                      ProductionVariants = [{'InstanceType': 'ml.m5.xlarge',
                                                             'InitialInstanceCount': 2,
                                                             'ModelName': model_name,
                                                             'VariantName': variant_name}])

In [51]:
create_endpoint_response = sm.create_endpoint(EndpointName=endpoint_name,
                                              EndpointConfigName=endpoint_config_name)

In [52]:
while True:
    print(sm.describe_endpoint(EndpointName=endpoint_name)['EndpointStatus'])
    sleep(30)

Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
InService


KeyboardInterrupt: 

### Predict the test dataset

In [54]:
smr = boto3.client("sagemaker-runtime",
                  region_name=os.environ["S3_REGION"])
sm.describe_endpoint(EndpointName=endpoint_name)

{'EndpointName': 'facies-endpoint-1602734264',
 'EndpointArn': 'arn:aws:sagemaker:ap-southeast-2:949012111517:endpoint/facies-endpoint-1602734264',
 'EndpointConfigName': 'facies-endpoint-config-1602734264',
 'ProductionVariants': [{'VariantName': 'facies-varient-1602734264',
   'DeployedImages': [{'SpecifiedImage': '783357654285.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-sklearn-automl:0.2-1-cpu-py3',
     'ResolvedImage': '783357654285.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-sklearn-automl@sha256:37b0215b75ec2de3f8974cbbdf9cbd7bc6f6cb0bb0f4e36437e4c4ef4dfc12b9',
     'ResolutionTime': datetime.datetime(2020, 10, 15, 6, 55, 21, 438000, tzinfo=tzlocal())},
    {'SpecifiedImage': '783357654285.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3',
     'ResolvedImage': '783357654285.dkr.ecr.ap-southeast-2.amazonaws.com/sagemaker-xgboost@sha256:b0f7d76963f88f9890bfa3288e256d4cc14308ace32a036f77b977d8cf4319a5',
     'ResolutionTime': datetime.datetime(2020, 10, 

We must define our input data.

The test data is too large to submit in a single push, so we break it down into smaller chunks.

In [65]:
len(df_test)

136786

In [82]:
chunk = 10000

In [87]:
responses = []
for i in range(0, len(df_test) // chunk + 1):  # workaround for ceiling
    print(i)
    csv_file = io.StringIO()
    # by default sagemaker expects comma seperated
    (df_test
     .iloc[i*chunk:(i+1)*chunk]
     .to_csv(csv_file, sep=",", header=False, index=False))
    csv_payload = csv_file.getvalue()
    responses.append(smr.invoke_endpoint(EndpointName=endpoint_name, ContentType='text/csv', Body=csv_payload))

0
1
2
3
4
5
6
7
8
9
10
11
12
13


In [88]:
results = ""
for response in responses:
    results += response['Body'].read().decode("utf-8")

In [99]:
split_outputs = results.split("\n")[:-1]
assert len(split_outputs) == len(df_test) 

(What's the format of our response?)

In [100]:
results[0:20]

'65000\n65000\n65000\n65'

In [103]:
with open(join(xeek.processed_data_dir, "automl_predictions.csv"), 'w') as f:
    f.write("lithology\n" + results)