In [41]:
%%sh
#pip -q install --upgrade pip
#pip -q install sagemaker awscli boto3 --upgrade
pip install deprecation



You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.


In [42]:
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# Direct Marketing with Amazon SageMaker AutoPilot

Last update: February 6th, 2019

In [43]:
import sagemaker
import smdebug_rulesconfig as rule_configs
import boto3
import os, sys
import s3fs
import numpy as np
import pandas

print (sagemaker.__version__)

sess   = sagemaker.Session()
bucket = sess.default_bucket()
prefix = 'AIH'
region = boto3.Session().region_name

client = boto3.client('s3')
path = 's3://sagemaker-studio-us-east-1-564342467420/AIH/AIH.csv'

2.16.3


In [44]:
# Carrega os dados
dataframediario = pandas.read_csv(path, usecols = [0, 1], engine = 'python')
dataframediario['Data'] = pandas.to_datetime(dataframediario.Data, format = '%d/%m/%Y')
dataframediario = dataframediario.set_index('Data')
# Reamostrando os dados para frequência semanal - Usando o valor médio do número de óbitos de pacientes – AIH
dataframesemanal = dataframediario['QT_INTER'].resample('W').sum()
data = dataframesemanal.to_frame()
data.drop(data.head(1).index,inplace=True) # drop first 1 rows
data.drop(data.tail(1).index,inplace=True) # drop last 1 rows
data.reset_index(level =['Data'], inplace = True) 

data[:10]

Unnamed: 0,Data,QT_INTER
0,2010-01-10,697
1,2010-01-17,678
2,2010-01-24,720
3,2010-01-31,684
4,2010-02-07,700
5,2010-02-14,735
6,2010-02-21,654
7,2010-02-28,689
8,2010-03-07,650
9,2010-03-14,666


In [45]:
data.shape # (number of lines, number of columns)

(556, 2)

## Splitting the dataset

We split the dataset into training (95%) and test (5%) datasets. We will use the training dataset for AutoML, where it will be automatically split again for training and validation.
 
Once the model has been deployed, we'll use the test dataset to evaluate its performance.

In [46]:
# Set the seed to 123 for reproductibility
# https://pandas.pydata.org/pandas-docs/version/0.25/generated/pandas.DataFrame.sample.html
# https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.split.html
train_data = data.iloc[0:500]
test_data = data.iloc[500:]
print(len(train_data), len(test_data))
#train_data, test_data, _ = np.split(data.sample(frac=1, random_state=123), 
#                                                  [int(0.95 * len(data)), int(len(data))])  

# Save to CSV files
train_data.to_csv('automl-train.csv', index=False, header=True, sep=',') # Need to keep column names
test_data.to_csv('automl-test.csv', index=False, header=True, sep=',')

500 56


In [47]:
!ls -l automl*.csv

-rw-r--r-- 1 root root  868 Dec 21 19:02 automl-test.csv
-rw-r--r-- 1 root root 7533 Dec 21 19:02 automl-train.csv


**No preprocessing needed!** AutoML will take care of this, so let's just copy the training set to S3.

In [48]:
s3_input_data = sess.upload_data(path="automl-train.csv", key_prefix=prefix + "/input")
print(s3_input_data)

s3://sagemaker-us-east-1-564342467420/AIH/input/automl-train.csv


## Setting up the SageMaker AutoPilot job

After uploading the dataset to S3, we can invoke SageMaker AutoPilot to find the best ML pipeline to train a model on this dataset. 

The required inputs for invoking a SageMaker AutoML job are the dataset location in S3, the name of the column of the dataset you want to predict (`y` in this case) and an IAM role.

In [49]:
from sagemaker.automl.automl import AutoML
# https://sagemaker.readthedocs.io/en/stable/automl.html

role = sagemaker.get_execution_role()
problem_type = 'Regression'
job_objective = 'MetricName="MSE"'

auto_ml_job = AutoML(
    role = role,                                              # IAM permissions for SageMaker
    sagemaker_session = sess,                                 # 
    target_attribute_name = 'QT_INTER',                              # The column we want to predict
    problem_type = 'Regression',
    job_objective = {"MetricName":"MSE"},
    output_path = 's3://{}/{}/output'.format(bucket,prefix),  # Save artefacts here
    max_candidates = 100,                                     # Default is 500
    max_runtime_per_training_job_in_seconds = 600, 
    total_job_runtime_in_seconds = 3600
)

## Launching the SageMaker AutoPilot job

We can now launch the job by calling the `fit()` API.

In [50]:
auto_ml_job.fit(inputs=s3_input_data, logs=False, wait=False)

In [51]:
auto_ml_job.describe_auto_ml_job()

{'AutoMLJobName': 'automl-2020-12-21-19-02-41-599',
 'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:564342467420:automl-job/automl-2020-12-21-19-02-41-599',
 'InputDataConfig': [{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://sagemaker-us-east-1-564342467420/AIH/input/automl-train.csv'}},
   'TargetAttributeName': 'QT_INTER'}],
 'OutputDataConfig': {'S3OutputPath': 's3://sagemaker-us-east-1-564342467420/AIH/output'},
 'RoleArn': 'arn:aws:iam::564342467420:role/service-role/AmazonSageMaker-ExecutionRole-20201028T154682',
 'AutoMLJobObjective': {'MetricName': 'MSE'},
 'ProblemType': 'Regression',
 'AutoMLJobConfig': {'CompletionCriteria': {'MaxCandidates': 100,
   'MaxRuntimePerTrainingJobInSeconds': 600,
   'MaxAutoMLJobRuntimeInSeconds': 3600},
  'SecurityConfig': {'EnableInterContainerTrafficEncryption': False}},
 'CreationTime': datetime.datetime(2020, 12, 21, 19, 2, 41, 747000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2020, 12, 21, 19, 2

### Tracking the progress of the AutoPilot job
SageMaker AutoPilot job consists of four high-level steps : 
* Data Preprocessing, where the dataset is split into train and validation sets.
* Recommending Pipelines, where the dataset is analyzed and SageMaker AutoPilot comes up with a list of ML pipelines that should be tried out on the dataset.
* Automatic Feature Engineering, where SageMaker AutoPilot performs feature transformation on individual features of the dataset as well as at an aggregate level.
* ML pipeline selection and hyperparameter tuning, where the top performing pipeline is selected along with the optimal hyperparameters for the training algorithm (the last stage of the pipeline). 

In [52]:
from time import sleep

job = auto_ml_job.describe_auto_ml_job()
job_status = job['AutoMLJobStatus']
job_sec_status = job['AutoMLJobSecondaryStatus']

if job_status not in ('Stopped', 'Failed'):
    while job_status in ('InProgress') and job_sec_status in ('AnalyzingData'):
        sleep(30)
        job = auto_ml_job.describe_auto_ml_job()
        job_status = job['AutoMLJobStatus']
        job_sec_status = job['AutoMLJobSecondaryStatus']
        print (job_status, job_sec_status)
    print("Data analysis complete")

InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress FeatureEngineering
Data analysis complete


## Viewing notebooks generated by SageMaker AutoPilot
Once data analysis is complete, SageMaker AutoPilot generates two notebooks: 
* Data exploration,
* Candidate definition.

In [53]:
job = auto_ml_job.describe_auto_ml_job()
job_candidate_notebook = job['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation']
job_data_notebook = job['AutoMLJobArtifacts']['DataExplorationNotebookLocation']

print(job_candidate_notebook)
print(job_data_notebook)

s3://sagemaker-us-east-1-564342467420/AIH/output/automl-2020-12-21-19-02-41-599/sagemaker-automl-candidates/pr-1-b88fb5fbcc45499ba60172129addb670201ed23fbbfa42f1bbee1217cc/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb
s3://sagemaker-us-east-1-564342467420/AIH/output/automl-2020-12-21-19-02-41-599/sagemaker-automl-candidates/pr-1-b88fb5fbcc45499ba60172129addb670201ed23fbbfa42f1bbee1217cc/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb


Let's copy these two notebooks.

In [54]:
%%sh -s $job_candidate_notebook $job_data_notebook
aws s3 cp $1 .
aws s3 cp $2 .

download: s3://sagemaker-us-east-1-564342467420/AIH/output/automl-2020-12-21-19-02-41-599/sagemaker-automl-candidates/pr-1-b88fb5fbcc45499ba60172129addb670201ed23fbbfa42f1bbee1217cc/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb to ./SageMakerAutopilotCandidateDefinitionNotebook.ipynb
download: s3://sagemaker-us-east-1-564342467420/AIH/output/automl-2020-12-21-19-02-41-599/sagemaker-automl-candidates/pr-1-b88fb5fbcc45499ba60172129addb670201ed23fbbfa42f1bbee1217cc/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb to ./SageMakerAutopilotDataExplorationNotebook.ipynb


Go back to the folder view, and open these notebooks. Lots of useful information in there!

SageMaker AutoPilot then launches feature engineering, and prepares different training and validation datasets.

In [55]:
job = auto_ml_job.describe_auto_ml_job()
job_status = job['AutoMLJobStatus']
job_sec_status = job['AutoMLJobSecondaryStatus']

if job_status not in ('Stopped', 'Failed'):
    while job_status in ('InProgress') and job_sec_status in ('FeatureEngineering'):
        sleep(30)
        job = auto_ml_job.describe_auto_ml_job()
        job_status = job['AutoMLJobStatus']
        job_sec_status = job['AutoMLJobSecondaryStatus']
        print (job_status, job_sec_status)
    print("Feature engineering complete")

InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress FeatureEngineering
InProgress ModelTuning
Feature engineering complete


Once feature engineering is complete, SageMaker AutoPilot launches Automatic Model Tuning on the different candidates. While model tuning is running, we can explore its progress with SageMaker Experiments.

In [57]:
import pandas as pd
from sagemaker.analytics import ExperimentAnalytics, TrainingJobAnalytics

exp = ExperimentAnalytics(
    sagemaker_session=sess, 
#    experiment_name=job['AutoMLJobName'] + '-aws-auto-ml-job',
    experiment_name=job['AutoMLJobName'] + '-aws-auto-ml-job2',

)

df = exp.dataframe()
print("Number of jobs: ", len(df))

# Move metric to first column
df = pd.concat([df['ObjectiveMetric - Max'], df.drop(['ObjectiveMetric - Max'], axis=1)], axis=1)
# Show top 5 jobs
df.sort_values('ObjectiveMetric - Max', ascending=0)[:5]

Number of jobs:  0


KeyError: 'ObjectiveMetric - Max'

In [22]:
job = auto_ml_job.describe_auto_ml_job()
job_status = job['AutoMLJobStatus']
job_sec_status = job['AutoMLJobSecondaryStatus']

if job_status not in ('Stopped', 'Failed'):
    while job_status in ('InProgress') and job_sec_status in ('ModelTuning'):
        sleep(30)
        job = auto_ml_job.describe_auto_ml_job()
        job_status = job['AutoMLJobStatus']
        job_sec_status = job['AutoMLJobSecondaryStatus']
        print (job_status, job_sec_status)
    print("Model tuning complete")

Model tuning complete


## Deploying the best candidate
Now that we have successfully completed the AutoML job on our dataset and visualized the trials, we can create a model from any of the trials with a single API call and then deploy that model for online or batch prediction using [Inference Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipelines.html). For this notebook, we deploy only the best performing trial for inference.

The best candidate is the one we're really interested in.

In [23]:
from time import strftime, gmtime
timestamp = strftime('%d-%H-%M-%S', gmtime())

endpoint_name = job['AutoMLJobName']+'-'+timestamp

In [24]:
auto_ml_job.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    endpoint_name = endpoint_name
)

-----------------!

## Scoring the best candidate

Let's predict and score the validation set. We'll compute metrics ourselves just for fun.

In [37]:
from sagemaker.predictor import csv_serializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV
#import deprecation

predictor = RealTimePredictor(
    endpoint=endpoint_name, 
    sagemaker_session=sess, 
    serializer=csv_serializer,
    content_type=CONTENT_TYPE_CSV, 
    accept='text/csv'
)

ModuleNotFoundError: No module named 'deprecations'

In [26]:
tp = tn = fp = fn = count = 0

with open('automl-test.csv') as f:
    lines = f.readlines()
    for l in lines[1:]:   # Skip header
        l = l.split(',')  # Split CSV line into feature array
        label = l[-1]     # Store 'yes'/'no' label
        l = l[:-1]        # Remove label
        l = ','.join(l)   # Rebuild CSV line without label
                
        response = predictor.predict(l)
        response = response.decode("utf-8")
        #print ("label %s response %s" %(label,response))

        if 'yes' in label:
            # Sample is positive
            if 'yes' in response:
                # True positive
                tp=tp+1
            else:
                # False negative
                fn=fn+1
        else:
            # Sample is negative
            if 'no' in response:
                # True negative
                tn=tn+1
            else:
                # False positive
                fp=fp+1
        count = count+1
        if (count % 100 == 0):   
            sys.stdout.write(str(count)+' ')
            
print ("Done")

NameError: name 'predictor' is not defined

In [27]:
#Confusion matrix
print ("%d %d" % (tn, fp))
print ("%d %d" % (fn, tp))

accuracy  = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall    = tn/(tp+fn)
f1        = (2*precision*recall)/(precision+recall)

print ("%.4f %.4f %.4f %.4f" % (accuracy, precision, recall, f1))

0 0
0 0


ZeroDivisionError: division by zero

## Deleting the endpoint
Once that we're done predicting, we can delete the endpoint (and stop paying for it).

In [None]:
# Uncomment to delete
# sess.delete_endpoint(predictor.endpoint)

The SageMaker AutoML job creates many underlying artifacts such as dataset splits, preprocessing scripts, preprocessed data, etc. Let's delete them.

In [None]:
import boto3

job_outputs_prefix = '{}/output/{}'.format(prefix, job['AutoMLJobName'])
print(job_outputs_prefix)

s3_bucket =boto3.resource('s3').Bucket(bucket)
# Uncomment to delete
# s3_bucket.objects.filter(Prefix=job_outputs_prefix).delete()