In [None]:
!pip install --disable-pip-version-check -q sagemaker==2.35.0

In [None]:
import json
import time

import boto3
from   IPython.core.display import display, HTML
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from   pprint import pprint
import sagemaker


%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# SageMaker session, its default S3 bucket, the execution role and the AWS
# region; later cells reuse these four names.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# 1. Review transformed dataset

In [None]:
# Copy the balanced reviews CSV from the course S3 bucket into the current
# working directory.
!aws s3 cp 's3://dlai-practical-data-science/data/balanced/womens_clothing_ecommerce_reviews_balanced.csv' ./

In [None]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
path = './womens_clothing_ecommerce_reviews_balanced.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head()

In [None]:
# Write only the label ('sentiment') and the review text to the file that is
# uploaded for Autopilot; the DataFrame index is not written.
path_autopilot = (
    './womens_clothing_ecommerce_reviews_balanced'
    '_for_autopilot.csv')
df.loc[:, ['sentiment', 'review_body']].to_csv(path_autopilot, index=False)

# 2. Configure the Autopilot job

In [None]:
# Upload the Autopilot CSV under the 'autopilot/data' key prefix of the bucket
# and display the value returned by the upload.
autopilot_train_s3_uri = sess.upload_data(
    path=path_autopilot,
    bucket=bucket,
    key_prefix='autopilot/data',
)
autopilot_train_s3_uri

In [None]:
# List the uploaded object to confirm it is present in S3.
!aws s3 ls $autopilot_train_s3_uri

In [None]:
# S3 prefix that is passed to Autopilot as its output path.
model_output_s3_uri = 's3://{}/autopilot'.format(bucket)
print(model_output_s3_uri)

In [None]:
# Suffix the job name with the current Unix time in whole seconds.
timestamp = int(time.time())
auto_ml_job_name = 'automl-dm-' + str(timestamp)

In [None]:
# Value passed as `max_candidates` to the AutoML configuration below.
max_candidates = 3
# Configure the Autopilot job: 'sentiment' is the target column and the
# output goes under `model_output_s3_uri`. The two runtime limits are given
# in seconds, as their parameter names state.
automl = sagemaker.automl.automl.AutoML(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    target_attribute_name='sentiment', # Replace None
    base_job_name=auto_ml_job_name, # Replace None
    output_path=model_output_s3_uri, # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    max_candidates=max_candidates,
    sagemaker_session=sess,
    role=role,
    max_runtime_per_training_job_in_seconds=1200,
    total_job_runtime_in_seconds=7200)

# 3. Launch the Autopilot job

In [None]:
# Launch the Autopilot job on the uploaded S3 data. `wait=False` and
# `logs=False` are passed here; the following cells poll the job status
# explicitly.
automl.fit(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    autopilot_train_s3_uri, #path_autopilot, # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    job_name=auto_ml_job_name, 
    wait=False, 
    logs=False)

# 4. Track Autopilot job progress

In [None]:
# Fetch the current description of the Autopilot job by name.
job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)

In [None]:
# Poll until the job description contains both status fields. The next polling
# cell reads both keys, so the loop keeps going while either one is missing.
while not all(
        key in job_description_response
        for key in ('AutoMLJobStatus', 'AutoMLJobSecondaryStatus')):
    job_description_response = automl.describe_auto_ml_job(
        job_name=auto_ml_job_name)
    print('[INFO] Autopilot job has not yet started. Please wait. ')
    # `default=str` stringifies values that json cannot serialize natively.
    print(
        json.dumps(job_description_response,
                   indent=4,
                   sort_keys=True,
                   default=str))
    print('[INFO] Waiting for Autopilot job to start...')
    # Only the `time` module is imported, so a bare `sleep` would raise
    # NameError.
    time.sleep(15)

print('[OK] AutoML job started.')

In [None]:
# Render a link to the SageMaker console's processing jobs page for `region`.
display(HTML(
    '<b>Review <a target="blank" href="https://console.aws.amazon.com/'
    'sagemaker/home?region={}#/processing-jobs/">'
    'processing jobs</a></b>'.format(region)))

In [None]:
%%time

# Primary and secondary status from the most recent job description.
job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']

if job_status not in ('Stopped', 'Failed'):
    # Compare with `==`: `('InProgress')` is just a string, so `in` would be
    # a substring test rather than tuple membership.
    while (job_status == 'InProgress'
           and job_sec_status in ('Starting', 'AnalyzingData')):
        job_description_response = automl.describe_auto_ml_job(
            job_name=auto_ml_job_name)
        job_status = job_description_response['AutoMLJobStatus']
        job_sec_status = job_description_response[
            'AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(15)
    print('[OK] Data analysis phase completed.\n')

# Dump the last description; `default=str` stringifies non-JSON values.
print(
    json.dumps(
        job_description_response, indent=4, sort_keys=True, default=str))

In [None]:
### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
# Get the latest description of the running Autopilot job.
job_description_response = automl.describe_auto_ml_job(
    job_name=auto_ml_job_name) # Replace None

# Stay in the loop until the description contains 'AutoMLJobArtifacts'.
# Replace all None
while 'AutoMLJobArtifacts' not in job_description_response:
    # Refresh the description of the running Autopilot job.
    job_description_response = automl.describe_auto_ml_job(
        job_name=auto_ml_job_name) # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    # The first literal was missing its closing quote, which made the whole
    # cell a SyntaxError.
    print('[INFO] Autopilot job has not yet generated the artifacts. '
          'Please wait. ')
    print(
        json.dumps(job_description_response,
                   indent=4,
                   sort_keys=True,
                   default=str))
    print('[INFO] Waiting for AutoMLJobArtifacts...')
    time.sleep(15)

print('[OK] AutoMLJobArtifacts generated.')

In [None]:
### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
# Refresh the description of the running Autopilot job.
job_description_response = automl.describe_auto_ml_job(
    job_name=auto_ml_job_name) # Replace None

# Poll until the artifacts entry lists the data exploration notebook.
# Replace all None
while ('DataExplorationNotebookLocation'
       not in job_description_response['AutoMLJobArtifacts']):
    # Re-query the job on every pass.
    job_description_response = automl.describe_auto_ml_job(
        job_name=auto_ml_job_name) # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    print('[INFO] Autopilot job has not yet generated the notebooks. '
          'Please wait. ')
    print(json.dumps(
        job_description_response, indent=4, sort_keys=True, default=str))
    print('[INFO] Waiting for DataExplorationNotebookLocation...')
    time.sleep(15)

print('[OK] DataExplorationNotebookLocation found.')

In [None]:
# Location of the data exploration notebook as reported by the job.
generated_resources = job_description_response[
    'AutoMLJobArtifacts']['DataExplorationNotebookLocation']
# Drop the notebook suffix; the last path component of what remains is used
# as the job id in the link below.
download_path = generated_resources.partition(
    '/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb')[0]
job_id = download_path.rpartition('/')[2]

if job_id:
    display(HTML(
        '<b>Review <a target="blank" href="'
        f'https://s3.console.aws.amazon.com/s3/buckets/{bucket}/autopilot/'
        f'{auto_ml_job_name}/sagemaker-automl-candidates/{job_id}/">'
        'generated notebooks</a> in S3 bucket</b>'))
else:
    print('No AutoMLJobArtifacts found.')

# 5. Feature engineering

In [None]:
%%time

# Refresh the job description and report the current statuses.
job_description_response = automl.describe_auto_ml_job(
    job_name=auto_ml_job_name)
job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
print(job_status)
print(job_sec_status)

if job_status not in ('Stopped', 'Failed'):
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    # Replace all None
    # Compare with `==`: a parenthesized string is not a tuple, so `in`
    # would be a substring test.
    while (job_status == 'InProgress'
           and job_sec_status == 'FeatureEngineering'):
    ### END SOLUTION - DO NOT delete this comment for grading purposes
        job_description_response = automl.describe_auto_ml_job(
            job_name=auto_ml_job_name)
        job_status = job_description_response['AutoMLJobStatus']
        job_sec_status = job_description_response[
            'AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(5)
    print('[OK] Feature engineering phase completed.\n')

# Dump the last description; `default=str` stringifies non-JSON values.
print(
    json.dumps(
        job_description_response, indent=4, sort_keys=True, default=str))

# 6. Model training and tuning

In [None]:
# Render a link to the SageMaker console's hyper-parameter tuning jobs page
# for `region`.
display(HTML(
    '<b>Review <a target="blank" href="https://console.aws.amazon.com/'
    'sagemaker/home?region={}#/hyper-tuning-jobs/">'
    'hyper-parameter tuning jobs</a></b>'.format(region)))

In [None]:
%%time

# Refresh the job description and report the current statuses.
job_description_response = automl.describe_auto_ml_job(
    job_name=auto_ml_job_name)
job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
print(job_status)
print(job_sec_status)
if job_status not in ('Stopped', 'Failed'):
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    # Replace all None
    # Compare with `==`: a parenthesized string is not a tuple, so `in`
    # would be a substring test.
    while (job_status == 'InProgress'
           and job_sec_status == 'ModelTuning'):
    ### END SOLUTION - DO NOT delete this comment for grading purposes
        job_description_response = automl.describe_auto_ml_job(
            job_name=auto_ml_job_name)
        job_status = job_description_response['AutoMLJobStatus']
        job_sec_status = job_description_response[
            'AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(5)
    print('[OK] Model tuning phase completed.\n')

# Dump the last description; `default=str` stringifies non-JSON values.
print(
    json.dumps(
        job_description_response, indent=4, sort_keys=True, default=str))

In [None]:
%%time



# Refresh the job description and report the current statuses.
job_description_response = automl.describe_auto_ml_job(
    job_name=auto_ml_job_name)
pprint(job_description_response)
job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
print('Job status:  {}'.format(job_status))
print('Secondary job status:  {}'.format(job_sec_status))

# Poll until the job reaches a terminal status. Waiting only for 'Completed'
# would spin forever if the job fails or is stopped while polling, and
# `not in ('Completed')` was a substring test on a plain string.
while job_status not in ('Completed', 'Failed', 'Stopped'):
    job_description_response = automl.describe_auto_ml_job(
        job_name=auto_ml_job_name)
    job_status = job_description_response['AutoMLJobStatus']
    job_sec_status = job_description_response[
        'AutoMLJobSecondaryStatus']
    print(f'Job status: {job_status}')
    print(f'Secondary job status: {job_sec_status}')
    time.sleep(10)

if job_status == 'Completed':
    print('[OK] Autopilot job completed.\n')
else:
    # The job ended without completing; show where it stopped.
    print(f'Job status: {job_status}')
    print(f'Secondary job status: {job_sec_status}')

In [None]:
# List the candidates generated by the job, sorted by
# 'FinalObjectiveMetricValue'.
candidates = automl.list_candidates(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    job_name=auto_ml_job_name, # Replace None
    sort_by='FinalObjectiveMetricValue') # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes

In [None]:
# Poll until at least one candidate is listed. The same `sort_by` as the
# initial listing is used so a re-polled list keeps that ordering.
while candidates == []:
    candidates = automl.list_candidates(
        job_name=auto_ml_job_name,
        sort_by='FinalObjectiveMetricValue')
    print(
        '[INFO] Autopilot job is generating the candidates. Please wait.')
    time.sleep(10)

print('[OK] Candidates generated.')

In [None]:
# Show which fields the first listed candidate currently has.
print(candidates[0].keys())

In [None]:
# Poll until the first candidate carries a 'CandidateName' field.
while 'CandidateName' not in candidates[0]:
    candidates = automl.list_candidates(job_name=auto_ml_job_name)
    print(
        '[INFO] Autopilot job is generating CandidateName. Please wait.')
    # Only the `time` module is imported, so a bare `sleep` would raise
    # NameError.
    time.sleep(10)

print('[OK] CandidateName generated.')

In [None]:
# Poll until the first candidate carries its final objective metric.
while 'FinalAutoMLJobObjectiveMetric' not in candidates[0]:
    candidates = automl.list_candidates(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job is generating '
          'FinalAutoMLJobObjectiveMetric. Please wait.')
    # Only the `time` module is imported, so a bare `sleep` would raise
    # NameError.
    time.sleep(10)

print('[OK] FinalAutoMLJobObjectiveMetric generated.')

In [None]:
# Pretty-print all listed candidates; `default=str` stringifies values that
# json cannot serialize natively.
print(json.dumps(candidates, indent=4, sort_keys=True, default=str))

In [None]:
# Print the objective metric's name once, then one line per candidate with
# its position, name and metric value.
print('metric {}'.format(
    candidates[0]['FinalAutoMLJobObjectiveMetric']['MetricName']))

for index, candidate in enumerate(candidates):
    print('{}  {}  {}'.format(
        index,
        candidate['CandidateName'],
        candidate['FinalAutoMLJobObjectiveMetric']['Value']))

In [None]:
# Re-list the candidates (no sort order is requested here).
candidates = automl.list_candidates(
    job_name=auto_ml_job_name)

# NOTE(review): `best_candidate` is only assigned when the list is non-empty;
# the cells below read it unconditionally.
if candidates != []:
    best_candidate = automl.best_candidate(
        ### BEGIN SOLUTION - DO NOT delete this comment for grading 
        # purposes
        job_name=auto_ml_job_name) # Replace None
        ### END SOLUTION - DO NOT delete this comment for grading purposes
    # `default=str` stringifies values that json cannot serialize natively.
    print(
        json.dumps(best_candidate, indent=4, sort_keys=True, default=str))

In [None]:
# Poll until the best candidate carries a 'CandidateName' field.
while 'CandidateName' not in best_candidate:
    best_candidate = automl.best_candidate(job_name=auto_ml_job_name)
    print('[INFO] Autopilot Job is generating BestCandidate '
          'CandidateName. Please wait.')
    print(
        json.dumps(best_candidate, indent=4, sort_keys=True, default=str))
    # Only the `time` module is imported, so a bare `sleep` would raise
    # NameError.
    time.sleep(10)

print('[OK] BestCandidate CandidateName generated.')

In [None]:
# Poll until the best candidate carries its final objective metric.
while 'FinalAutoMLJobObjectiveMetric' not in best_candidate:
    best_candidate = automl.best_candidate(job_name=auto_ml_job_name)
    print('[INFO] Autopilot Job is generating BestCandidate '
          'FinalAutoMLJobObjectiveMetric. Please wait.')
    print(
        json.dumps(best_candidate, indent=4, sort_keys=True, default=str))
    # Only the `time` module is imported, so a bare `sleep` would raise
    # NameError.
    time.sleep(10)

print('[OK] BestCandidate FinalAutoMLJobObjectiveMetric generated.')

In [None]:
# Report the best candidate's name together with its objective metric.
best_candidate_identifier = best_candidate['CandidateName']
print('Candidate name: {}'.format(best_candidate_identifier))
print('Metric name: {}'.format(
    best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName']))
print('Metric value: {}'.format(
    best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

# 7. Review all output in S3 bucket

In [None]:
# Render a link to the S3 console filtered to this job's 'autopilot/' prefix.
display(HTML(
    '<b>Review all <a target="blank" href="'
    'https://s3.console.aws.amazon.com/s3/buckets/{}?region={}'
    '&prefix=autopilot/{}/">output in S3</a></b>'.format(
        bucket, region, auto_ml_job_name)))

# 8. Deploy and test best candidate model

In [None]:
# Keys passed to `automl.deploy` as `inference_response_keys`.
inference_response_keys = ['predicted_label', 'probability']

In [None]:
# Deploy the best candidate behind a single ml.m5.large endpoint instance,
# using a JSON serializer/deserializer on the returned predictor.
autopilot_model = automl.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    candidate=best_candidate,
    inference_response_keys=inference_response_keys,
    predictor_cls=sagemaker.predictor.Predictor,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer())

# The closing parenthesis was missing here, which made the cell a SyntaxError.
print(f'\nEndpoint name: {autopilot_model.endpoint_name}')

In [None]:
# Render a link to the deployed endpoint in the SageMaker console. The
# literals need the `f` prefix; without it `{region}` and the endpoint name
# are emitted verbatim and the link is broken.
display(HTML(
    f'<b>Review <a target="blank" href="https://console.aws.amazon.com/'
    f'sagemaker/home?region={region}#/endpoints/'
    f'{autopilot_model.endpoint_name}">SageMaker REST endpoint</a></b>'))

In [None]:
# Send a few sample reviews to the deployed endpoint through the low-level
# runtime client and print the first field of each CSV response.
sm_runtime = boto3.client('sagemaker-runtime')
review_list = ['This product is great!',
               'OK, but not great.',
               'This is not the right product.']

for review in review_list:
    # Remove commas from the review since the input is passed as CSV.
    review = review.replace(',', '')
    response = sm_runtime.invoke_endpoint(
        EndpointName=autopilot_model.endpoint_name, # endpoint name
        ContentType='text/csv', # type of input data
        Accept='text/csv', # type of the inference in the response
        Body=review) # review text
    response_body = (
        response['Body'].read().decode('utf-8').strip().split(','))
    # The `f` prefix was missing, so the placeholder was printed literally
    # instead of the predicted class.
    print('Review: ', review, f' Predicted class: {response_body[0]}')
print('(-1 = Negative, 0=Neutral, 1=Positive)')

In [None]:
# Copy this notebook to the S3 bucket under the learner file name.
!aws s3 cp ./C1_W3_Assignment.ipynb s3://$bucket/C1_W3_Assignment_Learner.ipynb