# Process and Train model in 1 step

Preprocessing the images is very simple here, so it is done in the same step as training instead of being split into a separate processing container. This saves development time.

In [2]:
import sagemaker
import boto3
import os

# SageMaker session, its default bucket, and the execution role of this notebook.
sess = sagemaker.Session()
sm_bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Build one boto3 session and derive the region and both clients from it,
# instead of constructing a new session for every lookup.
boto_session = boto3.Session()
region = boto_session.region_name

# Low-level service clients for SageMaker and S3 in the session's region.
sm = boto_session.client(service_name="sagemaker", region_name=region)
s3 = boto_session.client(service_name="s3", region_name=region)

In [3]:
# Project bucket and the key prefix that holds the MNIST input data.
bucket = 'digits-recognizer-project'
dataset = 'input_data/mnist_data'

# S3 prefix (with trailing slash) handed to the training job as its input.
input_data_s3_uri = f's3://{bucket}/{dataset}/'

In [4]:
input_data_s3_uri

's3://digits-recognizer-project/input_data/mnist_data/'

Sagemaker Instances Pricing: https://aws.amazon.com/sagemaker/pricing/

In [5]:
# Instance type and number of instances requested for the training job
# (see the pricing link above when changing these).
train_instance_type = "ml.m5.large"
train_instance_count = 1

In [6]:
# Update to support TF 2.7
%pip install -U sagemaker

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting sagemaker
  Downloading sagemaker-2.83.0.tar.gz (520 kB)
     |████████████████████████████████| 520 kB 24.5 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting attrs==20.3.0
  Using cached attrs-20.3.0-py2.py3-none-any.whl (49 kB)
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.83.0-py2.py3-none-any.whl size=719395 sha256=b3a51d6fff50821571d8c5a2066f79308a97a721687330fe9dd6207270d0d4fb
  Stored in directory: /root/.cache/pip/wheels/06/1d/68/f4db3097e83a069737269b641e4468eab29b574b07cbf1a7b3
Successfully built sagemaker
Installing collected packages: attrs, sagemaker
  Attempting uninstall: attrs
    Found existing installation: attrs 19.3.0
    Uninstalling attrs-19.3.0:
      Successfully uninstalled attrs-19.3.0
  Attempting uni

In [7]:
from sagemaker.tensorflow import TensorFlow

In [12]:
# TensorFlow estimator that runs ./local_train/train.py with the instance
# type and count configured earlier in the notebook.
estimator = TensorFlow(
    entry_point='train.py',
    source_dir='./local_train',
    role=role,
    framework_version='2.7',
    py_version='py38',
    instance_type=train_instance_type,
    instance_count=train_instance_count,
)

In [13]:
# Start the training job, passing the S3 dataset prefix as the "train" input.
# wait=False: do not block this cell while the job runs; progress is checked
# through the console links generated further down.
estimator.fit(
    inputs={"train": input_data_s3_uri},
    wait=False,
)

AttributeError: type object 'EstimatorBase' has no attribute '_json_encode_hyperparameters'

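OK... this obscure error did not happen before. A possible cause is that the SageMaker SDK was upgraded with `%pip` after `sagemaker` had already been imported in this kernel, in which case restarting the kernel and re-running the notebook should help. I will ignore it for now and deploy a trained model that seemed to work.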

In [36]:
# Name of the most recent training job started by the estimator; the console
# links and S3 paths below are built from it.
training_job_name = estimator.latest_training_job.name
print(f"Training Job Name:  {training_job_name}")

Training Job Name:  tensorflow-training-2022-03-22-12-43-47-937


In [37]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Deep link to this training job in the SageMaker console.
training_job_url = "https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}".format(
    region, training_job_name
)

# target="_blank" opens the link in a new tab.
display(
    HTML(
        '<b>Review <a target="_blank" href="{}">Training Job</a> After About 5 Minutes</b>'.format(
            training_job_url
        )
    )
)

In [38]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Deep link to the CloudWatch log streams whose prefix is this training job's name.
cloudwatch_logs_url = (
    "https://console.aws.amazon.com/cloudwatch/home?region={}"
    "#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix"
).format(region, training_job_name)

# target="_blank" opens the link in a new tab.
display(
    HTML(
        '<b>Review <a target="_blank" href="{}">CloudWatch Logs</a> After About 5 Minutes</b>'.format(
            cloudwatch_logs_url
        )
    )
)

In [39]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# The training output is read from the SageMaker default bucket (`sm_bucket`)
# under the job name — that is where the copy command below takes model.tar.gz
# from — so the link must point at `sm_bucket`, not at the project bucket.
s3_output_url = "https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview".format(
    sm_bucket, training_job_name, region
)

# target="_blank" opens the link in a new tab.
display(
    HTML(
        '<b>Review <a target="_blank" href="{}">S3 Output Data</a> After The Training Job Has Completed</b>'.format(
            s3_output_url
        )
    )
)

In [None]:
# Copy the trained model archive from the SageMaker default bucket into the
# project bucket under models/model_v1.tar.gz.
!aws s3 cp s3://$sm_bucket/$training_job_name/output/model.tar.gz s3://$bucket/models/model_v1.tar.gz

# Deploy model

In [45]:
# Download the model archive from the project bucket into the working directory.
!aws s3 cp s3://$bucket/models/model_v1.tar.gz ./model.tar.gz

download: s3://digits-recognizer-project/models/model_v1.tar.gz to ./model.tar.gz


In [47]:
# Unpack the archive into ./model/ so its directory layout can be inspected.
!mkdir -p ./model/
!tar -xvzf ./model.tar.gz -C ./model/

01/
01/model_v1.h5


In [None]:
import time

from sagemaker.tensorflow import TensorFlowModel

# A timestamp suffix gives every run of this cell a distinct model name.
timestamp = int(time.time())
tensorflow_model_name = f'tensorflow-inference-{timestamp}'

model = TensorFlowModel(
    name=tensorflow_model_name,
    framework_version='2.7',
    model_data=f's3://{bucket}/models/model_v1.tar.gz',
    role=role)

# Deployment is intentionally left disabled for this archive:
# predictor = model.deploy(initial_instance_count=1, instance_type='ml.c5.large')
# An error is produced: ValueError: no SavedModel bundles found!
# This is due to improper formatting of the model files: the archive unpacked
# earlier in this notebook contains only `01/model_v1.h5`.


In [None]:
# Delete an endpoint through the low-level SageMaker client.
# NOTE(review): this assumes the endpoint is named exactly like the model
# (`tensorflow_model_name`) — confirm against the endpoint name that the
# deploy call actually produced before relying on this cell.
sm.delete_endpoint(
     EndpointName=tensorflow_model_name
)

# Deploy another trained model

Because of the error produced above, I uploaded another model archive to the S3 bucket, this time with its contents placed under a proper version directory, `1/`.

In [15]:
import time

from sagemaker.tensorflow import TensorFlowModel

# Fixed model name for the re-uploaded archive (no timestamp suffix this time).
tensorflow_model_name = 'tensorflow-inference-model-1'

model = TensorFlowModel(
    model_data=f's3://{bucket}/models/model.tar.gz',
    role=role,
    framework_version='2.7',
    name=tensorflow_model_name,
)

In [17]:
# Deploy the model to an endpoint backed by a single ml.c5.large instance.
predictor = model.deploy(instance_type='ml.c5.large', initial_instance_count=1)

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


----!

In [19]:
predictor

<sagemaker.tensorflow.model.TensorFlowPredictor at 0x7fa343f0e5d0>

## Use endpoint for predictions

We will add `Data Capture` to save the inputs. They will be saved into JSONL files. **Important note**: Captured data is not available immediately after the predictions are returned. In my case, it took around 10 minutes for that data to appear in the S3 bucket.

Recorded data can be later used to determine the quality of the algorithm by using human labelers. Another option, that can be combined or used independently, is using Augmented AI (A2I), a human-in-the-loop flow that is launched when the predictions of the model are below a certain threshold.

In [57]:
from sagemaker.model_monitor import DataCaptureConfig

In [59]:
# Captured data is written under this prefix of the project bucket.
capture_data_dir = 'captured_data'
data_capture_s3_uri = f's3://{bucket}/{capture_data_dir}/'

# Capture is enabled with a sampling percentage of 100.
data_capture_config = DataCaptureConfig(
    destination_s3_uri=data_capture_s3_uri,
    sampling_percentage=100,
    enable_capture=True,
)

# Apply the capture settings to the endpoint behind `predictor`.
predictor.update_data_capture_config(data_capture_config)

-------------!

In [20]:
from PIL import Image, ImageOps
import numpy as np

## Predict using the MNIST data

In [61]:
def get_predict(results):
    """Return the top-scoring position in ``results`` together with its score.

    Args:
        results: Scores indexable by integer position (a 1-D array of class
            scores where this notebook calls it).

    Returns:
        Tuple ``(index, score)`` where ``index`` is ``np.argmax(results)`` and
        ``score`` is ``results[index]``.
    """
    best_index = np.argmax(results)
    best_score = results[best_index]
    return best_index, best_score

In [6]:
# Relative path of the directory that holds the MNIST .npy files loaded below.
mnist_source = '../data/initial_data/'

In [7]:
# Load the test images (test_X.npy) and their labels (test_y.npy).
x_test = np.load(os.path.join(mnist_source, 'test_X.npy'))
y_test = np.load(os.path.join(mnist_source, 'test_y.npy'))

In [8]:
# Convert to float32 and divide by 255.
# NOTE: this reassigns `x_test`, so running this cell more than once divides
# the data by 255 again — reload the data before re-running it.
x_test = x_test.astype('float32') / 255.0

In [13]:
x_test.shape, x_test[0].shape

((10000, 28, 28), (28, 28))

In [63]:
y_test[:20]

array([7, 2, 1, 0, 4, 1, 4, 9, 5, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4],
      dtype=uint8)

In [64]:
# Send the first 20 test images to the endpoint, one request per image, and
# print the (predicted digit, score) pair for each.
for idx in range(20):
    response = predictor.predict(x_test[idx])
    scores = np.array(response['predictions']).squeeze()
    print(get_predict(scores))

(7, 0.202673554)
(2, 0.164640158)
(1, 0.162040204)
(0, 0.252759784)
(4, 0.152242526)
(1, 0.169381008)
(4, 0.186517209)
(9, 0.149589553)
(4, 0.13040334)
(9, 0.171408907)
(0, 0.169871584)
(6, 0.122587062)
(9, 0.171075225)
(0, 0.266866058)
(1, 0.154099584)
(3, 0.151482627)
(7, 0.1409408)
(7, 0.213926122)
(3, 0.182020247)
(9, 0.137253851)


## Predict an image from DIDA dataset
This is a different dataset that requires preprocessing the images

In [65]:
# predict an unseen image
im2 = Image.open("2_87.jpg").convert('L')
im2 = ImageOps.invert(im2)
thresh = 150
fn = lambda x: x if x > thresh else 0
im2 = im2.point(fn, mode='L')
im2 = im2.resize((28,28))
pic = np.array(im2)
print(pic.shape)
pic = pic.astype('float32') / 255.0

(28, 28)


In [66]:
# Send the preprocessed image to the endpoint and print the raw response.
results = predictor.predict(pic)
print(results)

{'predictions': [[0.0977991819, 0.0658421293, 0.0827841833, 0.122882776, 0.145602629, 0.123223945, 0.0780024678, 0.0777508393, 0.110717252, 0.0953945443]]}


In [42]:
# The response holds one row of class scores for the single image sent.
probabilities = np.asarray(results['predictions'])[0]

# Report the score of the class that was actually predicted, rather than a
# hard-coded class index that is only right when the prediction happens to be 4.
predict_number = int(np.argmax(probabilities))
print(f"predicted number: {predict_number}, probability: {probabilities[predict_number] * 100:.2f}%")

predicted number: 4, probability: 14.56%


# Shut down the endpoint to avoid incurring extra costs

In [67]:
# `Predictor.delete_endpoint` always acts on the predictor's own endpoint; its
# only parameter is the boolean `delete_endpoint_config` (default True), so the
# endpoint name must not be passed as an argument.
predictor.delete_endpoint()