In [2]:
# Instalação das bibliotecas
!pip install -U pip
!pip install sagemaker==2.15.3
!pip install boto3==1.16.2
!pip install pandas==1.0.1
!pip install numpy==1.18.1



In [3]:
import boto3

sessao_boto = boto3.Session(region_name="us-east-1")

In [4]:
from sagemaker import get_execution_role

ROLE = get_execution_role()

print(f'Role de Execução: {ROLE}')

Role de Execução: arn:aws:iam::962752222089:role/service-role/AWSGlueServiceSageMakerNotebookRole-gdelt


In [5]:
import sagemaker

sessao_sagemaker = sagemaker.Session(boto_session=sessao_boto)

In [6]:
BUCKET ='sagemaker-alura-engenharia-machine-learning'

PREFIXO_DADOS = 'dados'

PREFIXO_MODELOS = 'modelos'

REGIAO = sessao_sagemaker.boto_region_name

SEMENTE = 42

In [7]:
# Vamos colocar os caminhos em um dicionario
# apenas para manter a simplicidade e a consistencia
# dos caminhos. Em caso de mudança, essa será refletida
# em todos os caminhos.

dict_caminho_dados = {
    "treino": "dados/df_bytebank_treino.csv",
    "teste": "dados/df_bytebank_teste.csv",
    "validacao": "dados/df_bytebank_validacao.csv",    
}

In [8]:
# Upload dos dados no S3. Uma observação é que esse
# objeto do tipo "upload_data" ele mantem a referencia de
# metadados do S3 e pode ser usado como "canal" de treinamento#
# para qualquer estimador

bytebank_treino_caminho \
    = sessao_sagemaker.upload_data(
        path=dict_caminho_dados['treino'],
        bucket=BUCKET,
        key_prefix=f'{PREFIXO_DADOS}/treino')

bytebank_teste_caminho \
    = sessao_sagemaker.upload_data(
        path=dict_caminho_dados['teste'],
        bucket=BUCKET,
        key_prefix=f'{PREFIXO_DADOS}/teste')

bytebank_validacao_caminho \
    = sessao_sagemaker.upload_data(
        path=dict_caminho_dados['validacao'],
        bucket=BUCKET,
        key_prefix=f'{PREFIXO_DADOS}/validacao')

In [9]:
# Aqui são os canais que passaremos via training input para
# o estimador
s3_input_treino \
    = sagemaker.TrainingInput(s3_data=bytebank_treino_caminho,
                                          content_type='csv')

s3_input_teste \
    = sagemaker.TrainingInput(s3_data=bytebank_teste_caminho,
                                         content_type='csv')

In [10]:
# Aqui chamamos o estimador do Scikit-Learn no Sage Maker
from sagemaker.sklearn.estimator import SKLearn

sklearn_estimator = SKLearn(
    framework_version='0.20.0',
    py_version='py3',
    entry_point='script_customizado_sage_maker.py',
    code_location=f's3://{BUCKET}/{PREFIXO_MODELOS}',
    hyperparameters={'n_estimators': 100},
    role=ROLE,
    instance_type='ml.c4.xlarge',
    output_path=f's3://{BUCKET}/{PREFIXO_MODELOS}',
    base_job_name='codigoCustomizadoEndpoint',
    sagemaker_session=sessao_sagemaker)

In [11]:
# Realizamos o ajuste do modelo invocando o nosso estimador
sklearn_estimator.fit({'train': s3_input_treino})

2021-01-07 12:17:13 Starting - Starting the training job...
2021-01-07 12:17:15 Starting - Launching requested ML instances......
2021-01-07 12:18:27 Starting - Preparing the instances for training......
2021-01-07 12:19:20 Downloading - Downloading input data...
2021-01-07 12:19:58 Training - Downloading the training image..[34m2021-01-07 12:20:18,532 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-01-07 12:20:18,534 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-01-07 12:20:18,560 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-01-07 12:20:18,790 botocore.utils INFO     IMDS ENDPOINT: http://169.254.169.254/[0m
[34m2021-01-07 12:20:26,765 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-01-07 12:20:26,778 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed


2021-01-07 12:20:41 Uploading - Uploading generated training model
2021-01-07 12:20:41 Completed - Training job completed
Training seconds: 81
Billable seconds: 81


In [12]:
# Deploy do modelo em um endpoint e aqui instanciamos o 
# serializador para indicarmos que vamos passar um 
# input do tipo CSV. 
from sagemaker.serializers import CSVSerializer
serializer = CSVSerializer()

predictor = sklearn_estimator.deploy(instance_type='ml.c4.xlarge',
                                     initial_instance_count=1,
                                     endpoint_name='codigoCustomizadoEndpoint',
                                     serializer=serializer
                                    )

-------------!

In [13]:
# Vamos usar os dados que temos na base de validacão
# para realizar algumas inferencias
import pandas as pd

df_validacao = pd.read_csv(dict_caminho_dados['validacao'])
X_validacao = df_validacao.iloc[:,1:]
y_validacao = df_validacao.iloc[:,0]

In [14]:
df_validacao.head(5)

Unnamed: 0,INADIMPLENTE,CREDITO_CONCEDIDO,GENERO,ESCOLARIDADE,ESTADO_CIVIL,IDADE,ATRASO_PARCELA_M_0,ATRASO_PARCELA_M_2,ATRASO_PARCELA_M_3,ATRASO_PARCELA_M_4,...,SALDO_PAGO_PARCELA_3,SALDO_PAGO_PARCELA_4,SALDO_PAGO_PARCELA_5,SALDO_PAGO_PARCELA_6,VALOR_PAGAMENTO_ANTERIOR_1,VALOR_PAGAMENTO_ANTERIOR_2,VALOR_PAGAMENTO_ANTERIOR_3,VALOR_PAGAMENTO_ANTERIOR_4,VALOR_PAGAMENTO_ANTERIOR_5,VALOR_PAGAMENTO_ANTERIOR_6
0,0,10000,1,2,2,24,2,0,0,0,...,4810,7546,9260,10000,3000,2000,3000,2000,1000,0
1,0,360000,1,2,1,45,2,2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
2,0,280000,2,1,2,28,-1,-1,-1,0,...,10002,13476,23572,14603,3658,10020,10000,11000,14603,3819
3,1,52743,1,1,2,47,0,0,0,0,...,13507,11462,8191,9460,1091,10505,1603,658,3783,1790
4,1,243160,1,2,2,40,2,0,0,0,...,241034,240190,238301,238645,9454,9366,12440,9034,6587,11570


In [15]:
X_validacao = df_validacao.iloc[2:3, 1:]
X_validacao

Unnamed: 0,CREDITO_CONCEDIDO,GENERO,ESCOLARIDADE,ESTADO_CIVIL,IDADE,ATRASO_PARCELA_M_0,ATRASO_PARCELA_M_2,ATRASO_PARCELA_M_3,ATRASO_PARCELA_M_4,ATRASO_PARCELA_M_5,...,SALDO_PAGO_PARCELA_3,SALDO_PAGO_PARCELA_4,SALDO_PAGO_PARCELA_5,SALDO_PAGO_PARCELA_6,VALOR_PAGAMENTO_ANTERIOR_1,VALOR_PAGAMENTO_ANTERIOR_2,VALOR_PAGAMENTO_ANTERIOR_3,VALOR_PAGAMENTO_ANTERIOR_4,VALOR_PAGAMENTO_ANTERIOR_5,VALOR_PAGAMENTO_ANTERIOR_6
2,280000,2,1,2,28,-1,-1,-1,0,0,...,10002,13476,23572,14603,3658,10020,10000,11000,14603,3819


In [16]:
# Input que iremos passar para o nosso preditor
# aqui no momento do treinamento
csv_text = '140000,2,2,1,37,0,0,0,0,0,0,58081,51013,54343,27537,9751,12569,5000,5000,5000,3000,3000,5000'

print(predictor.predict(csv_text))

[array(['0'], dtype=object) array([0.751342, 0.248658])]


In [17]:
# Podemos realizar a chamada do nosso endpoint
# atraves de do objeto do tipo "Predictor"

import boto3
import json
import numpy as np
from sagemaker.serializers import CSVSerializer
from sagemaker.predictor import Predictor

serializer = CSVSerializer()

predictor = Predictor(
    endpoint_name='codigoCustomizadoEndpoint',
    sagemaker_session=sessao_sagemaker,
    serializer=serializer)

print(predictor.predict(csv_text))

b'"[[{\\"features\\":[\\"0\\"]},{\\"features\\":[0.7513419953,0.2486580047]}]]"'


In [18]:
# Podemos realizar a chamada do nosso endpoint
# usando o "invoke_endpoint"

import sagemaker

sessao_sagemaker = sagemaker.Session(boto_session=sessao_boto)

runtime = boto3.Session().client('sagemaker-runtime')
 
csv_text = '140000,2,2,1,37,0,0,0,0,0,0,58081,51013,54343,27537,9751,12569,5000,5000,5000,3000,3000,5000'

nome_endpoint='codigoCustomizadoEndpoint'

response = runtime.invoke_endpoint(EndpointName=nome_endpoint,
                                   ContentType='text/csv',
                                   Body=csv_text)

result = json.loads(response['Body'].read().decode())

result

'[[{"features":["0"]},{"features":[0.7513419953,0.2486580047]}]]'

In [19]:
# Transformacao em batch
# Definir um Transformer do tipo SKLearn
# a partir de um estimador do SKLearn
transformer \
    = sklearn_estimator.transformer(instance_count=1,
                                    instance_type='ml.c4.xlarge',
                                    output_path='s3://sagemaker-alura-engenharia-machine-learning/dados/validacao/batch_input/')

In [20]:
df_validacao.shape

(6990, 24)

In [21]:
df_validacao.head(1)

Unnamed: 0,INADIMPLENTE,CREDITO_CONCEDIDO,GENERO,ESCOLARIDADE,ESTADO_CIVIL,IDADE,ATRASO_PARCELA_M_0,ATRASO_PARCELA_M_2,ATRASO_PARCELA_M_3,ATRASO_PARCELA_M_4,...,SALDO_PAGO_PARCELA_3,SALDO_PAGO_PARCELA_4,SALDO_PAGO_PARCELA_5,SALDO_PAGO_PARCELA_6,VALOR_PAGAMENTO_ANTERIOR_1,VALOR_PAGAMENTO_ANTERIOR_2,VALOR_PAGAMENTO_ANTERIOR_3,VALOR_PAGAMENTO_ANTERIOR_4,VALOR_PAGAMENTO_ANTERIOR_5,VALOR_PAGAMENTO_ANTERIOR_6
0,0,10000,1,2,2,24,2,0,0,0,...,4810,7546,9260,10000,3000,2000,3000,2000,1000,0


In [22]:
df_validacao.iloc[:, 1:]

Unnamed: 0,CREDITO_CONCEDIDO,GENERO,ESCOLARIDADE,ESTADO_CIVIL,IDADE,ATRASO_PARCELA_M_0,ATRASO_PARCELA_M_2,ATRASO_PARCELA_M_3,ATRASO_PARCELA_M_4,ATRASO_PARCELA_M_5,...,SALDO_PAGO_PARCELA_3,SALDO_PAGO_PARCELA_4,SALDO_PAGO_PARCELA_5,SALDO_PAGO_PARCELA_6,VALOR_PAGAMENTO_ANTERIOR_1,VALOR_PAGAMENTO_ANTERIOR_2,VALOR_PAGAMENTO_ANTERIOR_3,VALOR_PAGAMENTO_ANTERIOR_4,VALOR_PAGAMENTO_ANTERIOR_5,VALOR_PAGAMENTO_ANTERIOR_6
0,10000,1,2,2,24,2,0,0,0,0,...,4810,7546,9260,10000,3000,2000,3000,2000,1000,0
1,360000,1,2,1,45,2,2,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
2,280000,2,1,2,28,-1,-1,-1,0,0,...,10002,13476,23572,14603,3658,10020,10000,11000,14603,3819
3,52743,1,1,2,47,0,0,0,0,0,...,13507,11462,8191,9460,1091,10505,1603,658,3783,1790
4,243160,1,2,2,40,2,0,0,0,0,...,241034,240190,238301,238645,9454,9366,12440,9034,6587,11570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6985,500000,2,1,1,41,0,0,0,0,0,...,197284,199489,201484,204017,8501,8001,6701,6321,6401,6379
6986,200000,1,1,1,44,0,0,0,0,0,...,79418,80608,81297,83602,4001,3501,3501,3001,4001,4001
6987,20000,1,2,1,30,0,1,1,1,1,...,8191,8544,9226,8933,1131,1547,638,972,0,744
6988,30000,2,2,1,34,1,2,1,1,0,...,34554,33584,32677,31640,1127,1718,605,1092,1970,2334


In [23]:
# Remover a coluna de inadimplente e o header
df_validacao.iloc[:, 1:].to_csv('dados/df_bytebank_validacao_sem_header.csv',
                                header=False,
                                index=None)

In [24]:
# Upload input data from local filesystem to S3
validacao_em_batch \
    = sessao_sagemaker.upload_data(
        path='dados/df_bytebank_validacao_sem_header.csv',
        bucket=BUCKET,
        key_prefix=f'{PREFIXO_DADOS}/validacao/batch_input')

In [26]:
# Execução do Job de Transformação
# Usando o transformer que criamos, vamos
# subir os dados no S3 na pasta de validação em batch
# Vamos inicializasr a execução e esperar ate as finalização

transformer.transform(validacao_em_batch,
                      job_name='jobBatchTransformacaoByte',
                      content_type='text/csv')

print('Esperando a finalização do job de treinamento: ' \
      + transformer.latest_transform_job.job_name)

transformer.wait()

...............................[32m2021-01-07T12:43:25.536:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34m2021/01/07 12:43:22 [crit] 13#13: *1 connect() to unix:/tmp/gunicorn.sock failed (2: No such file or directory) while connecting to upstream, client: 169.254.255.130, server: , request: "GET /ping HTTP/1.1", upstream: "http://unix:/tmp/gunicorn.sock:/ping", host: "169.254.255.131:8080"[0m
[34m169.254.255.130 - - [07/Jan/2021:12:43:22 +0000] "GET /ping HTTP/1.1" 502 182 "-" "Go-http-client/1.1"[0m
[34m2021/01/07 12:43:22 [crit] 13#13: *3 connect() to unix:/tmp/gunicorn.sock failed (2: No such file or directory) while connecting to upstream, client: 169.254.255.130, server: , request: "GET /ping HTTP/1.1", upstream: "http://unix:/tmp/gunicorn.sock:/ping", host: "169.254.255.131:8080"[0m
[34m169.254.255.130 - - [07/Jan/2021:12:43:22 +0000] "GET /ping HTTP/1.1" 502 182 "-" "Go-http-client/1.1"[0m
[34m2021/01/07 12:43:22 [crit


Esperando a finalização do job de treinamento: jobBatchTransformacaoByte
[32m2021-01-07T12:43:25.536:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34m2021/01/07 12:43:22 [crit] 13#13: *1 connect() to unix:/tmp/gunicorn.sock failed (2: No such file or directory) while connecting to upstream, client: 169.254.255.130, server: , request: "GET /ping HTTP/1.1", upstream: "http://unix:/tmp/gunicorn.sock:/ping", host: "169.254.255.131:8080"[0m
[34m169.254.255.130 - - [07/Jan/2021:12:43:22 +0000] "GET /ping HTTP/1.1" 502 182 "-" "Go-http-client/1.1"[0m
[34m2021/01/07 12:43:22 [crit] 13#13: *3 connect() to unix:/tmp/gunicorn.sock failed (2: No such file or directory) while connecting to upstream, client: 169.254.255.130, server: , request: "GET /ping HTTP/1.1", upstream: "http://unix:/tmp/gunicorn.sock:/ping", host: "169.254.255.131:8080"[0m
[34m169.254.255.130 - - [07/Jan/2021:12:43:22 +0000] "GET /ping HTTP/1.1" 502 182 "-" "Go-http-cli

[35m169.254.255.130 - - [07/Jan/2021:12:43:24 +0000] "GET /ping HTTP/1.1" 502 182 "-" "Go-http-client/1.1"[0m
[35m[2021-01-07 12:43:24 +0000] [36] [INFO] Starting gunicorn 20.0.4[0m
[35m[2021-01-07 12:43:24 +0000] [36] [INFO] Listening at: unix:/tmp/gunicorn.sock (36)[0m
[35m[2021-01-07 12:43:24 +0000] [36] [INFO] Using worker: gevent[0m
[35m[2021-01-07 12:43:24 +0000] [39] [INFO] Booting worker with pid: 39[0m
[35m[2021-01-07 12:43:24 +0000] [43] [INFO] Booting worker with pid: 43[0m
[35m[2021-01-07 12:43:24 +0000] [44] [INFO] Booting worker with pid: 44[0m
[35m[2021-01-07 12:43:24 +0000] [45] [INFO] Booting worker with pid: 45[0m
[35m2021-01-07 12:43:24,889 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
  import imp[0m
[35m169.254.255.130 - - [07/Jan/2021:12:43:25 +0000] "GET /ping HTTP/1.1" 200 0 "-" "Go-http-client/1.1"[0m
[35m169.254.255.130 - - [07/Jan/2021:12:43:25 +0000] "GET /execution-parameters HTTP/1.1" 404 232 "-" "Go-

In [27]:
caminho_job_transformacao = transformer.output_path
caminho_job_transformacao

's3://sagemaker-alura-engenharia-machine-learning/dados/validacao/batch_input/'

In [34]:
# Deletar o endpoint, configurações e modelo
predictor.delete_endpoint()
predictor.delete_model()

In [None]:
# Ref: https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_iris/scikit_learn_estimator_example_with_batch_transform.ipynb
# Ref: https://github.com/aws/sagemaker-scikit-learn-container/blob/master/src/sagemaker_sklearn_container/serving.py#L56
# Ref: Sage Maker do not deals with headers: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
# Ref: Sage Maker Examples: https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/using_sklearn.html#deploy-a-scikit-learn-model
# Ref: https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb
# Ref: https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb
# Ref: https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/using_sklearn.html#process-input
# Ref: https://github.com/aws/sagemaker-scikit-learn-container/blob/master/src/sagemaker_sklearn_container/serving.py#L56
# Ref: https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_iris/scikit_learn_estimator_example_with_batch_transform.ipynb