# Configuração

Bibliotecas

In [2]:
import sagemaker
import boto3
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
import pandas as pd
from sagemaker.serializers import CSVSerializer
import sklearn.metrics as metrics
import numpy as np

Especificação de variáveis

In [3]:
# S3 locations: the bucket URI plus the train/validation CSVs and the
# output prefix, all derived from it.
bucket = 's3://datascience-sagemaker-fernandosousa'
treinamento_arquivo = bucket + '/treinamento.csv'
validacao_arquivo = bucket + '/validacao.csv'
saida = bucket + '/saida'

# IAM role the notebook is running under.
role = sagemaker.get_execution_role()

# Instance type shared by the training job and the hosted endpoint.
tipo_instancia = 'ml.m5.large'

# Container image of the training algorithm (XGBoost, us-east-1 registry).
xgboost_container = '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'

# Prefix for the training job name.
base_name = "cobranca-xgboost"

# XGBoost hyperparameters, kept as strings exactly as handed to the estimator.
hp = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round="50",
)


# Treinamento

In [4]:
# Build the classifier (estimator).
# The job-name prefix must be passed as `base_job_name`; the previous
# `base_name=` keyword is not an Estimator parameter, so the prefix was
# not being applied to the training job name.
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type=tipo_instancia,
    volume_size=5,  # 5 GB
    output_path=saida,
    base_job_name=base_name,
    hyperparameters=hp,
)

# Point the training and validation channels at the CSV files on S3.
treinamento_input = TrainingInput(treinamento_arquivo, content_type='csv')
validacao_input = TrainingInput(validacao_arquivo, content_type='csv')

# Run the training job (blocks until the job finishes, streaming its logs).
estimator.fit({'train': treinamento_input, 'validation': validacao_input})

2022-02-26 20:17:50 Starting - Starting the training job...
2022-02-26 20:17:54 Starting - Launching requested ML instancesProfilerReport-1645906670: InProgress
.........
2022-02-26 20:19:32 Starting - Preparing the instances for training.........
2022-02-26 20:21:16 Downloading - Downloading input data
2022-02-26 20:21:16 Training - Downloading the training image..[34mArguments: train[0m
[34m[2022-02-26:20:21:31:INFO] Running standalone xgboost training.[0m
[34m[2022-02-26:20:21:31:INFO] File size need to be processed in the node: 0.29mb. Available memory size in the node: 150.24mb[0m
[34m[2022-02-26:20:21:31:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:21:31] S3DistributionType set as FullyReplicated[0m
[34m[20:21:31] 6871x7 matrix with 48097 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-02-26:20:21:31:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:21:31] S3DistributionType set as FullyReplicated

# Implantação

In [16]:
# Host the trained model behind a real-time endpoint; requests are sent as CSV.
# NOTE(review): the endpoint name says "credito" while the training job prefix
# says "cobranca" — confirm the naming is intentional.
predictor = estimator.deploy(
    endpoint_name='datascience-credito-xgboost',
    instance_type=tipo_instancia,
    initial_instance_count=1,
    serializer=CSVSerializer(),
)

-----!

In [17]:
# Display the name of the endpoint the predictor is bound to.
predictor.endpoint_name

'datascience-credito-xgboost'

# Avaliação

In [11]:
# Load the local test set.
# NOTE(review): the first column is treated as the label and the remaining
# columns as features — confirm against how teste.csv was written.
dados_teste = pd.read_csv('dados/teste.csv')

# One endpoint invocation per row: every value after the first column is sent,
# and the byte response is decoded and parsed as a single float score.
predictions = np.array([
    float(predictor.predict(row[1:]).decode('utf8'))
    for _, row in dados_teste.iterrows()
])

In [12]:
# Display the array of scores returned by the endpoint, one per test row.
predictions

array([0.53416204, 0.77128845, 0.22254397, 0.41436192, 0.65763527,
       0.49290127, 0.80171412, 0.76347291, 0.81201792, 0.33307606,
       0.53425533, 0.73168671, 0.82232177, 0.81554198, 0.46762788,
       0.20730777, 0.74300921, 0.74124682, 0.59980947, 0.13451986,
       0.08729585, 0.78243613, 0.85688877, 0.22637333, 0.89604211,
       0.23317918, 0.15839815, 0.79951286, 0.18262248, 0.75851977,
       0.10893678, 0.61620438, 0.44653279, 0.28972152, 0.29781023,
       0.43304116, 0.82223696, 0.72083086, 0.40466461, 0.76347291,
       0.64406848, 0.36924765, 0.79057747, 0.79634923, 0.3422488 ,
       0.81615406, 0.7918148 , 0.18786125, 0.13552167, 0.84721655,
       0.13524693, 0.62837237, 0.43367448, 0.7431044 , 0.81554198,
       0.89649349, 0.74287665, 0.73524344, 0.10170303, 0.79653895,
       0.62837237, 0.7577889 , 0.82660514, 0.90293145, 0.32113129,
       0.78409815, 0.75662422, 0.39831901, 0.39223835, 0.45867816,
       0.10054486, 0.70959944, 0.15140037, 0.75955325, 0.26494

In [13]:
# Threshold the predicted scores into hard 0/1 labels, then report the
# confusion matrix and the per-class precision/recall/F1.
cutoff = 0.5
y_true = dados_teste.iloc[:, 0]  # first column holds the true label
y_pred = np.where(predictions > cutoff, 1, 0)

for report in (metrics.confusion_matrix, metrics.classification_report):
    print(report(y_true, y_pred))

[[336 134]
 [108 404]]
              precision    recall  f1-score   support

           0       0.76      0.71      0.74       470
           1       0.75      0.79      0.77       512

    accuracy                           0.75       982
   macro avg       0.75      0.75      0.75       982
weighted avg       0.75      0.75      0.75       982

