In [1]:
import sagemaker
import boto3
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
import pandas as pd
from sagemaker.serializers import CSVSerializer
import sklearn.metrics as metrics
import numpy as np

In [6]:
# --- S3 layout -------------------------------------------------------------
# Everything lives under a single bucket: the two CSV inputs and the prefix
# that is later handed to the estimator as its output path.
bucket = 's3://datascience-sagemaker-fernandosousa'
treinamento = f'{bucket}/treinamento.csv'
validacao = f'{bucket}/validacao.csv'
saida = f'{bucket}/saida'

# --- Execution environment ---------------------------------------------------
# IAM role of the current SageMaker session.
role = sagemaker.get_execution_role()

# Instance type reused for both the training job and the hosted endpoint.
tipo_instancia = 'ml.m5.large'

# XGBoost algorithm image, pinned to an ECR registry in us-east-1.
container = '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'

# Name prefix handed to the estimator.
base_name = 'cobranca-xgboost'

# --- XGBoost hyperparameters -------------------------------------------------
# Values are deliberately kept as strings, exactly as they are sent to the job.
hp = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round="50",
)

In [7]:
# Configure the training job: one `tipo_instancia` instance with a 5 GB volume,
# running the XGBoost image with the hyperparameters in `hp`; model artifacts
# are written under `saida`.
#
# The job-name prefix must be passed as `base_job_name`. The previous keyword
# (`base_name`) is not an Estimator parameter, so the configured prefix was
# never applied to the training job name.
estimador = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type=tipo_instancia,
    volume_size=5,
    output_path=saida,
    base_job_name=base_name,
    hyperparameters=hp,
)

In [8]:
# Wrap both S3 CSV locations as training channels declared with the 'csv'
# content type. The order of the tuple matches the order of the targets.
treinamento_input, validacao_input = (
    TrainingInput(caminho_s3, content_type='csv')
    for caminho_s3 in (treinamento, validacao)
)

In [9]:
# Launch the training job with the two data channels: 'train' and 'validation'.
estimador.fit(dict(train=treinamento_input, validation=validacao_input))

2022-04-13 00:45:55 Starting - Starting the training job...
2022-04-13 00:46:22 Starting - Preparing the instances for trainingProfilerReport-1649810755: InProgress
.........
2022-04-13 00:47:42 Downloading - Downloading input data......
2022-04-13 00:48:42 Training - Downloading the training image..[34mArguments: train[0m
[34m[2022-04-13:00:49:10:INFO] Running standalone xgboost training.[0m
[34m[2022-04-13:00:49:10:INFO] File size need to be processed in the node: 0.29mb. Available memory size in the node: 208.03mb[0m
[34m[2022-04-13:00:49:10:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:49:10] S3DistributionType set as FullyReplicated[0m
[34m[00:49:10] 6871x7 matrix with 48097 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-04-13:00:49:10:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:49:10] S3DistributionType set as FullyReplicated[0m
[34m[00:49:10] 1964x7 matrix with 13748 entries loaded from /

In [12]:
# Host the trained model on a single-instance endpoint with a fixed name.
# Requests made through `predictor` are serialized with CSVSerializer.
predictor = estimador.deploy(
    endpoint_name='datascience-credito-xgboost-aula',
    instance_type=tipo_instancia,
    initial_instance_count=1,
    serializer=CSVSerializer(),
)


-----!

In [14]:
# Load the local test set. Column 0 is skipped when building each request
# payload and is used as the ground-truth label in the evaluation cell.
dados_teste = pd.read_csv('dados/testes.csv')

# Score the test set one row per request. Each row is sliced positionally
# (.iloc) and converted to a plain Python list before being sent, so the CSV
# payload does not depend on how the serializer indexes a label-indexed
# pandas Series. The endpoint response is decoded as UTF-8 text and parsed as
# a single float score.
predictions = np.array([
    float(predictor.predict(row.iloc[1:].tolist()).decode('utf8'))
    for _, row in dados_teste.iterrows()
])

In [None]:
# Display the raw scores returned by the endpoint, before they are thresholded.
predictions

In [None]:
# Turn scores into hard class labels: strictly above 0.5 becomes 1, everything
# else becomes 0. The name is reused because the evaluation cell reads it.
acima_do_limiar = predictions > 0.5
predictions = acima_do_limiar.astype(int)
predictions

In [18]:
# Compare the labels in the first column of the test set against the
# thresholded predictions and print the per-class precision/recall/F1 table.
rotulos_reais = dados_teste.iloc[:, 0]
relatorio = metrics.classification_report(y_true=rotulos_reais, y_pred=predictions)
print(relatorio)

              precision    recall  f1-score   support

           0       0.76      0.72      0.74       469
           1       0.75      0.79      0.77       512

    accuracy                           0.75       981
   macro avg       0.75      0.75      0.75       981
weighted avg       0.75      0.75      0.75       981

