In [22]:
import os

import numpy as np
import pandas as pd
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.serializers import CSVSerializer
from sagemaker.session import TrainingInput
from sklearn.metrics import classification_report

# --- Notebook-wide configuration ---------------------------------------
# Session object used by this notebook for SageMaker / S3 operations.
sagemaker_session = sagemaker.Session()

# S3 bucket that holds the raw CSV, and the key prefix under which the
# prepared training file is uploaded.
bucket = "datascience-sagemaker-fernandosousa"
prefix = "perceptron/cobranca"

# Instance type passed to the estimator and to the endpoint deployment.
instance_type = "ml.m5.large"

# Execution role resolved from the environment the notebook runs in.
role = sagemaker.get_execution_role()

In [23]:
# Data preparation: load the raw collections CSV from S3, derive binary
# indicator features, scale the numeric features and build the target.
# The assignment order below fixes the column order of the final frame
# (ALVO is created last, so it ends up as the last column).

# Bounds used to clamp IDADE before min-max scaling it.
IDADE_MIN, IDADE_MAX = 18, 76
# Divisor used to scale QTD_DIVIDAS (the original note says the result is
# in [0, 1], i.e. 16 is presumably the maximum value - TODO confirm).
QTD_DIVIDAS_ESCALA = 16
# TEMP_RECUPERACAO values at or below this threshold are labelled ALVO = 1.
LIMITE_TEMP_RECUPERACAO = 90

dados = pd.read_csv("s3://{}/{}".format(bucket, "Case_cobranca.csv"))

# Indicator columns; a missing category compares unequal and yields 0.
dados['CLIENTE_NOVO'] = (dados['TIPO_CLIENTE'] == 'NOVO').astype(int)
dados['CLIENTE_INVESTIDOR'] = (dados['TIPO_CLIENTE'] == 'INVESTIDOR').astype(int)
dados['EMPRESTIMO_CDC'] = (dados['TIPO_EMPRESTIMO'] == 'CDC').astype(int)
dados['EMPRESTIMO_PESSOAL'] = (dados['TIPO_EMPRESTIMO'] == 'PESSOAL').astype(int)
dados['SEXO_M'] = (dados['CD_SEXO'] == 'M').astype(int)

# Age: missing values become the minimum, values are clamped to
# [IDADE_MIN, IDADE_MAX], then min-max scaled to [0, 1].
idade_limitada = dados['IDADE'].fillna(IDADE_MIN).clip(lower=IDADE_MIN, upper=IDADE_MAX)
dados['IDADE_NORM'] = (idade_limitada - IDADE_MIN) / (IDADE_MAX - IDADE_MIN)

# Number of debts: missing values become 0, then divided by the scale.
dados['QTD_DIVIDAS_NORM'] = dados['QTD_DIVIDAS'].fillna(0) / QTD_DIVIDAS_ESCALA

# Target: 1 when TEMP_RECUPERACAO <= threshold, else 0 (missing -> 0).
dados['ALVO'] = (dados['TEMP_RECUPERACAO'] <= LIMITE_TEMP_RECUPERACAO).astype(int)

# Drop the identifier and the raw columns replaced by the derived ones.
dados = dados.drop(
    columns=['COD', 'TIPO_CLIENTE', 'TIPO_EMPRESTIMO', 'CD_SEXO',
             'IDADE', 'QTD_DIVIDAS', 'TEMP_RECUPERACAO']
)


In [24]:
# Save the prepared frame as a header-less, comma-separated file.
# The local 'data' directory is created first so this cell also works on
# a fresh checkout / fresh kernel where the directory does not exist yet.
os.makedirs('data', exist_ok=True)
np.savetxt('data/train.csv', dados.to_numpy(), delimiter=',')

# Upload the local 'data' path to S3 under the configured bucket/prefix.
# The call is the cell's last expression, so its return value is displayed.
sagemaker_session.upload_data(path='data', bucket=bucket, key_prefix=prefix)

's3://datascience-sagemaker-fernandosousa/perceptron/cobranca'

In [25]:
# Define the S3 locations SageMaker copies into the training container.
# NOTE: both channels point at the same train.csv object, so the
# "validation" channel does not hold held-out data.
train_input = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/train.csv",
    content_type="csv",
)
validation_input = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/train.csv",
    content_type="csv",
)

In [26]:
# PyTorch estimator that runs perceptron.py as the training entry point.
# The job-name prefix is given via `base_job_name`; the previous
# `base_name` keyword is not an estimator parameter, so the intended
# prefix was never applied. Hyphens are used because SageMaker job names
# do not accept underscores.
estimator = PyTorch(
    entry_point='perceptron.py',
    base_job_name='credito-perceptron',
    role=role,
    py_version='py3',
    framework_version='1.8.0',
    instance_count=1,
    instance_type=instance_type,
    # Forwarded to the entry-point script; the meaning of 'error' is
    # defined by perceptron.py.
    hyperparameters={
        'error': 0.3,
    },
)

In [27]:
# Start the training job with the "training" and "validation" channels.
estimator.fit(
    inputs={
        "training": train_input,
        "validation": validation_input,
    }
)

2022-04-22 01:26:22 Starting - Starting the training job...
2022-04-22 01:26:40 Starting - Preparing the instances for trainingProfilerReport-1650590782: InProgress
......
2022-04-22 01:27:50 Downloading - Downloading input data.........
2022-04-22 01:29:23 Training - Downloading the training image..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-04-22 01:29:29,049 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-04-22 01:29:29,053 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-04-22 01:29:29,073 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-04-22 01:29:29,082 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-04-22 01:29:29,589 sagemaker-training-toolkit INFO     No GPUs detected (normal if no g

In [28]:
# Deploy the trained model to a named real-time endpoint; requests are
# serialized as CSV.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    serializer=CSVSerializer(),
    endpoint_name='datascience-credito-perceptron',
)

------!

In [29]:
# Prepend a constant "bias" column of ones as the first feature.
# Guarded so that re-running this cell does not raise ValueError
# ("cannot insert bias, already exists").
if "bias" not in dados.columns:
    dados.insert(0, "bias", 1)

In [30]:
# Score every row against the endpoint, one request per row.
# The feature vector is every column except the last one of `dados`.
# `.item()` extracts the single value from the response explicitly;
# calling float() directly on a 1-element array is deprecated in recent
# NumPy versions.
predictions = np.array([
    float(np.asarray(predictor.predict(row[:-1].to_numpy())).item())
    for _, row in dados.iterrows()
])
predictions

array([1., 1., 1., ..., 1., 1., 0.])

In [31]:
# Per-class precision / recall / F1 of the endpoint predictions against
# the ALVO labels of the same frame that was scored.
print(
    classification_report(
        y_true=dados['ALVO'],
        y_pred=predictions,
    )
)

              precision    recall  f1-score   support

           0       0.78      0.52      0.63      4687
           1       0.67      0.86      0.75      5130

    accuracy                           0.70      9817
   macro avg       0.72      0.69      0.69      9817
weighted avg       0.72      0.70      0.69      9817



In [32]:
# Spot-check a single row: send the features of row `i` (every column
# except the last) to the endpoint and print the prediction next to that
# same row's label. Previously a hard-coded all-ones vector was scored,
# so the printed prediction did not correspond to the printed label.
i = 3
response = predictor.predict(dados.iloc[i, :-1].to_numpy())
print(f'Predição: {response}')
print(f'Real: {dados.to_numpy()[i,-1:]}')

Predição: [0.]
Real: [0.]
