In [1]:
conda install -c conda-forge ipywidgets

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.8.4
  latest version: 22.9.0

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/ec2-user/SageMaker/diabetes-classifier/.myenv/miniconda/envs/diabetes-classifier-env-38

  added / updated specs:
    - ipywidgets


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    debugpy-1.6.3              |   py38hfa26641_1         2.0 MB  conda-forge
    ipykernel-6.14.0           |   py38h7f3c49e_0         186 KB  conda-forge
    ipython-8.4.0              |   py38h578d9bd_0         1.1 MB  conda-forge
    jupyter_client-7.4.7       |     pyhd8ed1ab_0          92 KB  conda-forge
    jupyter_core-5.0.0         |   py38h578d9bd_0          85 KB  conda-forge
    platformdirs-2.5.2         |     pyhd8ed1ab_1          16 KB  

In [2]:
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install warnings
%pip install pickle
%pip install xgboost
%pip install scikit-learn
%pip install pandas_profiling
%pip install --upgrade sagemaker

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting matplotlib
  Downloading matplotlib-3.6.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.4.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting cycler>=0.10
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting pillow>=6.2.0

## Importando Libs

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
import xgboost as xgb
import os
import boto3
import re
import sagemaker
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, cross_validate, RepeatedStratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import recall_score, accuracy_score, precision_score, roc_auc_score, f1_score

## Definindo Parâmetros

In [4]:
# Parametros de Saída do Notebook
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

sns.set_theme(style="whitegrid")

params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'x-large',
         'ytick.labelsize':'x-large'}
plt.rcParams.update(params)

# Integração AWS
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
s3 = boto3.client("s3")

# Buckets
bucket_name = "diabetes-classifier-data-store"
diabetes_raw = "diabetes_binary_health_indicators_BRFSS2015.csv"

## Manipulando a Base de Dados

### Balanceando e salvando em treino e teste

In [5]:
# Carregando Base
s3.download_file(bucket_name, f"data/raw/{diabetes_raw}", f"./diabetes/{diabetes_raw}")

diabetes = pd.read_csv(f"./diabetes/{diabetes_raw}")

In [6]:
# Exibindo o grau de desbalanceamento
diabetes['Diabetes_binary'].value_counts(normalize = True)

0.0    0.860667
1.0    0.139333
Name: Diabetes_binary, dtype: float64

In [7]:
# Balanceando dados
mask = diabetes['Diabetes_binary'] == 1


diabetes_pos = diabetes[mask]
diabetes_neg = diabetes[~mask]


diabetes_downsampled = resample(diabetes_neg, 
                                replace = False, 
                                n_samples = len(diabetes_pos), 
                                random_state = 13)


diabetes_balanced = pd.concat([diabetes_pos, 
                               diabetes_downsampled])

diabetes_balanced['Diabetes_binary'].value_counts(normalize = True)

0.0    0.5
1.0    0.5
Name: Diabetes_binary, dtype: float64

In [8]:
# Salvando base balanceada no S3
file_name = "./diabetes/diabetes_balanced.csv"
diabetes_balanced.to_csv(file_name, index = False)
s3.upload_file(file_name, bucket_name, f"data/raw/diabetes_balanced.csv")

In [61]:
# Separando conjunto de dados em treino e teste
diabetes_train, diabetes_test = train_test_split(diabetes_balanced, test_size=0.2, random_state=13, stratify = diabetes_balanced['Diabetes_binary'])

In [62]:
# Salvando conjunto de treino
file_name = './diabetes/diabetes_train.csv'
diabetes_train.to_csv(file_name, index = False)
s3.upload_file(file_name, bucket_name, f"data/raw/diabetes_train.csv")


# Salvando conjunto de teste
file_name = './diabetes/diabetes_test.csv'
diabetes_test.to_csv('./diabetes/diabetes_test.csv', index = False)
s3.upload_file(file_name, bucket_name, f"data/raw/diabetes_test.csv")

### Criando pipeline de transformação dos dados

In [69]:
# Separando features por tipo

# Features  Categoricas
cat_vars = ['GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']

# Features Numericas
num_vars = ['MentHlth', 'BMI']

# Features Binárias
bin_vars = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex' ]

In [70]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('robust_scaler', RobustScaler())
])

bin_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))  
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  
    ('one-hot-encoding', OneHotEncoder(handle_unknown='ignore'))
])



# (name, transformer, columns)
preprocessed_pipeline = ColumnTransformer([
    ('numerical', num_pipeline, num_vars),
    ('binary', bin_pipeline, bin_vars),
    ('categorical', cat_pipeline, cat_vars)
])

### Tratando base de treino para o Linear Learner do Sagemaker

In [63]:
diabetes_train = "diabetes_train.csv"
s3.download_file(bucket_name, f"data/raw/{diabetes_train}", f"./diabetes/{diabetes_train}")

diabetes_train = pd.read_csv( f"./diabetes/{diabetes_train}")

In [66]:
y_train = diabetes_train['Diabetes_binary']

In [67]:
diabetes_train = diabetes_train.drop(columns =['Diabetes_binary'])

In [68]:
print(f"Shape X: {diabetes_train.shape} & Shape y: {y_train.shape}")

Shape X: (56553, 21) & Shape y: (56553,)


In [79]:
X_train = preprocessed_pipeline.fit_transform(diabetes_train)

In [105]:
X_train.shape

(56553, 110)

In [123]:
y_train.shape

(56553,)

In [139]:
df_y_train = y_train.to_frame()
df_y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56553 entries, 0 to 56552
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Diabetes_binary  56553 non-null  float64
dtypes: float64(1)
memory usage: 441.9 KB


In [138]:
df_X_train = pd.DataFrame(data=X_train.toarray(), columns=preprocessed_pipeline.get_feature_names_out().tolist())
df_X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56553 entries, 0 to 56552
Columns: 110 entries, numerical__MentHlth to categorical__Income_8.0
dtypes: float64(110)
memory usage: 47.5 MB


In [142]:
df_X_train_final = pd.concat([df_y_train.reset_index(drop=True),df_X_train.reset_index(drop=True)], axis=1)
df_X_train_final.head()

Unnamed: 0,Diabetes_binary,numerical__MentHlth,numerical__BMI,binary__HighBP,binary__HighChol,binary__CholCheck,binary__Smoker,binary__Stroke,binary__HeartDiseaseorAttack,binary__PhysActivity,...,categorical__Education_5.0,categorical__Education_6.0,categorical__Income_1.0,categorical__Income_2.0,categorical__Income_3.0,categorical__Income_4.0,categorical__Income_5.0,categorical__Income_6.0,categorical__Income_7.0,categorical__Income_8.0
0,0.0,0.0,0.75,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,15.0,-0.25,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,-0.125,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [143]:
# Salvando arquivo de treino do linear learning
file_name = './diabetes/df_X_train_final.csv'
df_X_train_final.to_csv(file_name, index = False, header = False)
s3.upload_file(file_name, bucket_name, f"data/train/diabetes_train.csv")

### Tratando base de test para o Linear Learner do Sagemaker

In [149]:
diabetes_test = "diabetes_test.csv"
s3.download_file(bucket_name, f"data/raw/{diabetes_test}", f"./diabetes/{diabetes_test}")

diabetes_test = pd.read_csv( f"./diabetes/{diabetes_test}")

In [150]:
y_test = diabetes_test['Diabetes_binary']

In [151]:
diabetes_test = diabetes_test.drop(columns =['Diabetes_binary'])

In [152]:
X_test = preprocessed_pipeline.fit_transform(diabetes_test)

In [153]:
df_y_test = y_test.to_frame()
df_y_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14139 entries, 0 to 14138
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Diabetes_binary  14139 non-null  float64
dtypes: float64(1)
memory usage: 110.6 KB


In [154]:
df_X_test = pd.DataFrame(data=X_test.toarray(), columns=preprocessed_pipeline.get_feature_names_out().tolist())
df_X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14139 entries, 0 to 14138
Columns: 110 entries, numerical__MentHlth to categorical__Income_8.0
dtypes: float64(110)
memory usage: 11.9 MB


In [155]:
df_X_test_final = pd.concat([df_y_test.reset_index(drop=True),df_X_test.reset_index(drop=True)], axis=1)
df_X_test_final.head()

Unnamed: 0,Diabetes_binary,numerical__MentHlth,numerical__BMI,binary__HighBP,binary__HighChol,binary__CholCheck,binary__Smoker,binary__Stroke,binary__HeartDiseaseorAttack,binary__PhysActivity,...,categorical__Education_5.0,categorical__Education_6.0,categorical__Income_1.0,categorical__Income_2.0,categorical__Income_3.0,categorical__Income_4.0,categorical__Income_5.0,categorical__Income_6.0,categorical__Income_7.0,categorical__Income_8.0
0,0.0,10.0,-0.375,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,-1.125,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,15.0,0.875,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.75,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,-0.625,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [156]:
# Salvando arquivo de teste do linear learning
file_name = './diabetes/df_X_test_final.csv'
df_X_test_final.to_csv(file_name, index = False, header = False)
s3.upload_file(file_name, bucket_name, f"data/test/diabetes_test.csv")

## Preparando esteira de treino

In [186]:
# Criando apontamento dos dados do S3 para a função do fit()
output_bucket = "diabetes-classifier-data-store"
output_prefix = "data"

s3_train_data = f"s3://{output_bucket}/{output_prefix}/train"
print(f"training files will be taken from: {s3_train_data}")
s3_validation_data = f"s3://{output_bucket}/{output_prefix}/test"
print(f"validation files will be taken from: {s3_validation_data}")
output_location = f"s3://{output_bucket}/model/output"
print(f"training artifacts output location: {output_location}")

# Criando conexão de acesso para o SageMaker
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

training files will be taken from: s3://diabetes-classifier-data-store/data/train
validation files will be taken from: s3://diabetes-classifier-data-store/data/test
training artifacts output location: s3://diabetes-classifier-data-store/model/output


In [187]:
# Baixando imagem do container do Linear Learner
from sagemaker.image_uris import retrieve

container = retrieve("linear-learner", boto3.Session().region_name, version="1")
print(container)
deploy_amt_model = True

382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1


In [188]:
%%time
from time import gmtime, strftime

sess = sagemaker.Session()

job_name = "linear-learner-SVM-diabetes-classification" + strftime("%Y%m%d-%H-%M-%S", gmtime())
print("Training job", job_name)

linear = sagemaker.estimator.Estimator(
    container,
    role,
    input_mode="File",
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=output_location,
    sagemaker_session=sess,
)

# Referencia do SVM no SageMaker usando o Linear Learner
# https://aws.amazon.com/pt/blogs/machine-learning/train-faster-more-flexible-models-with-amazon-sagemaker-linear-learner/

linear.set_hyperparameters(
    epochs=40,
    loss="hinge_loss",
    predictor_type="binary_classifier",
    l1=0.1
)

Training job linear-learner-SVM-diabetes-classification20221119-18-14-11
CPU times: user 12.3 ms, sys: 0 ns, total: 12.3 ms
Wall time: 11.4 ms


### Treinando o modelo no Container

In [189]:
%%time
linear.fit(inputs={"train": train_data, "validation": validation_data}, job_name=job_name)

2022-11-19 18:14:13 Starting - Starting the training job...ProfilerReport-1668881653: InProgress
......
2022-11-19 18:15:38 Starting - Preparing the instances for training.........
2022-11-19 18:17:11 Downloading - Downloading input data...
2022-11-19 18:17:38 Training - Downloading the training image............
2022-11-19 18:19:39 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[11/19/2022 18:19:35 INFO 139887154079552] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07

### Fazendo o deploy do modelo treinado em um enpoint para consumo

In [190]:
%%time
# Criando endpoint com o modelo treinado

linear_predictor = linear.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge")
print(f"\ncreated endpoint: {linear_predictor.endpoint_name}")

---------!
created endpoint: linear-learner-2022-11-19-18-23-43-207
CPU times: user 383 ms, sys: 8.08 ms, total: 391 ms
Wall time: 4min 32s


In [191]:
# Configurando serializadores para enviar os novos dados para o modelo via API
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

linear_predictor.serializer = CSVSerializer()
linear_predictor.deserializer = JSONDeserializer()

## Inferência no Endpoint criado

In [192]:
%%time
import json
from itertools import islice
import math
import struct
import random


diabetes_test = "diabetes_test.csv"
s3.download_file(bucket_name, f"data/test/{diabetes_test}", f"./diabetes/{diabetes_test}")


# Buscando sample aleatória no conjunto de teste
test_data = [l for l in open( f"./diabetes/{diabetes_test}", "r")]
sample = random.choice(test_data).split(",")
actual_age = sample[0]
payload = sample[1:]  # removing actual age from the sample
payload = ",".join(map(str, payload))

# Invocando a função predict passando o payload com os dados
result = linear_predictor.predict(payload)

# Consumindo a resposta do endpoint
result = round(float(result["predictions"][0]["score"]))

print(f"Ground Truth: {actual_age}\nPrediction: {result}")

Ground Truth: 0.0
Prediction: 1
CPU times: user 93.6 ms, sys: 8.08 ms, total: 102 ms
Wall time: 434 ms


### Delentando o Endpoint criado

In [173]:
# sagemaker.Session().delete_endpoint(linear_predictor.endpoint_name)
# print(f"deleted {linear_predictor.endpoint_name} successfully!")

deleted linear-learner-2022-11-19-16-53-55-519 successfully!
