# Avaliação de Um Modelo de Redes Neurais Artificiais em Neonatologia
Projeto final da Disciplina de Aprendizagem de Máquina - PPgEEC/UFRN

### Notebook 1 - CONFECÇÃO DO ARTEFATO

### 1.  Configurações Iniciais

In [1]:
# Módulos
import logging
import wandb
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import fbeta_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from dotenv import load_dotenv
import os

%matplotlib inline

### 2.  Variáveis Auxiliares

In [2]:
# Tamanho do conjunto de teste(fração do dataset que é utilizada como dados de teste)
val_size = 0.1

# Coeficiente de aleatoriedade
seed = 1618

# Variável alvo(target)
stratify = 'reanimacao'

# Artefato de entrada
input_artifact = 'mlreanimacao/clean_data.csv:latest'

# Tipo do artefato
artifact_type = 'Train'

### 3. Aquisição de Artefatos

In [3]:
# Leitura de variáveis de ambiente
load_dotenv('config.env')
HASH_WANDB = os.getenv("HASH")

In [4]:
!wandb login $HASH_WANDB --relogin

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/ericcalasans/.netrc


Configurações de Log

In [5]:
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(message)s",
                    datefmt='%d-%m-%Y %H:%M:%S')

# Objeto logging
logger = logging.getLogger()

In [31]:
run = wandb.init(project='mlreanimacao', job_type='train')

# Registra um log desta ação
logger.info("Baixando artefato e realizando leitura...")
artifact = run.use_artifact(input_artifact)
artifact_file = artifact.file()
df_total = pd.read_csv(artifact_file)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

22-07-2022 13:42:02 Baixando artefato e realizando leitura...


In [32]:
df_total

Unnamed: 0,idade_materna,fumo,alcool,psicoativas,tpp,dpp,oligoamnio,sifilis,hiv,covid_mae,dheg,dm,sexo,reanimacao
0,20.0,n_fumo,n_alcool,n_psico,n_tpp,n_dpp,n_oligo,d_sifilis,d_hiv,n_covid,n_dheg,n_dm,Masculino,nr
1,21.0,n_fumo,n_alcool,n_psico,n_tpp,n_dpp,n_oligo,n_sifilis,n_hiv,n_covid,n_dheg,n_dm,Feminino,sr
2,16.0,n_fumo,n_alcool,n_psico,n_tpp,n_dpp,n_oligo,n_sifilis,n_hiv,n_covid,s_dheg,d_dm,Masculino,sr
3,40.0,n_fumo,n_alcool,n_psico,n_tpp,n_dpp,d_oligo,n_sifilis,n_hiv,n_covid,s_dheg,n_dm,Feminino,nr
4,24.0,n_fumo,n_alcool,n_psico,n_tpp,s_dpp,d_oligo,s_sifilis,n_hiv,n_covid,s_dheg,n_dm,Masculino,nr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,35.0,n_fumo,n_alcool,n_psico,n_tpp,n_dpp,n_oligo,n_sifilis,n_hiv,n_covid,n_dheg,n_dm,Masculino,sr
499,29.0,n_fumo,n_alcool,n_psico,n_tpp,n_dpp,n_oligo,n_sifilis,n_hiv,n_covid,n_dheg,n_dm,Feminino,sr
500,31.0,n_fumo,n_alcool,n_psico,n_tpp,n_dpp,n_oligo,n_sifilis,d_hiv,n_covid,d_dheg,d_dm,Masculino,sr
501,27.0,s_fumo,n_alcool,n_psico,n_tpp,n_dpp,n_oligo,n_sifilis,n_hiv,n_covid,n_dheg,n_dm,Masculino,nr


## 4. Preparação do *Dataset*

### 4.1. Remoção de Outliers

In [8]:
logger.info("Remoção de outliers")

# Variável temporária
x = df_total['idade_materna'].copy()

# Redimensiona variável para adequar ao procedimento - técnica sugerida pela mensagem de erro
x = x.values.reshape(-1, 1)

# Identifica e prevê outliers em um único passo
lof = LocalOutlierFactor()
outlier = lof.fit_predict(x)
mask = outlier != -1

22-07-2022 13:40:22 Remoção de outliers


In [9]:
logger.info("x_train shape [original]: {}".format(df_total.shape))
logger.info("x_train shape [outlier removal]: {}".format(df_total.loc[mask,:].shape))

22-07-2022 13:40:24 x_train shape [original]: (503, 14)
22-07-2022 13:40:24 x_train shape [outlier removal]: (427, 14)


In [10]:
df_total = df_total.loc[mask,:].copy()


In [33]:
df_total['reanimacao'].unique()

array(['nr', 'sr'], dtype=object)

### 4.2.  Codificação da Variável *Target*

In [34]:
logger.info("Codificando variável target")
# Objeto codificador
le = LabelEncoder()

# Treinamento e transformação do conjunto de treinamento
df_total['reanimacao'] = le.fit_transform(df_total['reanimacao'])

logger.info("Classes [0, 1]: {}".format(le.inverse_transform([0, 1])))

22-07-2022 13:42:25 Codificando variável target
22-07-2022 13:42:25 Classes [0, 1]: ['nr' 'sr']


In [35]:
df_total['reanimacao'].unique()

array([0, 1])

Retirada da variável *target* das *features*

In [36]:
df_target = df_total['reanimacao']
df_features = df_total.drop(columns=['reanimacao'], axis=1)

In [37]:
df_features.shape

(503, 13)

In [38]:
df_target

0      0
1      1
2      1
3      0
4      0
      ..
498    1
499    1
500    1
501    0
502    1
Name: reanimacao, Length: 503, dtype: int64

## 5. Construção do *Pipeline*

### 5.1. Criação da Classe **FeatureSelection**

In [39]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    # Construtor
    def __init__(self, feature_names):
        self.feature_names = feature_names

    # Override de fit
    def fit(self, X, y=None):
        return self

    # Retorna as colunas passadas no construtor
    def transform(self, X, y=None):
        return X[self.feature_names]

### 5.2. Criação da Classe **CategoricalTransformer**

In [40]:
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    # Construtor
    def __init__(self, new_features=True, colnames=None):
        self.new_features = new_features
        self.colnames = colnames

    # Override de fit
    def fit(self, X, y=None):
        return self

    # Override de get_feature_names_out
    def get_feature_names_out(self):
        return self.colnames.tolist()

    # Transformer method we wrote for this transformer
    def transform(self, X, y=None):
        df = pd.DataFrame(X, columns=self.colnames)

        # Remove eventuais espaços em branco dos valores - a princípio isso não existe na base de dados
        df = df.apply(lambda row: row.str.strip())

        # Opção de fazer preprocessamento se new_features for True(padrão da classe)
        if self.new_features:
            df['fumo'].fillna(2, inplace=True)
            df['alcool'].fillna(2, inplace=True)
            df['psicoativas'].fillna(2, inplace=True)
            df['tpp'].fillna(2, inplace=True)
            df['dheg'].fillna(2, inplace=True)
            df['dm'].fillna(2, inplace=True)
            df['sexo'].fillna(3, inplace=True)
            df['oligoamnio'].fillna(2, inplace=True)
            df['dpp'].fillna(2, inplace=True)
            df['sifilis'].fillna(2, inplace=True)
            df['hiv'].fillna(2, inplace=True)
            df['covid_mae'].fillna(2, inplace=True)

            # Elimina valores com sexo indefinido
            df.drop(df[df['sexo'] == 3].index, inplace=True)

            # Categoriza as features pois o artefato retorna sempre valores numéricos
            df['fumo'].replace([0, 1, 2, 3], ['n_fumo', 's_fumo', 'n_fumo', 'd_fumo'], inplace=True)
            df['alcool'].replace([0, 1, 2, 3], ['n_alcool', 's_alcool', 'n_alcool', 'd_alcool'], inplace=True)
            df['psicoativas'].replace([0, 1, 2, 3], ['n_psico', 's_psico', 'n_psico', 'd_psico'], inplace=True)
            df['tpp'].replace([0, 1, 2, 3], ['n_tpp', 's_tpp', 'n_tpp', 'd_tpp'], inplace=True)
            df['dheg'].replace([0, 1, 2, 3], ['n_dheg', 's_dheg', 'n_dheg', 'd_dheg'], inplace=True)

            # Melhor adequação da feature sexo
            df['sexo'].replace([1,2], ['Feminino', 'Masculino'], inplace=True)
            df['dpp'].replace([0, 1, 2, 3], ['n_dpp', 's_dpp', 'n_dpp', 'd_dpp'], inplace=True)
            df['oligoamnio'].replace([0, 1, 2, 3], ['n_oligo', 's_oligo', 'n_oligo', 'd_oligo'], inplace=True)
            df['sifilis'].replace([0, 1, 2, 3], ['n_sifilis', 's_sifilis', 'n_sifilis', 'd_sifilis'], inplace=True)
            df['hiv'].replace([0, 1, 2, 3], ['n_hiv', 's_hiv', 'n_hiv', 'd_hiv'], inplace=True)
            df['covid_mae'].replace([0, 1, 2, 3], ['n_covid', 's_covid', 'n_covid', 'd_covid'], inplace=True)

            # Consolidações
            df['dm'].replace([0,1,2,3,4,5,6], ['n_dm', 'n_dm','s_dm','s_dm', 's_dm','s_dm','d_dm'], inplace=True)

        self.colnames = df.columns

        return df

### 5.3. Criação da Classe **NumericalTransformer**

In [41]:
class NumericalTransformer(BaseEstimator, TransformerMixin):
    # Tipos de scalers
    # model 0: minmax
    # model 1: standard
    # model 2: without scaler
    def __init__(self, model=0, colnames=None):
        self.model = model
        self.colnames = colnames
        self.scaler = None

    def fit(self, X, y=None):
        df = pd.DataFrame(X, columns=self.colnames)
        # minmax
        if self.model == 0:
            self.scaler = MinMaxScaler()
            self.scaler.fit(df)
        # standard scaler
        elif self.model == 1:
            self.scaler = StandardScaler()
            self.scaler.fit(df)
        return self

    def get_feature_names_out(self):
        return self.colnames

    def transform(self, X, y=None):
        df = pd.DataFrame(X, columns=self.colnames)

        # update columns name
        self.colnames = df.columns.tolist()

        # minmax
        if self.model == 0:
            # transform data
            df = self.scaler.transform(df)
        elif self.model == 1:
            # transform data
            df = self.scaler.transform(df)
        else:
            df = df.values

        return df

In [42]:
fs = FeatureSelector(df_total.select_dtypes("float64").columns.to_list())
df = fs.fit_transform(df_total)
df

Unnamed: 0,idade_materna
0,20.0
1,21.0
2,16.0
3,40.0
4,24.0
...,...
498,35.0
499,29.0
500,31.0
501,27.0


In [43]:
nt = NumericalTransformer(model=1, colnames=df.columns.to_list())
df_nt = nt.fit_transform(df)
df_nt

array([[-1.09089501],
       [-0.95288692],
       [-1.6429274 ],
       [ 1.66926691],
       [-0.53886263],
       [-1.22890311],
       [ 0.56520214],
       [-0.53886263],
       [-0.95288692],
       [-0.40085453],
       [ 1.11723453],
       [-0.53886263],
       [-0.95288692],
       [ 0.70321024],
       [-1.3669112 ],
       [ 1.11723453],
       [-1.3669112 ],
       [ 1.80727501],
       [ 0.01316976],
       [ 0.28918595],
       [-0.53886263],
       [-1.3669112 ],
       [ 0.56520214],
       [-0.12483834],
       [ 0.42719405],
       [ 1.53125882],
       [-0.12483834],
       [ 0.56520214],
       [ 0.56520214],
       [-1.6429274 ],
       [-0.40085453],
       [-0.26284643],
       [-0.26284643],
       [ 0.15117785],
       [ 0.28918595],
       [-0.95288692],
       [ 0.15117785],
       [-0.40085453],
       [-1.78093549],
       [ 0.84121834],
       [-1.3669112 ],
       [ 0.01316976],
       [-1.3669112 ],
       [-1.09089501],
       [ 1.53125882],
       [ 0

### 6.4. Encadeamento

In [44]:
# Tipos de scalers = 0 (min-max), 1 (z-score), 2 (without normalization)
numerical_model = 0

# Features categóricas
categorical_features = df_features.select_dtypes("object").columns.to_list()

# Features numéricas
numerical_features = df_features.select_dtypes("float64").columns.to_list()

# A parte de preenchimento de dados faltantes já é realizada pela classe CategoricalTransformer
categorical_pipeline = Pipeline(steps=[('cat_selector', FeatureSelector(categorical_features)),
                                       ('cat_transformer', CategoricalTransformer(colnames=categorical_features)),
                                       ('cat_encoder', OneHotEncoder(sparse=False, drop="first"))
                                       ]
                                )

# A parte de preenchimento de dados faltantes já é realizada pela classe NumericalTransformer
numerical_pipeline = Pipeline(steps=[('num_selector', FeatureSelector(numerical_features)),
                                     ('num_transformer', NumericalTransformer(numerical_model,
                                                                              colnames=numerical_features))])

# Reune os Pipelines num só
full_pipeline_preprocessing = FeatureUnion(transformer_list=[('cat_pipeline', categorical_pipeline),
                                                             ('num_pipeline', numerical_pipeline)])

In [45]:
features_set = full_pipeline_preprocessing.fit_transform(df_features)

In [46]:
num_names = full_pipeline_preprocessing.get_params()['num_pipeline'][1].get_feature_names_out()
cat_names = list(full_pipeline_preprocessing.get_params()['cat_pipeline'][2].get_feature_names_out())
df_train_set = pd.DataFrame(features_set, columns=cat_names+num_names)
df_train_set.head()

Unnamed: 0,fumo_n_fumo,fumo_s_fumo,alcool_n_alcool,alcool_s_alcool,psicoativas_n_psico,psicoativas_s_psico,tpp_n_tpp,tpp_s_tpp,dpp_n_dpp,dpp_s_dpp,...,hiv_n_hiv,hiv_s_hiv,covid_mae_n_covid,covid_mae_s_covid,dheg_n_dheg,dheg_s_dheg,dm_n_dm,dm_s_dm,sexo_Masculino,idade_materna
0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.242424
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.272727
2,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.121212
3,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.848485
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.363636


In [47]:
df_train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   fumo_n_fumo          503 non-null    float64
 1   fumo_s_fumo          503 non-null    float64
 2   alcool_n_alcool      503 non-null    float64
 3   alcool_s_alcool      503 non-null    float64
 4   psicoativas_n_psico  503 non-null    float64
 5   psicoativas_s_psico  503 non-null    float64
 6   tpp_n_tpp            503 non-null    float64
 7   tpp_s_tpp            503 non-null    float64
 8   dpp_n_dpp            503 non-null    float64
 9   dpp_s_dpp            503 non-null    float64
 10  oligoamnio_n_oligo   503 non-null    float64
 11  oligoamnio_s_oligo   503 non-null    float64
 12  sifilis_n_sifilis    503 non-null    float64
 13  sifilis_s_sifilis    503 non-null    float64
 14  hiv_n_hiv            503 non-null    float64
 15  hiv_s_hiv            503 non-null    flo

## 7. *Upload* de Artefato Final

Concatenação de *datasets*

In [48]:
df_target_set = pd.DataFrame(df_target, columns=['reanimacao'])

In [49]:
df_target_set.reset_index(drop=True, inplace=True)

In [50]:
df_target_set

Unnamed: 0,reanimacao
0,0
1,1
2,1
3,0
4,0
...,...
498,1
499,1
500,1
501,0


In [51]:
df_final = pd.concat([df_train_set, df_target_set], axis=1)

In [52]:
df_final['reanimacao'].unique()

array([0, 1])

In [53]:
df_final.to_csv('df_rean.csv')

In [54]:
!wandb artifact put --name mlprojfinal/df_rean.csv --type rean_joined --description "Full coded dataset" df_rean.csv

[34m[1mwandb[0m: Uploading file df_rean.csv to: "ecalasans/mlprojfinal/df_rean.csv:latest" (rean_joined)
[34m[1mwandb[0m: Currently logged in as: [33mecalasans[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.12.21
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/Users/ericcalasans/Documents/Projetos/mlprojfinal/wandb/run-20220722_134304-2l5ossn3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfirm-dew-28[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ecalasans/mlprojfinal[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ecalasans/mlprojfinal/runs/2l5ossn3[0m
Artifact uploaded, use this artifact in a run by adding:

    artifact = run.use_artifact("ecalasans/mlprojfinal/df_rean.csv:latest")

[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m:          

In [55]:
run.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…