### Carregamento das bibliotecas

In [1]:
# Manipulacao de dados
import numpy as np
import pandas as pd

# Visualizacao de dados
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt

# Machine learning
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#crod utils
from crod_utils import *

# cross validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_validate, KFold
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Selecao de caracteristicas e encoders
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Metricas de avaliacao
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score, average_precision_score, plot_roc_curve

# remover warning
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("files/covid_2021_cleaned.csv", sep=';')
df_ = pd.read_csv("files/covid_2021_cleaned_raca.csv", sep=';')

In [3]:
df.head()

Unnamed: 0,SEXO,FAIXAETARIA,EVOLUCAO,FEBRE,TOSSE,GARGANTA,DISPNEIA,GESTANTE,SRAG,CARDIOPATIA,DIABETES,DOENCA_RESPIRATORIA,PROBLEMA_RENAL,OBESIDADE,DOENCA_CROMOSSOMICA
0,Masculino,50 a 59,0,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NÃO,NAO,NAO
1,Masculino,<1,0,SIM,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NÃO,NAO,NAO
2,Masculino,30 a 39,0,NAO,SIM,SIM,NAO,NAO,NAO,NAO,NAO,NAO,NÃO,NAO,NAO
3,Masculino,20 a 29,0,NAO,SIM,NAO,SIM,NAO,NAO,NAO,NAO,NAO,NÃO,NAO,NAO
4,Feminino,50 a 59,0,NAO,SIM,NAO,SIM,NAO,SIM,NAO,NAO,NAO,NÃO,NAO,NAO


In [4]:
print_df_dimensions(df)

DIMENSÕES DO DATAFRAME:
Linhas:		584228
Colunas:	15


In [5]:
# raca
print_df_dimensions(df_)

DIMENSÕES DO DATAFRAME:
Linhas:		467476
Colunas:	16


In [6]:
df.describe(include='O')

Unnamed: 0,SEXO,FAIXAETARIA,FEBRE,TOSSE,GARGANTA,DISPNEIA,GESTANTE,SRAG,CARDIOPATIA,DIABETES,DOENCA_RESPIRATORIA,PROBLEMA_RENAL,OBESIDADE,DOENCA_CROMOSSOMICA
count,584228,584228,584228,584228,584228,584228,584228,584228,584228,584228,584228,584228,584228,584228
unique,2,12,2,2,2,2,2,2,2,2,2,2,2,2
top,Feminino,30 a 39,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NAO,NÃO,NAO,NAO
freq,310263,121926,385871,313891,372931,494622,581071,532537,567247,561601,580662,581974,573373,584087


In [7]:
df[df['EVOLUCAO'] == 1].describe(include='O')

Unnamed: 0,SEXO,FAIXAETARIA,FEBRE,TOSSE,GARGANTA,DISPNEIA,GESTANTE,SRAG,CARDIOPATIA,DIABETES,DOENCA_RESPIRATORIA,PROBLEMA_RENAL,OBESIDADE,DOENCA_CROMOSSOMICA
count,18579,18579,18579,18579,18579,18579,18579,18579,18579,18579,18579,18579,18579,18579
unique,2,12,2,2,2,2,2,1,2,2,2,2,2,2
top,Masculino,60 a 69,NAO,SIM,NAO,SIM,NAO,SIM,NAO,NAO,NAO,NÃO,NAO,NAO
freq,9889,4799,9670,10873,15627,15676,18532,18579,10851,12986,17027,17718,15711,18521


---

# III. Teste e validação dos modelos

## 1.  *Feature Encoding*

Como modelos de *machine learning* performam algebra linear em matrizes, precisamos que todas as variáveis estejam como valores numericos. O processo de conversão de variaveis categoricas em numéricas é chamado de *Encoding*. Existem algumas abordagens para converter os dados categóricos em numéricos, para que o modelo possa entendê-los. Veremos os dois métodos: **Label encoding** e **One-Hot encoding**.

- Label Encoding: Recomendado quando lidamos com variáveis categóricas ordinais: Consiste em *designar um valor inteiro* (0, 1, 2, ...) oara cada valor unico da coluna.
- One-hot encoding: Consiste em *criar uma nova coluna* para cada valor único de uma coluna existente.

Material complementar:
- [Um guia (simples) de como tratar variáveis categóricas em Machine Learning](https://medium.com/@nelson.ewert.oliveira/um-guia-simples-de-como-tratar-vari%C3%A1veis-categ%C3%B3ricas-em-machine-learning-b791a00da805)
- [One-Hot Encoding vs. Label Encoding using Scikit-Learn](https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/)
- [Guide to Encoding Categorical Values in Python](https://pbpython.com/categorical-encoding.html)

Para nosso contexto, se utilizássemos o **One-hot encoding**, aplicaria apenas na 'FAIXAETARIA', pois é a única que possui mais de 2 valores possíveis. As demais variáveis são dicotômicas, e por isso, caso utilizássemos o one-hot, na prática este seria o **label encoding**. Então, vamos utilizar diretamente este último.

In [8]:
df['FAIXAETARIA'].value_counts()

30 a 39      121926
20 a 29      108501
40 a 49      104102
50 a 59       88424
60 a 69       57058
15 a 19       30292
70 a 79       27233
10 a 14       13395
80 e mais     12490
05 a 09        9267
01 a 04        7407
<1             4133
Name: FAIXAETARIA, dtype: int64

In [9]:
# label encoding
df['FAIXAETARIA'] = df['FAIXAETARIA'].map({'<1': 0, 
                       '01 a 04': 1, 
                       '05 a 09': 2,
                       '10 a 14': 3, 
                       '15 a 19': 4,
                       '20 a 29': 5, 
                       '30 a 39': 6, 
                       '40 a 49': 7, 
                       '50 a 59': 8, 
                       '60 a 69': 9,
                       '70 a 79': 10, 
                       '80 e mais': 11})

In [10]:
# label encoding raça
df_['FAIXAETARIA'] = df_ ['FAIXAETARIA'].map({'<1': 0, 
                       '01 a 04': 1, 
                       '05 a 09': 2,
                       '10 a 14': 3, 
                       '15 a 19': 4,
                       '20 a 29': 5, 
                       '30 a 39': 6, 
                       '40 a 49': 7, 
                       '50 a 59': 8, 
                       '60 a 69': 9,
                       '70 a 79': 10, 
                       '80 e mais': 11})

In [11]:
df['FAIXAETARIA'].value_counts()

6     121926
5     108501
7     104102
8      88424
9      57058
4      30292
10     27233
3      13395
11     12490
2       9267
1       7407
0       4133
Name: FAIXAETARIA, dtype: int64

In [12]:
# LabelEncoder
le = LabelEncoder()

# apply "le.fit_transform"
df = df.apply(le.fit_transform)
df

Unnamed: 0,SEXO,FAIXAETARIA,EVOLUCAO,FEBRE,TOSSE,GARGANTA,DISPNEIA,GESTANTE,SRAG,CARDIOPATIA,DIABETES,DOENCA_RESPIRATORIA,PROBLEMA_RENAL,OBESIDADE,DOENCA_CROMOSSOMICA
0,1,8,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,6,0,0,1,1,0,0,0,0,0,0,0,0,0
3,1,5,0,0,1,0,1,0,0,0,0,0,0,0,0
4,0,8,0,0,1,0,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
584223,0,9,1,0,1,0,1,0,1,1,0,0,0,0,0
584224,0,10,0,0,0,0,1,0,1,0,0,0,0,0,0
584225,0,10,0,0,0,0,1,0,1,1,0,0,0,0,0
584226,1,8,0,0,1,0,0,0,0,0,0,0,0,0,0


In [13]:
# raça
# apply "le.fit_transform"
df_ = df_.apply(le.fit_transform)

In [14]:
# salvar no disco
df.to_csv("files/covid_2021_prepared.csv", sep=';', index=False)
df_.to_csv("files/covid_2021_prepared_raça.csv", sep=";", index=False)

## 2. Separação X y

In [15]:
# sem raca
X_val = df.drop(['EVOLUCAO'], axis=1)
y_val = df['EVOLUCAO']

# com raca/cor
X_val_ = df_.drop(['EVOLUCAO'], axis=1)
y_val_ = df_['EVOLUCAO']

# Validacao
X, X_test_val, y, y_test_val = train_test_split(X_val, y_val, stratify=y_val, test_size=0.3, random_state=777)
X_, X_test_val_, y_, y_test_val_ = train_test_split(X_val_, y_val_, stratify=y_val_, test_size=0.3, random_state=777)

# treino e teste
y.reset_index(drop=True, inplace=True)
X.reset_index(drop=True, inplace=True)
y_.reset_index(drop=True, inplace=True)
X_.reset_index(drop=True, inplace=True)

In [16]:
y.value_counts()

0    395954
1     13005
Name: EVOLUCAO, dtype: int64

In [17]:
y_.value_counts()

0    321850
1      5383
Name: EVOLUCAO, dtype: int64

## 3. Treino dos modelos

Material complementar:
- [How To Choose The Right Test Options When Evaluating Machine Learning Algorithms](https://machinelearningmastery.com/how-to-choose-the-right-test-options-when-evaluating-machine-learning-algorithms/)
- [Why you should be Spot-Checking Algorithms on your Machine Learning Problems](https://machinelearningmastery.com/why-you-should-be-spot-checking-algorithms-on-your-machine-learning-problems/)
- [A Data-Driven Approach to Choosing Machine Learning Algorithms](https://machinelearningmastery.com/a-data-driven-approach-to-machine-learning/)
- [Cross-validation: evaluating estimator performance](https://scikit-learn.org/stable/modules/cross_validation.html)

In [18]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import confusion_matrix

def print_metric(results, name):
    media = np.mean(results)
    desvio_padrao = np.std(results)
    print(f"{name} ", end='')
    print("médio: %.3f | intervalo: [%.3f, %.3f]" % ((media), (media - 2 * desvio_padrao), (media + 2 * desvio_padrao)))

# Dados padrão

In [19]:
# initial parameters
SEED = 777
K_FOLD = 5


# definicao dos modelos
models = [
          RandomForestClassifier(random_state=SEED), 
          ]

# nome dos modelos
verbose_models = ['Floresta aleatória']

# figuras para plots
# plt.rcParams.update({'font.size': 12})
# fig1, ax1 = plt.subplots(figsize=(12, 6))
# fig2, ax2 = plt.subplots(figsize=(12, 6))

for name, model in zip(verbose_models, models):
    # kfold
    kf = StratifiedKFold(n_splits=K_FOLD, shuffle=True, random_state=SEED)

    print("===============")
    print(name + " - Sem raça/cor")
    # percorre cada fold
    auc_roc = []
    precision = []
    recall = []
    f1score = []
    
    for train_index, test_index in kf.split(X,y):
        # divisao treino e teste
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # treino e predicao
        y_pred = model.fit(X_train, y_train).predict(X_test)

        # computacao da metrica
        auc_roc.append(roc_auc_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1score.append(f1_score(y_test, y_pred))

    print_metric(auc_roc, 'AUC-ROC')
    print_metric(precision, 'PRECISÃO')
    print_metric(recall, 'REVOCAÇÃO')
    print_metric(f1score, 'F1 SCORE')
    print(confusion_matrix(y_test, y_pred))
    # plot_roc_curve(model, X_test, y_test, ax=ax1, name=name)
    # plot_precision_recall_curve(model, X_test, y_test, ax=ax2, name=name)
    print("===============")

    # kfold
    kf = StratifiedKFold(n_splits=K_FOLD, shuffle=True, random_state=SEED)
    print("===============")
    name = name + " - Com raça/cor"
    print(name)
    # percorre cada fold
    auc_roc = []
    precision = []
    recall = []
    f1score = []
    
    for train_index, test_index in kf.split(X_,y_):
        # divisao treino e teste
        X_train, X_test = X_.loc[train_index], X_.loc[test_index]
        y_train, y_test = y_[train_index], y_[test_index]

        # treino e predicao
        y_pred = model.fit(X_train, y_train).predict(X_test)

        # computacao da metrica
        auc_roc.append(roc_auc_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1score.append(f1_score(y_test, y_pred))

    print_metric(auc_roc, 'AUC-ROC')
    print_metric(precision, 'PRECISÃO')
    print_metric(recall, 'REVOCAÇÃO')
    print_metric(f1score, 'F1 SCORE')
    print(confusion_matrix(y_test, y_pred))
    # plot_roc_curve(model, X_test, y_test, ax=ax1, name=name)
    # plot_precision_recall_curve(model, X_test, y_test, ax=ax2, name=name)
    print("===============")

#_ = ax1.set_title('Curva ROC')
#_ = ax2.set_title('Curva Precisão-Sensibilidade')
#plt.show()     

Floresta aleatória - Sem raça/cor
AUC-ROC médio: 0.705 | intervalo: [0.694, 0.716]
PRECISÃO médio: 0.558 | intervalo: [0.543, 0.572]
REVOCAÇÃO médio: 0.422 | intervalo: [0.399, 0.444]
F1 SCORE médio: 0.480 | intervalo: [0.465, 0.495]
[[78269   921]
 [ 1461  1140]]
Floresta aleatória - Com raça/cor
AUC-ROC médio: 0.539 | intervalo: [0.528, 0.550]
PRECISÃO médio: 0.346 | intervalo: [0.302, 0.391]
REVOCAÇÃO médio: 0.081 | intervalo: [0.059, 0.103]
F1 SCORE médio: 0.131 | intervalo: [0.099, 0.163]
[[64209   161]
 [  980    96]]


In [23]:
from sklearn.model_selection import cross_val_score
# balanceamento

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline 


In [27]:
print("Sem raça/cor")
rus = RandomUnderSampler(random_state=777)
# kfold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)

imba_pipeline = make_pipeline(rus, RandomForestClassifier(random_state=SEED))
score = cross_val_score(imba_pipeline, X, y, scoring='roc_auc', cv=kf)
print_metric(score, 'ROC AUC')

Sem raça/cor
ROC AUC médio: 0.979 | intervalo: [0.978, 0.979]


In [28]:
print("Com raça/cor")
rus = RandomUnderSampler(random_state=777)
# kfold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)

imba_pipeline = make_pipeline(rus, RandomForestClassifier(random_state=SEED))
score = cross_val_score(imba_pipeline, X_, y_, scoring='roc_auc', cv=kf)
print_metric(score, 'ROC AUC')

Com raça/cor
ROC AUC médio: 0.974 | intervalo: [0.973, 0.974]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ad6f6ae1-4103-4b0d-873e-7643398d0093' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>