In [1]:
import pandas as pd
import psycopg2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, multilabel_confusion_matrix
from sklearn.dummy import DummyClassifier
import numpy as np


In [29]:
# Carregando o Dataset
df = pd.read_pickle('../storage/pickle/dataframe_amostra_no_dummies0.15.pkl')
df = df.drop_duplicates()
# Tratando dado de idade
df["idade"] = pd.to_numeric(df["idade"])

In [30]:
# Conecta ao banco de dados
conn = psycopg2.connect(
    host="localhost",
    database="previsao-patologias",
    user="postgres",
    password="test"
)

cursor = conn.cursor()
query = "SELECT texto FROM tb_questoes WHERE tipo_resposta = 'binaria';"
cursor.execute(query)

# Pega resultado da query e transforma em lista para realizar iteração
tuplas_resultados = cursor.fetchall()
questoes = [item for tupla in tuplas_resultados for item in tupla]

In [31]:
questoes_binarias = []
for c in questoes:
    nome_coluna_tratado = c.replace(" ", "_").replace(".", "").replace("?", "").lower()
    questoes_binarias.append(nome_coluna_tratado)
    
colunas_dummies = [col for col in df.columns if col not in ['idade', 'patologia', 'severidade'] + questoes_binarias]

df_dummies = pd.get_dummies(df, columns=colunas_dummies, dtype=int)

Separando variáveis dependentes e de testes e criando conjunto de treino e testes

In [32]:
variaveis_dependentes = df_dummies.drop(columns=['patologia', 'severidade'])
variavel_target = df_dummies['patologia']

In [33]:
# Splitando conjuntos de treino e testes
X_train, X_test, y_train, y_test = train_test_split(variaveis_dependentes, variavel_target, test_size=0.2, random_state=42)

In [8]:
# Criar um modelo de regressão logística multinomial
random_forest = RandomForestClassifier(
    n_estimators=50,  # Reduza o número de árvores
    max_depth=5,  # Reduza a profundidade máxima das árvores
    min_samples_split=5,  # Aumente o número mínimo de amostras necessárias para dividir um nó
    random_state=42
)

Realizando Fit das Variáveis e Predição das Observações

In [35]:
# Realizando o fit das variáveis
random_forest.fit(X_train, y_train)

# Fazer previsões
y_pred = random_forest.predict(X_test)

Utilizando técnica de acurácia comum

In [40]:
# Avaliar o desempenho do modelo
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)
matriz_confusao = confusion_matrix(y_test, y_pred, labels=random_forest.classes_)
f1 = f1_score(y_test, y_pred, average='macro')

# Calcular sensibilidade (recall) e especificidade para cada classe
sensibilidade = np.diag(matriz_confusao) / np.sum(matriz_confusao, axis=1)
especificidade = np.diag(matriz_confusao) / np.sum(matriz_confusao, axis=0)

# Calcular sensibilidade e especificidade médias (macro)
sensibilidade_media = np.mean(sensibilidade)

# Exibir métricas de desempenho
print(f'Acurácia: {accuracy}')
print('Relatório de Classificação:\n', classification_report_result)
print(f"F1-Score: {f1}")
print(f'Sensibilidade (Recall) Média: {sensibilidade_media}')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Acurácia: 0.7659349173553719
Relatório de Classificação:
                                           precision    recall  f1-score   support

     acute copd exacerbation / infection       1.00      0.83      0.91       660
                acute dystonic reactions       1.00      1.00      1.00      1017
                        acute laryngitis       1.00      0.24      0.39       905
                      acute otitis media       1.00      0.17      0.29       978
                   acute pulmonary edema       1.00      1.00      1.00       699
                    acute rhinosinusitis       0.00      0.00      0.00       487
                      allergic sinusitis       0.93      1.00      0.96       879
                             anaphylaxis       1.00      1.00      1.00      1056
                                  anemia       0.94      1.00      0.97      1939
                     atrial fibrillation       1.00      1.00      1.00       807
                               boerhaav

  especificidade = np.diag(matriz_confusao) / np.sum(matriz_confusao, axis=0)


Utilizando K-fold Cross Validation

In [10]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acuaria_validacao_cruzada = cross_val_score(random_forest, variaveis_dependentes, variavel_target, cv=kfold, scoring='f1_macro')

# Exibir a acurácia média e o desvio padrão
print(f'F1-Score Médio (k-fold): {acuaria_validacao_cruzada.mean()}')
print(f'Desvio Padrão da Acurácia (k-fold): {acuaria_validacao_cruzada.std()}')

F1-Score Médio (k-fold): 0.6043395500990572
Desvio Padrão da Acurácia (k-fold): 0.013115882759319297


Estimação do Modelo com Conjunto de Dados Balanceado

In [7]:
# Carregando o Dataset
df = pd.read_pickle('../storage/pickle/dataframe_smote0.10.pkl')
df = df.drop_duplicates()
# Tratando dado de idade
df["idade"] = pd.to_numeric(df["idade"])

Unnamed: 0,idade,severidade,"are_the_symptoms_or_pain_increased_with_coughing,_with_an_effort_like_lifting_a_weight_or_from_forcing_a_bowel_movement",are_there_any_members_of_your_family_who_have_been_diagnosed_myasthenia_gravis,are_there_members_of_your_family_who_have_been_diagnosed_with_pancreatic_cancer,are_you_a_former_smoker,are_you_being_treated_for_osteoporosis,are_you_consulting_because_you_have_high_blood_pressure,are_you_currently_being_treated_or_have_you_recently_been_treated_with_an_oral_antibiotic_for_an_ear_infection,are_you_currently_taking_or_have_you_recently_taken_anti-inflammatory_drugs_(nsaids),...,where_is_the_swelling_located_nowhere,where_is_the_swelling_located_posterior aspect of the ankle(L),where_is_the_swelling_located_posterior aspect of the ankle(R),where_is_the_swelling_located_sole(L),where_is_the_swelling_located_sole(R),where_is_the_swelling_located_thigh(L),where_is_the_swelling_located_thigh(R),where_is_the_swelling_located_tibia(L),where_is_the_swelling_located_tibia(R),patologia
0,35.000000,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,urti
1,65.000000,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,panic attack
2,36.000000,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,epiglottitis
3,58.000000,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,panic attack
4,15.000000,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,acute laryngitis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199571,30.143876,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,whooping cough
199572,65.017238,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,whooping cough
199574,28.000000,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,whooping cough
199575,2.544605,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,whooping cough


In [6]:
variaveis_dependentes = df.drop(columns=['patologia', 'severidade'])
variavel_target = df['patologia']

In [8]:
# Splitando conjuntos de treino e testes
X_train, X_test, y_train, y_test = train_test_split(variaveis_dependentes, variavel_target, test_size=0.2, random_state=42)

In [9]:
# Criar um modelo de regressão logística multinomial
random_forest = RandomForestClassifier(
    n_estimators=50,  # Reduza o número de árvores
    max_depth=5,  # Reduza a profundidade máxima das árvores
    min_samples_split=5,  # Aumente o número mínimo de amostras necessárias para dividir um nó
    random_state=42
)

In [10]:
# Realizando o fit das variáveis
random_forest.fit(X_train, y_train)

# Fazer previsões
y_pred = random_forest.predict(X_test)

In [12]:
# Avaliar o desempenho do modelo
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)
matriz_confusao = confusion_matrix(y_test, y_pred, labels=random_forest.classes_)
f1 = f1_score(y_test, y_pred, average='macro')

# Calcular sensibilidade (recall) e especificidade para cada classe
sensibilidade = np.diag(matriz_confusao) / np.sum(matriz_confusao, axis=1)

# Calcular sensibilidade e especificidade médias (macro)
sensibilidade_media = np.mean(sensibilidade)

# Exibir métricas de desempenho
print(f'Acurácia: {accuracy}')
print('Relatório de Classificação:\n', classification_report_result)
print(f"F1-Score: {f1}")
# print(f'Sensibilidade (Recall) Média: {sensibilidade_media}')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Acurácia: 0.9136672539642587
Relatório de Classificação:
                                           precision    recall  f1-score   support

     acute copd exacerbation / infection       1.00      0.99      0.99       803
                acute dystonic reactions       1.00      0.98      0.99       781
                        acute laryngitis       1.00      0.93      0.96       829
                      acute otitis media       0.00      0.00      0.00       850
                   acute pulmonary edema       1.00      1.00      1.00       796
                    acute rhinosinusitis       0.66      0.83      0.73       820
                      allergic sinusitis       1.00      1.00      1.00       789
                             anaphylaxis       1.00      1.00      1.00       851
                                  anemia       1.00      0.99      0.99       775
                     atrial fibrillation       1.00      1.00      1.00       797
                               boerhaav

: 

In [None]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acuaria_validacao_cruzada = cross_val_score(random_forest, variaveis_dependentes, variavel_target, cv=kfold, scoring='f1_macro')

# Exibir a acurácia média e o desvio padrão
print(f'F1-Score Médio (k-fold): {acuaria_validacao_cruzada.mean()}')
print(f'Desvio Padrão da Acurácia (k-fold): {acuaria_validacao_cruzada.std()}')

F1-Score Médio (k-fold): 0.9338032960468435
Desvio Padrão da Acurácia (k-fold): 0.010901193192020299
