# ROC AUC Calculator - Experts

In [None]:
import os
import sys
from dotenv import load_dotenv

# Load variables from the .env file into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(override=True)
print('ENV variables loaded successfully!')

# Put the parent directory on the import path (presumably where the
# project-local `lib` package lives -- confirm against the repo layout).
# The membership check keeps re-running this cell from adding duplicates.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from operator import itemgetter
# NOTE(review): `plot_roc_curve` is rebound by the local `def plot_roc_curve`
# in a later cell, so the sklearn version is never called here. It was also
# removed from sklearn.metrics in scikit-learn 1.2, where this line raises
# ImportError -- confirm the pinned version or drop the name from this import.
from sklearn.metrics import plot_roc_curve, roc_auc_score, roc_curve

from lib.env_var_keys import EnvVarKeys
from lib.dataframe_helper import vdrl_count

# The CSV path comes from the environment variable named by
# EnvVarKeys.PRE_PROCESSED_DATASET_PATH_KEY; os.getenv returns None if it is unset.
pre_processed_dataset_path = os.getenv(EnvVarKeys.PRE_PROCESSED_DATASET_PATH_KEY.value)
df = pd.read_csv(pre_processed_dataset_path, sep=',', low_memory=False)

print(f'Pre-processed dataset shape: {df.shape}')

vdrl_count(df)

# Feature columns passed to the IDS (Exp 1) and BDS (Exp 2) data-preparation helpers.
# NOTE: cells in the IODDS (Exp 5) section rebind this name to other column lists;
# re-run this cell before re-running the Exp 1 / Exp 2 cells.
cols_to_keep = [
  'mc_get_risco_gestacional',
  'mc_mul_qtd_aborto',
  'mc_get_gravidez_planejada',
  'mc_mul_est_civil',
  'mc_mul_nivel_inseguranca',
  'mc_mul_qtd_filhos_vivos',
  'mc_mul_qtd_gest',
  'mc_mul_rec_inf_plan_fam',
  'mc_dae_escolaridade',
  'mc_dae_rfa',
  'idade',
]

In [None]:
def plot_roc_curve(tpr, fpr, label, scatter = True):
    """Draw one ROC curve on a new 8x8 figure.

    Mind the argument order: true-positive rates come first, false-positive
    rates second. Defining this function also rebinds the name
    `plot_roc_curve` imported from `sklearn.metrics` earlier in the notebook.

    Args:
        tpr: y-values of the curve (true positive rates).
        fpr: x-values of the curve (false positive rates).
        label: legend entry for the curve.
        scatter: when true, the individual points are also drawn as a scatter plot.
    """
    plt.figure(figsize=(8, 8))
    if scatter:
        sns.scatterplot(x=fpr, y=tpr)
    sns.lineplot(x=fpr, y=tpr, label=label)

    # Gray reference diagonal from (0, 0) to (1, 1). It is the second line
    # drawn on the axes, hence index 1.
    axes = sns.lineplot(x=[0, 1], y=[0, 1], color='gray', label='Threshold')
    diagonal = axes.lines[1]
    diagonal.set_linestyle('--')

    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    # A small margin keeps points at exactly 0 or 1 off the frame.
    plt.xlim(-0.05, 1.05)
    plt.ylim(-0.05, 1.05)
    plt.legend(loc="lower right")

## IDS - Exp 1:

### Decision tree:

In [None]:
from sklearn.tree import DecisionTreeClassifier
from lib.experiment_helper import prepare_data_exp_ids, print_metrics

# Prepare the IDS experiment data. The fifth return value is not used, so it is
# discarded as `_`, the same way the other model cells do.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_ids(df, cols_to_keep)

# No random_state is set and splitter='random' is used, so results vary between runs.
clf_ids_decision_tree = DecisionTreeClassifier(criterion='entropy', splitter='random')
clf_ids_decision_tree.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the
# score used for the ROC curve and the AUC.
y_pred_ids_decision_tree = clf_ids_decision_tree.predict(X_test)
y_pred_proba_ids_decision_tree = clf_ids_decision_tree.predict_proba(X_test)
fpr_ids_decision_tree, tpr_ids_decision_tree, _ = roc_curve(y_test, y_pred_proba_ids_decision_tree[:, 1])

roc_score_decision_tree = roc_auc_score(y_test, y_pred_proba_ids_decision_tree[:, 1])
print(f"\nROC AUC: {roc_score_decision_tree:.4f}")

print_metrics(y_test, y_pred_ids_decision_tree)

# fpr_/tpr_ids_decision_tree are reused by the combined "All" plot of this section.
plot_roc_curve(tpr_ids_decision_tree, fpr_ids_decision_tree, 'Decision Tree', False)

### Random forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lib.experiment_helper import prepare_data_exp_ids, print_metrics

# Prepare the IDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_ids(df, cols_to_keep)

clf_ids_rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=50,
)
clf_ids_rf.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_ids_rf = clf_ids_rf.predict(X_test)
y_pred_proba_ids_rf = clf_ids_rf.predict_proba(X_test)
fpr_ids_rf, tpr_ids_rf, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_ids_rf[:, 1])

roc_score_rf = roc_auc_score(y_true=y_test, y_score=y_pred_proba_ids_rf[:, 1])
print(f"\nROC AUC: {roc_score_rf:.4f}")

print_metrics(y_test, y_pred_ids_rf)

# fpr_ids_rf / tpr_ids_rf are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_ids_rf, fpr=fpr_ids_rf, label='Random Forest', scatter=False)

### AdaBoost:

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from lib.experiment_helper import prepare_data_exp_ids, print_metrics

# Prepare the IDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_ids(df, cols_to_keep)

clf_ids_ada = AdaBoostClassifier(
    learning_rate=0.5,
    n_estimators=100,
)
clf_ids_ada.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_ids_ada = clf_ids_ada.predict(X_test)
y_pred_proba_ids_ada = clf_ids_ada.predict_proba(X_test)
fpr_ids_ada, tpr_ids_ada, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_ids_ada[:, 1])

roc_score_ada = roc_auc_score(y_true=y_test, y_score=y_pred_proba_ids_ada[:, 1])
print(f"\nROC AUC: {roc_score_ada:.4f}")

print_metrics(y_test, y_pred_ids_ada)

# fpr_ids_ada / tpr_ids_ada are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_ids_ada, fpr=fpr_ids_ada, label='AdaBoost', scatter=False)

### GBM:

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from lib.experiment_helper import prepare_data_exp_ids, print_metrics

# Prepare the IDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_ids(df, cols_to_keep)

clf_ids_gbm = GradientBoostingClassifier(
    learning_rate=0.5,
    loss='exponential',
    n_estimators=50,
)
clf_ids_gbm.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_ids_gbm = clf_ids_gbm.predict(X_test)
y_pred_proba_ids_gbm = clf_ids_gbm.predict_proba(X_test)
fpr_ids_gbm, tpr_ids_gbm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_ids_gbm[:, 1])

roc_score_gbm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_ids_gbm[:, 1])
print(f"\nROC AUC: {roc_score_gbm:.4f}")

print_metrics(y_test, y_pred_ids_gbm)

# fpr_ids_gbm / tpr_ids_gbm are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_ids_gbm, fpr=fpr_ids_gbm, label='GBM', scatter=False)

### XGBoost:

In [None]:
from xgboost import XGBClassifier
from lib.experiment_helper import prepare_data_exp_ids, print_metrics

# Prepare the IDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_ids(df, cols_to_keep)

clf_ids_xgb = XGBClassifier(
    learning_rate=0.3,
    max_depth=5,
)
clf_ids_xgb.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_ids_xgb = clf_ids_xgb.predict(X_test)
y_pred_proba_ids_xgb = clf_ids_xgb.predict_proba(X_test)
fpr_ids_xgb, tpr_ids_xgb, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_ids_xgb[:, 1])

roc_score_xgb = roc_auc_score(y_true=y_test, y_score=y_pred_proba_ids_xgb[:, 1])
print(f"\nROC AUC: {roc_score_xgb:.4f}")

print_metrics(y_test, y_pred_ids_xgb)

# fpr_ids_xgb / tpr_ids_xgb are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_ids_xgb, fpr=fpr_ids_xgb, label='XGB', scatter=False)

### KNN:

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from lib.experiment_helper import prepare_data_exp_ids, print_metrics

# Prepare the IDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_ids(df, cols_to_keep)

clf_ids_knn = KNeighborsClassifier(
    n_neighbors=15,
    p=1,
    weights='uniform',
)
clf_ids_knn.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_ids_knn = clf_ids_knn.predict(X_test)
y_pred_proba_ids_knn = clf_ids_knn.predict_proba(X_test)
fpr_ids_knn, tpr_ids_knn, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_ids_knn[:, 1])

roc_score_knn = roc_auc_score(y_true=y_test, y_score=y_pred_proba_ids_knn[:, 1])
print(f"\nROC AUC: {roc_score_knn:.4f}")

print_metrics(y_test, y_pred_ids_knn)

# fpr_ids_knn / tpr_ids_knn are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_ids_knn, fpr=fpr_ids_knn, label='KNN', scatter=False)

### SVM:

In [None]:
from sklearn.svm import SVC
from lib.experiment_helper import prepare_data_exp_ids, print_metrics

# Prepare the IDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_ids(df, cols_to_keep)

# probability=True is needed for the predict_proba call below.
clf_ids_svm = SVC(
    gamma='auto',
    kernel='rbf',
    probability=True,
)
clf_ids_svm.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_ids_svm = clf_ids_svm.predict(X_test)
y_pred_proba_ids_svm = clf_ids_svm.predict_proba(X_test)
fpr_ids_svm, tpr_ids_svm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_ids_svm[:, 1])

roc_score_svm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_ids_svm[:, 1])
print(f"\nROC AUC: {roc_score_svm:.4f}")

print_metrics(y_test, y_pred_ids_svm)

# fpr_ids_svm / tpr_ids_svm are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_ids_svm, fpr=fpr_ids_svm, label='SVM', scatter=False)

### All:

In [None]:
# Overlay the ROC curves of all IDS models on a single figure.
plt.figure(figsize=(8, 8))

# Gray dashed reference diagonal; it is the first line on the axes, hence index 0.
ax = sns.lineplot(x=[0, 1], y=[0, 1], color='gray', label='Threshold')
ax.lines[0].set_linestyle('--')

# One curve per model, drawn in the same order as the model cells above.
for curve_fpr, curve_tpr, curve_label in (
    (fpr_ids_decision_tree, tpr_ids_decision_tree, 'Decision Tree'),
    (fpr_ids_rf, tpr_ids_rf, 'Random Forest'),
    (fpr_ids_ada, tpr_ids_ada, 'AdaBoost'),
    (fpr_ids_gbm, tpr_ids_gbm, 'GBM'),
    (fpr_ids_xgb, tpr_ids_xgb, 'XGB'),
    (fpr_ids_knn, tpr_ids_knn, 'KNN'),
    (fpr_ids_svm, tpr_ids_svm, 'SVM'),
):
    sns.lineplot(x=curve_fpr, y=curve_tpr, label=curve_label)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.legend(loc="lower right")

## BDS - Exp 2:

### Decision tree:

In [None]:
from sklearn.tree import DecisionTreeClassifier
from lib.experiment_helper import prepare_data_exp_bds, print_metrics

# Prepare the BDS experiment data. The fifth return value is not used, so it is
# discarded as `_`, the same way the other model cells do.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bds(df, cols_to_keep)

# No random_state is set, so tie-breaking between equally good splits may vary between runs.
clf_bds_decision_tree = DecisionTreeClassifier(criterion='gini', splitter='best')
clf_bds_decision_tree.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the
# score used for the ROC curve and the AUC.
y_pred_bds_decision_tree = clf_bds_decision_tree.predict(X_test)
y_pred_proba_bds_decision_tree = clf_bds_decision_tree.predict_proba(X_test)
fpr_bds_decision_tree, tpr_bds_decision_tree, _ = roc_curve(y_test, y_pred_proba_bds_decision_tree[:, 1])

roc_score_decision_tree = roc_auc_score(y_test, y_pred_proba_bds_decision_tree[:, 1])
print(f"\nROC AUC: {roc_score_decision_tree:.4f}")

print_metrics(y_test, y_pred_bds_decision_tree)

# fpr_/tpr_bds_decision_tree are reused by the combined "All" plot of this section.
plot_roc_curve(tpr_bds_decision_tree, fpr_bds_decision_tree, 'Decision Tree', False)

### Random forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lib.experiment_helper import prepare_data_exp_bds, print_metrics

# Prepare the BDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bds(df, cols_to_keep)

clf_bds_rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=50,
)
clf_bds_rf.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bds_rf = clf_bds_rf.predict(X_test)
y_pred_proba_bds_rf = clf_bds_rf.predict_proba(X_test)
fpr_bds_rf, tpr_bds_rf, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bds_rf[:, 1])

roc_score_rf = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bds_rf[:, 1])
print(f"\nROC AUC: {roc_score_rf:.4f}")

print_metrics(y_test, y_pred_bds_rf)

# fpr_bds_rf / tpr_bds_rf are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bds_rf, fpr=fpr_bds_rf, label='Random Forest', scatter=False)

### AdaBoost:

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from lib.experiment_helper import prepare_data_exp_bds, print_metrics

# Prepare the BDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bds(df, cols_to_keep)

clf_bds_ada = AdaBoostClassifier(
    learning_rate=1,
    n_estimators=50,
)
clf_bds_ada.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bds_ada = clf_bds_ada.predict(X_test)
y_pred_proba_bds_ada = clf_bds_ada.predict_proba(X_test)
fpr_bds_ada, tpr_bds_ada, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bds_ada[:, 1])

roc_score_ada = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bds_ada[:, 1])
print(f"\nROC AUC: {roc_score_ada:.4f}")

print_metrics(y_test, y_pred_bds_ada)

# fpr_bds_ada / tpr_bds_ada are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bds_ada, fpr=fpr_bds_ada, label='AdaBoost', scatter=False)

### GBM:

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from lib.experiment_helper import prepare_data_exp_bds, print_metrics

# Prepare the BDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bds(df, cols_to_keep)

# NOTE(review): newer scikit-learn releases renamed loss='deviance' to
# 'log_loss' -- confirm against the pinned version before upgrading.
clf_bds_gbm = GradientBoostingClassifier(
    learning_rate=0.5,
    loss='deviance',
    n_estimators=50,
)
clf_bds_gbm.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bds_gbm = clf_bds_gbm.predict(X_test)
y_pred_proba_bds_gbm = clf_bds_gbm.predict_proba(X_test)
fpr_bds_gbm, tpr_bds_gbm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bds_gbm[:, 1])

roc_score_gbm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bds_gbm[:, 1])
print(f"\nROC AUC: {roc_score_gbm:.4f}")

print_metrics(y_test, y_pred_bds_gbm)

# fpr_bds_gbm / tpr_bds_gbm are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bds_gbm, fpr=fpr_bds_gbm, label='GBM', scatter=False)

### XGBoost:

In [None]:
from xgboost import XGBClassifier
from lib.experiment_helper import prepare_data_exp_bds, print_metrics

# Prepare the BDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bds(df, cols_to_keep)

clf_bds_xgb = XGBClassifier(
    learning_rate=0.3,
    max_depth=5,
)
clf_bds_xgb.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bds_xgb = clf_bds_xgb.predict(X_test)
y_pred_proba_bds_xgb = clf_bds_xgb.predict_proba(X_test)
fpr_bds_xgb, tpr_bds_xgb, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bds_xgb[:, 1])

roc_score_xgb = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bds_xgb[:, 1])
print(f"\nROC AUC: {roc_score_xgb:.4f}")

print_metrics(y_test, y_pred_bds_xgb)

# fpr_bds_xgb / tpr_bds_xgb are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bds_xgb, fpr=fpr_bds_xgb, label='XGB', scatter=False)

### KNN:

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from lib.experiment_helper import prepare_data_exp_bds, print_metrics

# Prepare the BDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bds(df, cols_to_keep)

clf_bds_knn = KNeighborsClassifier(
    n_neighbors=10,
    p=1,
    weights='uniform',
)
clf_bds_knn.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bds_knn = clf_bds_knn.predict(X_test)
y_pred_proba_bds_knn = clf_bds_knn.predict_proba(X_test)
fpr_bds_knn, tpr_bds_knn, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bds_knn[:, 1])

roc_score_knn = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bds_knn[:, 1])
print(f"\nROC AUC: {roc_score_knn:.4f}")

print_metrics(y_test, y_pred_bds_knn)

# fpr_bds_knn / tpr_bds_knn are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bds_knn, fpr=fpr_bds_knn, label='KNN', scatter=False)

### SVM:

In [None]:
from sklearn.svm import SVC
from lib.experiment_helper import prepare_data_exp_bds, print_metrics

# Prepare the BDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bds(df, cols_to_keep)

# probability=True is needed for the predict_proba call below.
clf_bds_svm = SVC(
    gamma='scale',
    kernel='linear',
    probability=True,
)
clf_bds_svm.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bds_svm = clf_bds_svm.predict(X_test)
y_pred_proba_bds_svm = clf_bds_svm.predict_proba(X_test)
fpr_bds_svm, tpr_bds_svm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bds_svm[:, 1])

roc_score_svm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bds_svm[:, 1])
print(f"\nROC AUC: {roc_score_svm:.4f}")

print_metrics(y_test, y_pred_bds_svm)

# fpr_bds_svm / tpr_bds_svm are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bds_svm, fpr=fpr_bds_svm, label='SVM', scatter=False)

### All:

In [None]:
# Overlay the ROC curves of all BDS models on a single figure.
plt.figure(figsize=(8, 8))

# Gray dashed reference diagonal; it is the first line on the axes, hence index 0.
ax = sns.lineplot(x=[0, 1], y=[0, 1], color='gray', label='Threshold')
ax.lines[0].set_linestyle('--')

# One curve per model, drawn in the same order as the model cells above.
for curve_fpr, curve_tpr, curve_label in (
    (fpr_bds_decision_tree, tpr_bds_decision_tree, 'Decision Tree'),
    (fpr_bds_rf, tpr_bds_rf, 'Random Forest'),
    (fpr_bds_ada, tpr_bds_ada, 'AdaBoost'),
    (fpr_bds_gbm, tpr_bds_gbm, 'GBM'),
    (fpr_bds_xgb, tpr_bds_xgb, 'XGB'),
    (fpr_bds_knn, tpr_bds_knn, 'KNN'),
    (fpr_bds_svm, tpr_bds_svm, 'SVM'),
):
    sns.lineplot(x=curve_fpr, y=curve_tpr, label=curve_label)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.legend(loc="lower right")

## IODS - Exp 3:

In [None]:
# Feature columns passed to prepare_data_exp_iods by the model cells of this section.
# The numeric suffixes (e.g. `_2.0`) suggest one-hot-encoded category columns --
# TODO confirm against the pre-processing step.
cols_to_keep_one_hot_encoding = [
  'mc_get_risco_gestacional_2.0',
  'mc_mul_qtd_aborto_3.0',
  'mc_get_gravidez_planejada_2.0',
  'mc_mul_est_civil_5.0',
  'mc_mul_nivel_inseguranca_2.0',
  'mc_mul_qtd_filhos_vivos_4.0',
  'mc_mul_qtd_gest_4.0',
  'mc_mul_rec_inf_plan_fam_2.0',
  'mc_dae_escolaridade_9.0',
  'mc_dae_rfa_3.0',
]

### Decision tree:

In [None]:
from sklearn.tree import DecisionTreeClassifier
from lib.experiment_helper import prepare_data_exp_iods, print_metrics

# Prepare the IODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iods(df, cols_to_keep_one_hot_encoding)

clf_iods_decision_tree = DecisionTreeClassifier(
    criterion='gini',
    splitter='random',
)
clf_iods_decision_tree.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_iods_decision_tree = clf_iods_decision_tree.predict(X_test)
y_pred_proba_iods_decision_tree = clf_iods_decision_tree.predict_proba(X_test)
fpr_iods_decision_tree, tpr_iods_decision_tree, _ = roc_curve(
    y_true=y_test,
    y_score=y_pred_proba_iods_decision_tree[:, 1],
)

roc_score_decision_tree = roc_auc_score(
    y_true=y_test,
    y_score=y_pred_proba_iods_decision_tree[:, 1],
)
print(f"\nROC AUC: {roc_score_decision_tree:.4f}")

print_metrics(y_test, y_pred_iods_decision_tree)

# fpr_/tpr_iods_decision_tree are reused by the combined "All" plot of this section.
plot_roc_curve(
    tpr=tpr_iods_decision_tree,
    fpr=fpr_iods_decision_tree,
    label='Decision Tree',
    scatter=False,
)

### Random forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lib.experiment_helper import prepare_data_exp_iods, print_metrics

# Prepare the IODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iods(df, cols_to_keep_one_hot_encoding)

clf_iods_rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
)
clf_iods_rf.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_iods_rf = clf_iods_rf.predict(X_test)
y_pred_proba_iods_rf = clf_iods_rf.predict_proba(X_test)
fpr_iods_rf, tpr_iods_rf, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iods_rf[:, 1])

roc_score_rf = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iods_rf[:, 1])
print(f"\nROC AUC: {roc_score_rf:.4f}")

print_metrics(y_test, y_pred_iods_rf)

# fpr_iods_rf / tpr_iods_rf are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_iods_rf, fpr=fpr_iods_rf, label='Random Forest', scatter=False)

### AdaBoost:

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from lib.experiment_helper import prepare_data_exp_iods, print_metrics

# Prepare the IODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iods(df, cols_to_keep_one_hot_encoding)

clf_iods_ada = AdaBoostClassifier(
    learning_rate=0.5,
    n_estimators=50,
)
clf_iods_ada.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_iods_ada = clf_iods_ada.predict(X_test)
y_pred_proba_iods_ada = clf_iods_ada.predict_proba(X_test)
fpr_iods_ada, tpr_iods_ada, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iods_ada[:, 1])

roc_score_ada = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iods_ada[:, 1])
print(f"\nROC AUC: {roc_score_ada:.4f}")

print_metrics(y_test, y_pred_iods_ada)

# fpr_iods_ada / tpr_iods_ada are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_iods_ada, fpr=fpr_iods_ada, label='AdaBoost', scatter=False)

### GBM:

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from lib.experiment_helper import prepare_data_exp_iods, print_metrics

# Prepare the IODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iods(df, cols_to_keep_one_hot_encoding)

# NOTE(review): newer scikit-learn releases renamed loss='deviance' to
# 'log_loss' -- confirm against the pinned version before upgrading.
clf_iods_gbm = GradientBoostingClassifier(
    learning_rate=0.5,
    loss='deviance',
    n_estimators=50,
)
clf_iods_gbm.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_iods_gbm = clf_iods_gbm.predict(X_test)
y_pred_proba_iods_gbm = clf_iods_gbm.predict_proba(X_test)
fpr_iods_gbm, tpr_iods_gbm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iods_gbm[:, 1])

roc_score_gbm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iods_gbm[:, 1])
print(f"\nROC AUC: {roc_score_gbm:.4f}")

print_metrics(y_test, y_pred_iods_gbm)

# fpr_iods_gbm / tpr_iods_gbm are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_iods_gbm, fpr=fpr_iods_gbm, label='GBM', scatter=False)

### XGBoost:

In [None]:
from xgboost import XGBClassifier
from lib.experiment_helper import prepare_data_exp_iods, print_metrics

# Prepare the IODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iods(df, cols_to_keep_one_hot_encoding)

clf_iods_xgb = XGBClassifier(
    learning_rate=0.3,
    max_depth=5,
)
clf_iods_xgb.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_iods_xgb = clf_iods_xgb.predict(X_test)
y_pred_proba_iods_xgb = clf_iods_xgb.predict_proba(X_test)
fpr_iods_xgb, tpr_iods_xgb, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iods_xgb[:, 1])

roc_score_xgb = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iods_xgb[:, 1])
print(f"\nROC AUC: {roc_score_xgb:.4f}")

print_metrics(y_test, y_pred_iods_xgb)

# fpr_iods_xgb / tpr_iods_xgb are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_iods_xgb, fpr=fpr_iods_xgb, label='XGB', scatter=False)

### KNN:

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from lib.experiment_helper import prepare_data_exp_iods, print_metrics

# Prepare the IODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iods(df, cols_to_keep_one_hot_encoding)

clf_iods_knn = KNeighborsClassifier(
    n_neighbors=15,
    p=1,
    weights='uniform',
)
clf_iods_knn.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_iods_knn = clf_iods_knn.predict(X_test)
y_pred_proba_iods_knn = clf_iods_knn.predict_proba(X_test)
fpr_iods_knn, tpr_iods_knn, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iods_knn[:, 1])

roc_score_knn = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iods_knn[:, 1])
print(f"\nROC AUC: {roc_score_knn:.4f}")

print_metrics(y_test, y_pred_iods_knn)

# fpr_iods_knn / tpr_iods_knn are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_iods_knn, fpr=fpr_iods_knn, label='KNN', scatter=False)

### SVM:

In [None]:
from sklearn.svm import SVC
from lib.experiment_helper import prepare_data_exp_iods, print_metrics

# Prepare the IODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iods(df, cols_to_keep_one_hot_encoding)

# probability=True is needed for the predict_proba call below.
clf_iods_svm = SVC(
    gamma='auto',
    kernel='poly',
    probability=True,
)
clf_iods_svm.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_iods_svm = clf_iods_svm.predict(X_test)
y_pred_proba_iods_svm = clf_iods_svm.predict_proba(X_test)
fpr_iods_svm, tpr_iods_svm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iods_svm[:, 1])

roc_score_svm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iods_svm[:, 1])
print(f"\nROC AUC: {roc_score_svm:.4f}")

print_metrics(y_test, y_pred_iods_svm)

# fpr_iods_svm / tpr_iods_svm are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_iods_svm, fpr=fpr_iods_svm, label='SVM', scatter=False)

### All:

In [None]:
# Overlay the ROC curves of all IODS models on a single figure.
plt.figure(figsize=(8, 8))

# Gray dashed reference diagonal; it is the first line on the axes, hence index 0.
ax = sns.lineplot(x=[0, 1], y=[0, 1], color='gray', label='Threshold')
ax.lines[0].set_linestyle('--')

# One curve per model, drawn in the same order as the model cells above.
for curve_fpr, curve_tpr, curve_label in (
    (fpr_iods_decision_tree, tpr_iods_decision_tree, 'Decision Tree'),
    (fpr_iods_rf, tpr_iods_rf, 'Random Forest'),
    (fpr_iods_ada, tpr_iods_ada, 'AdaBoost'),
    (fpr_iods_gbm, tpr_iods_gbm, 'GBM'),
    (fpr_iods_xgb, tpr_iods_xgb, 'XGB'),
    (fpr_iods_knn, tpr_iods_knn, 'KNN'),
    (fpr_iods_svm, tpr_iods_svm, 'SVM'),
):
    sns.lineplot(x=curve_fpr, y=curve_tpr, label=curve_label)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.legend(loc="lower right")

## BODS - Exp 4:

In [None]:
# Feature columns passed to prepare_data_exp_bods by the model cells of this section.
# Same ten entries as the list defined in the IODS (Exp 3) section; presumably
# repeated so this section can be run on its own.
cols_to_keep_one_hot_encoding = [
  'mc_get_risco_gestacional_2.0',
  'mc_mul_qtd_aborto_3.0',
  'mc_get_gravidez_planejada_2.0',
  'mc_mul_est_civil_5.0',
  'mc_mul_nivel_inseguranca_2.0',
  'mc_mul_qtd_filhos_vivos_4.0',
  'mc_mul_qtd_gest_4.0',
  'mc_mul_rec_inf_plan_fam_2.0',
  'mc_dae_escolaridade_9.0',
  'mc_dae_rfa_3.0',
]

### Decision tree:

In [None]:
from sklearn.tree import DecisionTreeClassifier
from lib.experiment_helper import prepare_data_exp_bods, print_metrics

# Prepare the BODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bods(df, cols_to_keep_one_hot_encoding)

clf_bods_decision_tree = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
)
clf_bods_decision_tree.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bods_decision_tree = clf_bods_decision_tree.predict(X_test)
y_pred_proba_bods_decision_tree = clf_bods_decision_tree.predict_proba(X_test)
fpr_bods_decision_tree, tpr_bods_decision_tree, _ = roc_curve(
    y_true=y_test,
    y_score=y_pred_proba_bods_decision_tree[:, 1],
)

roc_score_decision_tree = roc_auc_score(
    y_true=y_test,
    y_score=y_pred_proba_bods_decision_tree[:, 1],
)
print(f"\nROC AUC: {roc_score_decision_tree:.4f}")

print_metrics(y_test, y_pred_bods_decision_tree)

# fpr_/tpr_bods_decision_tree are reused by the combined "All" plot of this section.
plot_roc_curve(
    tpr=tpr_bods_decision_tree,
    fpr=fpr_bods_decision_tree,
    label='Decision Tree',
    scatter=False,
)

### Random forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lib.experiment_helper import prepare_data_exp_bods, print_metrics

# Prepare the BODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bods(df, cols_to_keep_one_hot_encoding)

clf_bods_rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=150,
)
clf_bods_rf.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bods_rf = clf_bods_rf.predict(X_test)
y_pred_proba_bods_rf = clf_bods_rf.predict_proba(X_test)
fpr_bods_rf, tpr_bods_rf, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bods_rf[:, 1])

roc_score_rf = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bods_rf[:, 1])
print(f"\nROC AUC: {roc_score_rf:.4f}")

print_metrics(y_test, y_pred_bods_rf)

# fpr_bods_rf / tpr_bods_rf are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bods_rf, fpr=fpr_bods_rf, label='Random Forest', scatter=False)

### AdaBoost:

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from lib.experiment_helper import prepare_data_exp_bods, print_metrics

# Prepare the BODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bods(df, cols_to_keep_one_hot_encoding)

clf_bods_ada = AdaBoostClassifier(
    learning_rate=0.5,
    n_estimators=150,
)
clf_bods_ada.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bods_ada = clf_bods_ada.predict(X_test)
y_pred_proba_bods_ada = clf_bods_ada.predict_proba(X_test)
fpr_bods_ada, tpr_bods_ada, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bods_ada[:, 1])

roc_score_ada = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bods_ada[:, 1])
print(f"\nROC AUC: {roc_score_ada:.4f}")

print_metrics(y_test, y_pred_bods_ada)

# fpr_bods_ada / tpr_bods_ada are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bods_ada, fpr=fpr_bods_ada, label='AdaBoost', scatter=False)

### GBM:

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from lib.experiment_helper import prepare_data_exp_bods, print_metrics

# Prepare the BODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bods(df, cols_to_keep_one_hot_encoding)

clf_bods_gbm = GradientBoostingClassifier(
    learning_rate=0.5,
    loss='exponential',
    n_estimators=100,
)
clf_bods_gbm.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bods_gbm = clf_bods_gbm.predict(X_test)
y_pred_proba_bods_gbm = clf_bods_gbm.predict_proba(X_test)
fpr_bods_gbm, tpr_bods_gbm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bods_gbm[:, 1])

roc_score_gbm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bods_gbm[:, 1])
print(f"\nROC AUC: {roc_score_gbm:.4f}")

print_metrics(y_test, y_pred_bods_gbm)

# fpr_bods_gbm / tpr_bods_gbm are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bods_gbm, fpr=fpr_bods_gbm, label='GBM', scatter=False)

### XGBoost:

In [None]:
from xgboost import XGBClassifier
from lib.experiment_helper import prepare_data_exp_bods, print_metrics

# Prepare the BODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bods(df, cols_to_keep_one_hot_encoding)

clf_bods_xgb = XGBClassifier(
    learning_rate=0.3,
    max_depth=5,
)
clf_bods_xgb.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bods_xgb = clf_bods_xgb.predict(X_test)
y_pred_proba_bods_xgb = clf_bods_xgb.predict_proba(X_test)
fpr_bods_xgb, tpr_bods_xgb, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bods_xgb[:, 1])

roc_score_xgb = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bods_xgb[:, 1])
print(f"\nROC AUC: {roc_score_xgb:.4f}")

print_metrics(y_test, y_pred_bods_xgb)

# fpr_bods_xgb / tpr_bods_xgb are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bods_xgb, fpr=fpr_bods_xgb, label='XGB', scatter=False)

### KNN:

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from lib.experiment_helper import prepare_data_exp_bods, print_metrics

# Prepare the BODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bods(df, cols_to_keep_one_hot_encoding)

clf_bods_knn = KNeighborsClassifier(
    n_neighbors=15,
    p=2,
    weights='distance',
)
clf_bods_knn.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bods_knn = clf_bods_knn.predict(X_test)
y_pred_proba_bods_knn = clf_bods_knn.predict_proba(X_test)
fpr_bods_knn, tpr_bods_knn, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bods_knn[:, 1])

roc_score_knn = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bods_knn[:, 1])
print(f"\nROC AUC: {roc_score_knn:.4f}")

print_metrics(y_test, y_pred_bods_knn)

# fpr_bods_knn / tpr_bods_knn are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bods_knn, fpr=fpr_bods_knn, label='KNN', scatter=False)

### SVM:

In [None]:
from sklearn.svm import SVC
from lib.experiment_helper import prepare_data_exp_bods, print_metrics

# Prepare the BODS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bods(df, cols_to_keep_one_hot_encoding)

# probability=True is needed for the predict_proba call below.
clf_bods_svm = SVC(
    gamma='scale',
    kernel='linear',
    probability=True,
)
clf_bods_svm.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_bods_svm = clf_bods_svm.predict(X_test)
y_pred_proba_bods_svm = clf_bods_svm.predict_proba(X_test)
fpr_bods_svm, tpr_bods_svm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bods_svm[:, 1])

roc_score_svm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bods_svm[:, 1])
print(f"\nROC AUC: {roc_score_svm:.4f}")

print_metrics(y_test, y_pred_bods_svm)

# fpr_bods_svm / tpr_bods_svm are reused by the combined "All" plot of this section.
plot_roc_curve(tpr=tpr_bods_svm, fpr=fpr_bods_svm, label='SVM', scatter=False)

### All:

In [None]:
# Overlay the ROC curves of all BODS models on a single figure.
plt.figure(figsize=(8, 8))

# Gray dashed reference diagonal; it is the first line on the axes, hence index 0.
ax = sns.lineplot(x=[0, 1], y=[0, 1], color='gray', label='Threshold')
ax.lines[0].set_linestyle('--')

# One curve per model, drawn in the same order as the model cells above.
for curve_fpr, curve_tpr, curve_label in (
    (fpr_bods_decision_tree, tpr_bods_decision_tree, 'Decision Tree'),
    (fpr_bods_rf, tpr_bods_rf, 'Random Forest'),
    (fpr_bods_ada, tpr_bods_ada, 'AdaBoost'),
    (fpr_bods_gbm, tpr_bods_gbm, 'GBM'),
    (fpr_bods_xgb, tpr_bods_xgb, 'XGB'),
    (fpr_bods_knn, tpr_bods_knn, 'KNN'),
    (fpr_bods_svm, tpr_bods_svm, 'SVM'),
):
    sns.lineplot(x=curve_fpr, y=curve_tpr, label=curve_label)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.legend(loc="lower right")

## IODDS - Exp 5:

In [None]:
# Same ten column names as the lists defined in the IODS (Exp 3) and BODS (Exp 4)
# sections. The model cells of this section mostly define their own `cols_to_keep`
# subsets instead of using this list.
cols_to_keep_one_hot_encoding = [
  'mc_get_risco_gestacional_2.0',
  'mc_mul_qtd_aborto_3.0',
  'mc_get_gravidez_planejada_2.0',
  'mc_mul_est_civil_5.0',
  'mc_mul_nivel_inseguranca_2.0',
  'mc_mul_qtd_filhos_vivos_4.0',
  'mc_mul_qtd_gest_4.0',
  'mc_mul_rec_inf_plan_fam_2.0',
  'mc_dae_escolaridade_9.0',
  'mc_dae_rfa_3.0',
]

### Decision tree (SBS):

In [None]:
from sklearn.tree import DecisionTreeClassifier
from lib.experiment_helper import prepare_data_exp_iodds, print_metrics

# Feature subset for the decision tree of this experiment (the plot label marks
# it "SBS", presumably sequential backward selection -- confirm).
# NOTE: this rebinds the notebook-level `cols_to_keep` that the IDS/BDS cells
# read; re-run the cell defining the original list before re-running those.
cols_to_keep = [
  'mc_get_fumo_0.0',
  'mc_mul_nivel_inseguranca_0.0',
  'mc_dae_escolaridade_5.0',
  'mc_dae_possui_horta_0.0',
  'mc_dae_sit_moradia_1.0',
]
# Train on the subset defined just above, as every other "(SBS)" cell in this
# section does. Passing cols_to_keep_one_hot_encoding here would leave the list
# unused and make the "(SBS)" plot label wrong.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iodds(df, cols_to_keep)

clf_iodds_decision_tree = DecisionTreeClassifier(criterion='gini', splitter='best')
clf_iodds_decision_tree.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the
# score used for the ROC curve and the AUC.
y_pred_iodds_decision_tree = clf_iodds_decision_tree.predict(X_test)
y_pred_proba_iodds_decision_tree = clf_iodds_decision_tree.predict_proba(X_test)
fpr_iodds_decision_tree, tpr_iodds_decision_tree, _ = roc_curve(y_test, y_pred_proba_iodds_decision_tree[:, 1])

roc_score_decision_tree = roc_auc_score(y_test, y_pred_proba_iodds_decision_tree[:, 1])
print(f"\nROC AUC: {roc_score_decision_tree:.4f}")

print_metrics(y_test, y_pred_iodds_decision_tree)

plot_roc_curve(tpr_iodds_decision_tree, fpr_iodds_decision_tree, 'Decision Tree (SBS)', False)

### Random forest (SBS):

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lib.experiment_helper import prepare_data_exp_iodds, print_metrics

# Feature subset for the random forest of this experiment.
# NOTE: this rebinds the notebook-level `cols_to_keep` that the IDS/BDS cells
# read; re-run the cell defining the original list before re-running those.
cols_to_keep = [
  'mc_mul_renda_familiar',
  'idade',
  'mc_get_fator_rh_1.0',
  'mc_get_fumo_0.0',
  'mc_get_grupo_sanguineo_1.0',
  'mc_get_grupo_sanguineo_3.0',
  'mc_get_risco_gestacional_0.0',
  'mc_get_risco_gestacional_1.0',
  'mc_mul_chefe_familia_0.0',
  'mc_mul_est_civil_0.0',
  'mc_mul_est_civil_1.0',
  'mc_mul_est_civil_3.0',
  'mc_mul_est_civil_4.0',
  'mc_mul_nivel_inseguranca_0.0',
  'mc_mul_qtd_filhos_vivos_0.0',
  'mc_mul_qtd_filhos_vivos_1.0',
  'mc_mul_qtd_gest_1.0',
  'mc_mul_qtd_gest_3.0',
  'mc_mul_rec_inf_plan_fam_0.0',
  'mc_mul_tipo_const_casa_1.0',
  'mc_dae_escolaridade_2.0',
  'mc_dae_escolaridade_4.0',
  'mc_dae_escolaridade_5.0',
  'mc_dae_escolaridade_6.0',
  'mc_dae_escolaridade_7.0',
  'mc_dae_escolaridade_8.0',
  'mc_dae_mrd_lgd_red_esg_0.0',
  'mc_dae_numero_res_domic_1.0',
  'mc_dae_possui_arv_frut_1.0',
  'mc_dae_possui_horta_0.0',
  'mc_dae_possui_horta_1.0',
  'mc_dae_rfa_0.0',
  'mc_dae_rfa_1.0',
  'mc_dae_rfa_2.0',
  'mc_dae_sit_moradia_0.0',
  'mc_dae_sit_moradia_2.0',
  'mc_dae_trat_agua_uso_1.0',
  'mc_dae_trat_agua_uso_2.0',
]
# Prepare the IODDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iodds(df, cols_to_keep)

clf_iodds_rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=50,
)
clf_iodds_rf.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_iodds_rf = clf_iodds_rf.predict(X_test)
y_pred_proba_iodds_rf = clf_iodds_rf.predict_proba(X_test)
fpr_iodds_rf, tpr_iodds_rf, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iodds_rf[:, 1])

roc_score_rf = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iodds_rf[:, 1])
print(f"\nROC AUC: {roc_score_rf:.4f}")

print_metrics(y_test, y_pred_iodds_rf)

plot_roc_curve(tpr=tpr_iodds_rf, fpr=fpr_iodds_rf, label='Random Forest (SBS)', scatter=False)

### AdaBoost (SBS):

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from lib.experiment_helper import prepare_data_exp_iodds, print_metrics

# Feature subset for AdaBoost in this experiment.
# NOTE: this rebinds the notebook-level `cols_to_keep` that the IDS/BDS cells
# read; re-run the cell defining the original list before re-running those.
cols_to_keep = [
  'mc_get_fumo_0.0',
  'mc_get_gravidez_planejada_1.0',
  'mc_get_grupo_sanguineo_0.0',
  'mc_get_grupo_sanguineo_1.0',
  'mc_get_grupo_sanguineo_2.0',
  'mc_get_risco_gestacional_0.0',
  'mc_mul_est_civil_0.0',
  'mc_mul_est_civil_1.0',
  'mc_mul_nivel_inseguranca_0.0',
  'mc_mul_nivel_inseguranca_1.0',
  'mc_mul_qtd_gest_2.0',
  'mc_mul_rec_inf_plan_fam_0.0',
  'mc_dae_escolaridade_1.0',
  'mc_dae_escolaridade_2.0',
  'mc_dae_numero_res_domic_1.0',
  'mc_dae_possui_horta_1.0',
  'mc_dae_rfa_2.0',
  'mc_dae_sit_moradia_1.0',
]
# Prepare the IODDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iodds(df, cols_to_keep)

clf_iodds_ada = AdaBoostClassifier(
    learning_rate=0.5,
    n_estimators=50,
)
clf_iodds_ada.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_iodds_ada = clf_iodds_ada.predict(X_test)
y_pred_proba_iodds_ada = clf_iodds_ada.predict_proba(X_test)
fpr_iodds_ada, tpr_iodds_ada, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iodds_ada[:, 1])

roc_score_ada = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iodds_ada[:, 1])
print(f"\nROC AUC: {roc_score_ada:.4f}")

print_metrics(y_test, y_pred_iodds_ada)

plot_roc_curve(tpr=tpr_iodds_ada, fpr=fpr_iodds_ada, label='AdaBoost (SBS)', scatter=False)

### GBM (SBS):

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from lib.experiment_helper import prepare_data_exp_iodds, print_metrics

# Feature subset for gradient boosting in this experiment.
# NOTE: this rebinds the notebook-level `cols_to_keep` that the IDS/BDS cells
# read; re-run the cell defining the original list before re-running those.
cols_to_keep = [
  'mc_get_alcool_0.0',
  'mc_get_risco_gestacional_0.0',
  'mc_mul_est_civil_1.0',
  'mc_mul_nivel_inseguranca_0.0',
  'mc_mul_qtd_aborto_2.0',
  'mc_mul_qtd_filhos_vivos_1.0',
  'mc_mul_tipo_const_casa_1.0',
  'mc_dae_escolaridade_2.0',
  'mc_dae_escolaridade_5.0',
  'mc_dae_sit_moradia_1.0',
  'mc_dae_trat_agua_uso_2.0',
]
# Prepare the IODDS experiment data; the fifth return value is discarded.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iodds(df, cols_to_keep)

# NOTE(review): newer scikit-learn releases renamed loss='deviance' to
# 'log_loss' -- confirm against the pinned version before upgrading.
clf_iodds_gbm = GradientBoostingClassifier(
    learning_rate=0.5,
    loss='deviance',
    n_estimators=50,
)
clf_iodds_gbm.fit(X_train, y_train)

# Hard predictions feed print_metrics; the second predict_proba column is the ROC score.
y_pred_iodds_gbm = clf_iodds_gbm.predict(X_test)
y_pred_proba_iodds_gbm = clf_iodds_gbm.predict_proba(X_test)
fpr_iodds_gbm, tpr_iodds_gbm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iodds_gbm[:, 1])

roc_score_gbm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iodds_gbm[:, 1])
print(f"\nROC AUC: {roc_score_gbm:.4f}")

print_metrics(y_test, y_pred_iodds_gbm)

plot_roc_curve(tpr=tpr_iodds_gbm, fpr=fpr_iodds_gbm, label='GBM (SBS)', scatter=False)

### XGBoost (SBS):

In [None]:
from xgboost import XGBClassifier
from lib.experiment_helper import prepare_data_exp_iodds, print_metrics

# Columns kept for the IODDS XGBoost run (heading labels it "SBS").
cols_to_keep = [
    'mc_get_vacina_anti_tetanica',
    'idade',
    'mc_get_fumo_0.0',
    'mc_get_risco_gestacional_1.0',
    'mc_mul_est_civil_1.0',
    'mc_mul_nivel_inseguranca_0.0',
    'mc_mul_nivel_inseguranca_1.0',
    'mc_mul_qtd_filhos_vivos_1.0',
    'mc_mul_qtd_gest_1.0',
    'mc_mul_rec_inf_plan_fam_0.0',
    'mc_dae_escolaridade_2.0',
    'mc_dae_escolaridade_5.0',
    'mc_dae_numero_res_domic_1.0',
    'mc_dae_possui_arv_frut_1.0',
    'mc_dae_sit_moradia_2.0',
    'mc_dae_trat_agua_uso_2.0',
]

# Build the train/test features and labels through the IODDS helper; the
# fifth value it returns is not used in this cell.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iodds(df, cols_to_keep)

# XGBoost with the hyper-parameters fixed for this experiment.
clf_iodds_xgb = XGBClassifier(max_depth=5, learning_rate=0.3)
clf_iodds_xgb.fit(X_train, y_train)

# Hard labels feed print_metrics; class probabilities feed the ROC analysis.
y_pred_iodds_xgb = clf_iodds_xgb.predict(X_test)
y_pred_proba_iodds_xgb = clf_iodds_xgb.predict_proba(X_test)

# Column 1 of predict_proba is the score used for both the AUC and the curve.
roc_score_xgb = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iodds_xgb[:, 1])
fpr_iodds_xgb, tpr_iodds_xgb, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iodds_xgb[:, 1])
print(f"\nROC AUC: {roc_score_xgb:.4f}")

print_metrics(y_test, y_pred_iodds_xgb)

# The notebook's plot_roc_curve helper takes tpr before fpr.
plot_roc_curve(tpr=tpr_iodds_xgb, fpr=fpr_iodds_xgb, label='XGB (SBS)', scatter=False)

### KNN (SBS):

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from lib.experiment_helper import prepare_data_exp_iodds, print_metrics

# Columns kept for the IODDS k-nearest-neighbours run (heading labels it "SBS").
cols_to_keep = [
    'mc_mul_renda_familiar',
    'mc_get_alcool_0.0',
    'mc_get_alcool_1.0',
    'mc_get_fator_rh_1.0',
    'mc_get_gravidez_planejada_1.0',
    'mc_get_grupo_sanguineo_0.0',
    'mc_get_grupo_sanguineo_2.0',
    'mc_get_grupo_sanguineo_3.0',
    'mc_get_risco_gestacional_1.0',
    'mc_mul_chefe_familia_0.0',
    'mc_mul_est_civil_0.0',
    'mc_mul_est_civil_1.0',
    'mc_mul_est_civil_4.0',
    'mc_mul_nivel_inseguranca_0.0',
    'mc_mul_nivel_inseguranca_1.0',
    'mc_mul_qtd_aborto_0.0',
    'mc_mul_qtd_aborto_2.0',
    'mc_mul_qtd_filhos_vivos_0.0',
    'mc_mul_qtd_filhos_vivos_1.0',
    'mc_mul_qtd_filhos_vivos_2.0',
    'mc_mul_qtd_gest_1.0',
    'mc_mul_qtd_gest_2.0',
    'mc_mul_rec_inf_plan_fam_0.0',
    'mc_mul_tipo_const_casa_1.0',
    'mc_dae_escolaridade_0.0',
    'mc_dae_escolaridade_1.0',
    'mc_dae_escolaridade_2.0',
    'mc_dae_escolaridade_4.0',
    'mc_dae_escolaridade_5.0',
    'mc_dae_mrd_lgd_red_esg_0.0',
    'mc_dae_mrd_lgd_red_esg_1.0',
    'mc_dae_numero_res_domic_2.0',
    'mc_dae_numero_res_domic_4.0',
    'mc_dae_possui_arv_frut_1.0',
    'mc_dae_possui_horta_0.0',
    'mc_dae_rfa_0.0',
    'mc_dae_rfa_1.0',
    'mc_dae_rfa_2.0',
    'mc_dae_sit_moradia_1.0',
    'mc_dae_trat_agua_uso_2.0',
    'mc_dae_trat_agua_uso_3.0',
]

# Build the train/test features and labels through the IODDS helper; the
# fifth value it returns is not used in this cell.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iodds(df, cols_to_keep)

# 5-NN with p=1 (Manhattan distance) and unweighted votes.
clf_iodds_knn = KNeighborsClassifier(weights='uniform', p=1, n_neighbors=5)
clf_iodds_knn.fit(X_train, y_train)

# Hard labels feed print_metrics; class probabilities feed the ROC analysis.
y_pred_iodds_knn = clf_iodds_knn.predict(X_test)
y_pred_proba_iodds_knn = clf_iodds_knn.predict_proba(X_test)

# Column 1 of predict_proba is the score used for both the AUC and the curve.
roc_score_knn = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iodds_knn[:, 1])
fpr_iodds_knn, tpr_iodds_knn, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iodds_knn[:, 1])
print(f"\nROC AUC: {roc_score_knn:.4f}")

print_metrics(y_test, y_pred_iodds_knn)

# The notebook's plot_roc_curve helper takes tpr before fpr.
plot_roc_curve(tpr=tpr_iodds_knn, fpr=fpr_iodds_knn, label='KNN (SBS)', scatter=False)

### SVM (SFS):

In [None]:
from sklearn.svm import SVC
from lib.experiment_helper import prepare_data_exp_iodds, print_metrics

# Columns kept for the IODDS SVM run (heading labels it "SFS").
cols_to_keep = [
    'mc_get_vacina_anti_tetanica',
    'mc_get_fator_rh_0.0',
    'mc_get_fumo_0.0',
    'mc_mul_chefe_familia_0.0',
    'mc_mul_est_civil_1.0',
    'mc_mul_nivel_inseguranca_0.0',
    'mc_mul_qtd_filhos_vivos_0.0',
    'mc_mul_qtd_filhos_vivos_1.0',
    'mc_mul_qtd_filhos_vivos_3.0',
    'mc_mul_qtd_gest_2.0',
    'mc_dae_escolaridade_4.0',
    'mc_dae_escolaridade_5.0',
    'mc_dae_escolaridade_6.0',
    'mc_dae_sit_moradia_0.0',
    'mc_dae_trat_agua_uso_2.0',
]

# Build the train/test features and labels through the IODDS helper; the
# fifth value it returns is not used in this cell.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_iodds(df, cols_to_keep)

# probability=True is required so that predict_proba is available below.
clf_iodds_svm = SVC(kernel='rbf', gamma='scale', probability=True)
clf_iodds_svm.fit(X_train, y_train)

# Hard labels feed print_metrics; class probabilities feed the ROC analysis.
y_pred_iodds_svm = clf_iodds_svm.predict(X_test)
y_pred_proba_iodds_svm = clf_iodds_svm.predict_proba(X_test)

# Column 1 of predict_proba is the score used for both the AUC and the curve.
roc_score_svm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_iodds_svm[:, 1])
fpr_iodds_svm, tpr_iodds_svm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_iodds_svm[:, 1])
print(f"\nROC AUC: {roc_score_svm:.4f}")

print_metrics(y_test, y_pred_iodds_svm)

# The notebook's plot_roc_curve helper takes tpr before fpr.
plot_roc_curve(tpr=tpr_iodds_svm, fpr=fpr_iodds_svm, label='SVM (SFS)', scatter=False)

### All:

In [None]:
# Overlay the ROC curves of every IODDS model on a single figure.
plt.figure(figsize=(8, 8))

# Diagonal reference line; it is drawn first, so it is ax.lines[0].
ax = sns.lineplot(x=[0, 1], y=[0, 1], color='gray', label='Threshold')
ax.lines[0].set_linestyle('--')

# estimator=None makes seaborn draw every (fpr, tpr) point as given. With the
# default estimator, lineplot averages tpr over repeated fpr values and adds a
# bootstrapped confidence band, which distorts the vertical steps of a ROC
# curve.
sns.lineplot(x=fpr_iodds_decision_tree, y=tpr_iodds_decision_tree, label='Decision Tree', estimator=None)
sns.lineplot(x=fpr_iodds_rf, y=tpr_iodds_rf, label='Random Forest', estimator=None)
sns.lineplot(x=fpr_iodds_ada, y=tpr_iodds_ada, label='AdaBoost', estimator=None)
sns.lineplot(x=fpr_iodds_gbm, y=tpr_iodds_gbm, label='GBM', estimator=None)
sns.lineplot(x=fpr_iodds_xgb, y=tpr_iodds_xgb, label='XGB', estimator=None)
sns.lineplot(x=fpr_iodds_knn, y=tpr_iodds_knn, label='KNN', estimator=None)
sns.lineplot(x=fpr_iodds_svm, y=tpr_iodds_svm, label='SVM', estimator=None)

# Small margin around the unit square so the end points stay visible.
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")

## BODDS - Exp 6

### Decision tree (SFS):

In [None]:
from sklearn.tree import DecisionTreeClassifier
from lib.experiment_helper import prepare_data_exp_bodds, print_metrics

# Columns kept for the BODDS decision-tree run (heading labels it "SFS").
cols_to_keep = [
    'mc_get_grupo_sanguineo_2.0',
    'mc_get_grupo_sanguineo_3.0',
    'mc_mul_est_civil_1.0',
    'mc_mul_est_civil_2.0',
    'mc_mul_nivel_inseguranca_0.0',
    'mc_mul_qtd_filhos_vivos_2.0',
    'mc_dae_escolaridade_0.0',
    'mc_dae_escolaridade_7.0',
    'mc_dae_escolaridade_8.0',
    'mc_dae_trat_agua_uso_3.0',
]

# Build the train/test features and labels through the BODDS helper; the
# fifth value it returns is not used in this cell.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bodds(df, cols_to_keep)

# Entropy-based tree with random split selection; no random_state is set, so
# results can vary between runs.
clf_bodds_decision_tree = DecisionTreeClassifier(splitter='random', criterion='entropy')
clf_bodds_decision_tree.fit(X_train, y_train)

# Hard labels feed print_metrics; class probabilities feed the ROC analysis.
y_pred_bodds_decision_tree = clf_bodds_decision_tree.predict(X_test)
y_pred_proba_bodds_decision_tree = clf_bodds_decision_tree.predict_proba(X_test)

# Column 1 of predict_proba is the score used for both the AUC and the curve.
roc_score_decision_tree = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bodds_decision_tree[:, 1])
fpr_bodds_decision_tree, tpr_bodds_decision_tree, _ = roc_curve(
    y_true=y_test,
    y_score=y_pred_proba_bodds_decision_tree[:, 1],
)
print(f"\nROC AUC: {roc_score_decision_tree:.4f}")

print_metrics(y_test, y_pred_bodds_decision_tree)

# The notebook's plot_roc_curve helper takes tpr before fpr.
plot_roc_curve(
    tpr=tpr_bodds_decision_tree,
    fpr=fpr_bodds_decision_tree,
    label='Decision Tree (SFS)',
    scatter=False,
)

### Random forest (SFS):

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lib.experiment_helper import prepare_data_exp_bodds, print_metrics

# Columns kept for the BODDS random-forest run (heading labels it "SFS").
cols_to_keep = [
    'mc_get_fumo_0.0',
    'mc_get_grupo_sanguineo_1.0',
    'mc_get_grupo_sanguineo_2.0',
    'mc_mul_est_civil_1.0',
    'mc_mul_est_civil_2.0',
    'mc_mul_est_civil_3.0',
    'mc_mul_nivel_inseguranca_0.0',
    'mc_mul_qtd_gest_0.0',
    'mc_mul_tipo_const_casa_0.0',
    'mc_mul_tipo_const_casa_1.0',
    'mc_dae_escolaridade_1.0',
    'mc_dae_escolaridade_3.0',
    'mc_dae_escolaridade_8.0',
    'mc_dae_rfa_2.0',
    'mc_dae_trat_agua_uso_0.0',
    'mc_dae_trat_agua_uso_1.0',
    'mc_dae_trat_agua_uso_2.0',
    'mc_dae_trat_agua_uso_3.0',
]

# Build the train/test features and labels through the BODDS helper; the
# fifth value it returns is not used in this cell.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bodds(df, cols_to_keep)

# 100 entropy-based trees; no random_state is set, so results can vary
# between runs.
clf_bodds_rf = RandomForestClassifier(n_estimators=100, criterion='entropy')
clf_bodds_rf.fit(X_train, y_train)

# Hard labels feed print_metrics; class probabilities feed the ROC analysis.
y_pred_bodds_rf = clf_bodds_rf.predict(X_test)
y_pred_proba_bodds_rf = clf_bodds_rf.predict_proba(X_test)

# Column 1 of predict_proba is the score used for both the AUC and the curve.
roc_score_rf = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bodds_rf[:, 1])
fpr_bodds_rf, tpr_bodds_rf, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bodds_rf[:, 1])
print(f"\nROC AUC: {roc_score_rf:.4f}")

print_metrics(y_test, y_pred_bodds_rf)

# The notebook's plot_roc_curve helper takes tpr before fpr.
plot_roc_curve(tpr=tpr_bodds_rf, fpr=fpr_bodds_rf, label='Random Forest (SFS)', scatter=False)

### AdaBoost (SFS):

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from lib.experiment_helper import prepare_data_exp_bodds, print_metrics

# Columns kept for the BODDS AdaBoost run (heading labels it "SFS").
cols_to_keep = [
    'mc_get_vacina_anti_tetanica',
    'mc_get_fator_rh_1.0',
    'mc_get_gravidez_planejada_0.0',
    'mc_get_gravidez_planejada_1.0',
    'mc_get_grupo_sanguineo_3.0',
    'mc_get_risco_gestacional_1.0',
    'mc_mul_chefe_familia_0.0',
    'mc_mul_est_civil_1.0',
    'mc_mul_est_civil_3.0',
    'mc_mul_nivel_inseguranca_0.0',
    'mc_mul_qtd_aborto_2.0',
    'mc_mul_qtd_filhos_vivos_1.0',
    'mc_mul_qtd_filhos_vivos_3.0',
    'mc_mul_qtd_gest_3.0',
    'mc_mul_rec_inf_plan_fam_1.0',
    'mc_mul_tipo_const_casa_3.0',
    'mc_dae_escolaridade_0.0',
    'mc_dae_escolaridade_1.0',
    'mc_dae_escolaridade_4.0',
    'mc_dae_escolaridade_5.0',
    'mc_dae_escolaridade_7.0',
    'mc_dae_numero_res_domic_0.0',
    'mc_dae_numero_res_domic_1.0',
    'mc_dae_numero_res_domic_3.0',
    'mc_dae_possui_horta_0.0',
]

# Build the train/test features and labels through the BODDS helper; the
# fifth value it returns is not used in this cell.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bodds(df, cols_to_keep)

# AdaBoost with the hyper-parameters fixed for this experiment.
clf_bodds_ada = AdaBoostClassifier(n_estimators=50, learning_rate=0.5)
clf_bodds_ada.fit(X_train, y_train)

# Hard labels feed print_metrics; class probabilities feed the ROC analysis.
y_pred_bodds_ada = clf_bodds_ada.predict(X_test)
y_pred_proba_bodds_ada = clf_bodds_ada.predict_proba(X_test)

# Column 1 of predict_proba is the score used for both the AUC and the curve.
roc_score_ada = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bodds_ada[:, 1])
fpr_bodds_ada, tpr_bodds_ada, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bodds_ada[:, 1])
print(f"\nROC AUC: {roc_score_ada:.4f}")

print_metrics(y_test, y_pred_bodds_ada)

# The notebook's plot_roc_curve helper takes tpr before fpr.
plot_roc_curve(tpr=tpr_bodds_ada, fpr=fpr_bodds_ada, label='AdaBoost (SFS)', scatter=False)

### GBM (SFS):

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from lib.experiment_helper import prepare_data_exp_bodds, print_metrics

# Columns kept for the BODDS gradient-boosting run (heading labels it "SFS").
cols_to_keep = [
    'mc_get_fumo_0.0',
    'mc_get_grupo_sanguineo_2.0',
    'mc_get_grupo_sanguineo_3.0',
    'mc_mul_est_civil_1.0',
    'mc_mul_nivel_inseguranca_0.0',
    'mc_mul_qtd_aborto_1.0',
    'mc_mul_tipo_const_casa_2.0',
    'mc_mul_tipo_const_casa_3.0',
    'mc_dae_escolaridade_8.0',
    'mc_dae_numero_res_domic_0.0',
    'mc_dae_numero_res_domic_1.0',
    'mc_dae_possui_arv_frut_0.0',
    'mc_dae_rfa_2.0',
    'mc_dae_trat_agua_uso_1.0',
    'mc_dae_trat_agua_uso_3.0',
]

# Build the train/test features and labels through the BODDS helper; the
# fifth value it returns is not used in this cell.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bodds(df, cols_to_keep)

# Gradient boosting with exponential loss, 150 stages and a learning rate of 1.
clf_bodds_gbm = GradientBoostingClassifier(
    loss='exponential',
    learning_rate=1,
    n_estimators=150,
)
clf_bodds_gbm.fit(X_train, y_train)

# Hard labels feed print_metrics; class probabilities feed the ROC analysis.
y_pred_bodds_gbm = clf_bodds_gbm.predict(X_test)
y_pred_proba_bodds_gbm = clf_bodds_gbm.predict_proba(X_test)

# Column 1 of predict_proba is the score used for both the AUC and the curve.
roc_score_gbm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bodds_gbm[:, 1])
fpr_bodds_gbm, tpr_bodds_gbm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bodds_gbm[:, 1])
print(f"\nROC AUC: {roc_score_gbm:.4f}")

print_metrics(y_test, y_pred_bodds_gbm)

# The notebook's plot_roc_curve helper takes tpr before fpr.
plot_roc_curve(tpr=tpr_bodds_gbm, fpr=fpr_bodds_gbm, label='GBM (SFS)', scatter=False)

### XGBoost (SBS):

In [None]:
from xgboost import XGBClassifier
from lib.experiment_helper import prepare_data_exp_bodds, print_metrics

# Columns kept for the BODDS XGBoost run (heading labels it "SBS").
cols_to_keep = [
    'mc_get_vacina_anti_tetanica',
    'idade',
    'mc_get_alcool_0.0',
    'mc_get_alcool_1.0',
    'mc_get_fumo_1.0',
    'mc_get_gravidez_planejada_0.0',
    'mc_get_grupo_sanguineo_0.0',
    'mc_get_grupo_sanguineo_2.0',
    'mc_get_grupo_sanguineo_3.0',
    'mc_get_risco_gestacional_0.0',
    'mc_get_risco_gestacional_1.0',
    'mc_mul_chefe_familia_0.0',
    'mc_mul_chefe_familia_1.0',
    'mc_mul_est_civil_4.0',
    'mc_mul_nivel_inseguranca_0.0',
    'mc_mul_qtd_filhos_vivos_2.0',
    'mc_mul_qtd_gest_0.0',
    'mc_mul_qtd_gest_2.0',
    'mc_mul_rec_inf_plan_fam_0.0',
    'mc_mul_tipo_const_casa_1.0',
    'mc_mul_tipo_const_casa_4.0',
    'mc_dae_escolaridade_5.0',
    'mc_dae_escolaridade_8.0',
    'mc_dae_mrd_lgd_red_esg_0.0',
    'mc_dae_numero_res_domic_4.0',
    'mc_dae_possui_arv_frut_0.0',
    'mc_dae_possui_arv_frut_1.0',
    'mc_dae_possui_horta_0.0',
    'mc_dae_rfa_0.0',
    'mc_dae_rfa_1.0',
    'mc_dae_sit_moradia_1.0',
    'mc_dae_sit_moradia_2.0',
    'mc_dae_trat_agua_uso_0.0',
    'mc_dae_trat_agua_uso_1.0',
    'mc_dae_trat_agua_uso_3.0',
]

# Build the train/test features and labels through the BODDS helper; the
# fifth value it returns is not used in this cell.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bodds(df, cols_to_keep)

# XGBoost with the hyper-parameters fixed for this experiment.
clf_bodds_xgb = XGBClassifier(max_depth=5, learning_rate=0.3)
clf_bodds_xgb.fit(X_train, y_train)

# Hard labels feed print_metrics; class probabilities feed the ROC analysis.
y_pred_bodds_xgb = clf_bodds_xgb.predict(X_test)
y_pred_proba_bodds_xgb = clf_bodds_xgb.predict_proba(X_test)

# Column 1 of predict_proba is the score used for both the AUC and the curve.
roc_score_xgb = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bodds_xgb[:, 1])
fpr_bodds_xgb, tpr_bodds_xgb, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bodds_xgb[:, 1])
print(f"\nROC AUC: {roc_score_xgb:.4f}")

print_metrics(y_test, y_pred_bodds_xgb)

# The notebook's plot_roc_curve helper takes tpr before fpr.
plot_roc_curve(tpr=tpr_bodds_xgb, fpr=fpr_bodds_xgb, label='XGB (SBS)', scatter=False)

### KNN (SFS):

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from lib.experiment_helper import prepare_data_exp_bodds, print_metrics

# Columns kept for the BODDS k-nearest-neighbours run (heading labels it "SFS").
cols_to_keep = [
    'mc_get_fator_rh_1.0',
    'mc_get_fumo_0.0',
    'mc_get_grupo_sanguineo_0.0',
    'mc_get_grupo_sanguineo_2.0',
    'mc_get_grupo_sanguineo_3.0',
    'mc_mul_est_civil_1.0',
    'mc_mul_nivel_inseguranca_0.0',
    'mc_mul_tipo_const_casa_1.0',
    'mc_mul_tipo_const_casa_3.0',
    'mc_dae_escolaridade_7.0',
]

# Build the train/test features and labels through the BODDS helper; the
# fifth value it returns is not used in this cell.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bodds(df, cols_to_keep)

# 5-NN with p=1 (Manhattan distance) and unweighted votes.
clf_bodds_knn = KNeighborsClassifier(weights='uniform', p=1, n_neighbors=5)
clf_bodds_knn.fit(X_train, y_train)

# Hard labels feed print_metrics; class probabilities feed the ROC analysis.
y_pred_bodds_knn = clf_bodds_knn.predict(X_test)
y_pred_proba_bodds_knn = clf_bodds_knn.predict_proba(X_test)

# Column 1 of predict_proba is the score used for both the AUC and the curve.
roc_score_knn = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bodds_knn[:, 1])
fpr_bodds_knn, tpr_bodds_knn, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bodds_knn[:, 1])
print(f"\nROC AUC: {roc_score_knn:.4f}")

print_metrics(y_test, y_pred_bodds_knn)

# The notebook's plot_roc_curve helper takes tpr before fpr.
plot_roc_curve(tpr=tpr_bodds_knn, fpr=fpr_bodds_knn, label='KNN (SFS)', scatter=False)

### SVM (SFS):

In [None]:
from sklearn.svm import SVC
from lib.experiment_helper import prepare_data_exp_bodds, print_metrics

# Columns kept for the BODDS SVM run (heading labels it "SFS").
cols_to_keep = [
    'mc_get_fumo_0.0',
    'mc_get_grupo_sanguineo_2.0',
    'mc_get_grupo_sanguineo_3.0',
    'mc_mul_est_civil_1.0',
    'mc_mul_est_civil_2.0',
    'mc_mul_est_civil_3.0',
    'mc_mul_nivel_inseguranca_0.0',
    'mc_mul_qtd_filhos_vivos_2.0',
    'mc_mul_tipo_const_casa_1.0',
    'mc_dae_escolaridade_5.0',
    'mc_dae_escolaridade_8.0',
    'mc_dae_rfa_1.0',
    'mc_dae_trat_agua_uso_0.0',
    'mc_dae_trat_agua_uso_3.0',
]

# Build the train/test features and labels through the BODDS helper; the
# fifth value it returns is not used in this cell.
X_train, X_test, y_train, y_test, _ = prepare_data_exp_bodds(df, cols_to_keep)

# probability=True is required so that predict_proba is available below.
clf_bodds_svm = SVC(kernel='rbf', gamma='scale', probability=True)
clf_bodds_svm.fit(X_train, y_train)

# Hard labels feed print_metrics; class probabilities feed the ROC analysis.
y_pred_bodds_svm = clf_bodds_svm.predict(X_test)
y_pred_proba_bodds_svm = clf_bodds_svm.predict_proba(X_test)

# Column 1 of predict_proba is the score used for both the AUC and the curve.
roc_score_svm = roc_auc_score(y_true=y_test, y_score=y_pred_proba_bodds_svm[:, 1])
fpr_bodds_svm, tpr_bodds_svm, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_bodds_svm[:, 1])
print(f"\nROC AUC: {roc_score_svm:.4f}")

print_metrics(y_test, y_pred_bodds_svm)

# The notebook's plot_roc_curve helper takes tpr before fpr.
plot_roc_curve(tpr=tpr_bodds_svm, fpr=fpr_bodds_svm, label='SVM (SFS)', scatter=False)

### All:

In [None]:
# Overlay the ROC curves of every BODDS model on a single figure.
plt.figure(figsize=(8, 8))

# Diagonal reference line; it is drawn first, so it is ax.lines[0].
ax = sns.lineplot(x=[0, 1], y=[0, 1], color='gray', label='Threshold')
ax.lines[0].set_linestyle('--')

# estimator=None makes seaborn draw every (fpr, tpr) point as given. With the
# default estimator, lineplot averages tpr over repeated fpr values and adds a
# bootstrapped confidence band, which distorts the vertical steps of a ROC
# curve.
sns.lineplot(x=fpr_bodds_decision_tree, y=tpr_bodds_decision_tree, label='Decision Tree', estimator=None)
sns.lineplot(x=fpr_bodds_rf, y=tpr_bodds_rf, label='Random Forest', estimator=None)
sns.lineplot(x=fpr_bodds_ada, y=tpr_bodds_ada, label='AdaBoost', estimator=None)
sns.lineplot(x=fpr_bodds_gbm, y=tpr_bodds_gbm, label='GBM', estimator=None)
sns.lineplot(x=fpr_bodds_xgb, y=tpr_bodds_xgb, label='XGB', estimator=None)
sns.lineplot(x=fpr_bodds_knn, y=tpr_bodds_knn, label='KNN', estimator=None)
sns.lineplot(x=fpr_bodds_svm, y=tpr_bodds_svm, label='SVM', estimator=None)

# Small margin around the unit square so the end points stay visible.
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")