# Capítulo 08 - Métodos baseados em árvore

## Comparando:

1. Árvores de decisão
2. Bagging
3. Random forest
4. Boosting - Gradient Tree Boosting


In [5]:
import os

from feature_engine.encoding import CountFrequencyEncoder
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import GradientBoostingClassifier as gb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.pipeline import Pipeline


os.chdir('../')
from src.data_describe import DataDescribe as dd
os.chdir('./ch08_tree_based_methods/')

In [6]:
df = pd.read_csv('../data/Heart.csv', index_col=0)

print(f"df.shape: {df.shape}")
df.head()

df.shape: (303, 14)


Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [7]:
# Quantidade de nulos
df_copy = df.copy()
df_copy.dropna(axis=0, how='any', inplace=True)

df.isnull().sum()[df.isnull().sum()>0]

Ca      4
Thal    2
dtype: int64

In [8]:
df_cardinalidade = dd.cardinalidade(df_copy)
lst_int_to_float = df_cardinalidade[df_cardinalidade["Cardinalidade"]>20]["Atributo"].tolist()

for column in lst_int_to_float:
    df_copy[column] = df_copy[column].astype(float)
    
dd.cardinalidade(df_copy)

Unnamed: 0,Atributo,Cardinalidade,Valores
7,AHD,2,"[No, Yes]"
4,ExAng,2,"[0, 1]"
2,Fbs,2,"[0, 1]"
0,Sex,2,"[0, 1]"
3,RestECG,3,"[0, 1, 2]"
5,Slope,3,"[1, 2, 3]"
6,Thal,3,"[fixed, normal, reversable]"
1,ChestPain,4,"[asymptomatic, nonanginal, nontypical, typical]"


### Preparando os dados

#### Transformando dados categóricos em númericos.

Nesse momento, apenas substituiremos as classes de cada atributo categórico pela sua frequência de sua ocorrência.

In [11]:
df_copy["AHD_binary"] = np.where(df_copy["AHD"]=="No", 0, 1)
df_copy[["AHD_binary", "AHD"]].head()

Unnamed: 0,AHD_binary,AHD
1,0,No
2,1,Yes
3,1,Yes
4,0,No
5,0,No


In [15]:
X = df_copy.drop(labels=["AHD_binary", "AHD"], axis=1)
y = df_copy["AHD_binary"]

In [18]:
# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

encoder = CountFrequencyEncoder(encoding_method='frequency', variables=['Thal', 'ChestPain'])

encoder.fit(X_train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

encoder.encoder_dict_

{'Thal': {'normal': 0.5507246376811594,
  'reversable': 0.391304347826087,
  'fixed': 0.057971014492753624},
 'ChestPain': {'asymptomatic': 0.4927536231884058,
  'nonanginal': 0.28502415458937197,
  'nontypical': 0.14009661835748793,
  'typical': 0.0821256038647343}}

### Aplicando os modelos

In [27]:
# Decision Tree
model = dt()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"""Accuracy: {round(accuracy_score(y_test, y_pred), 2)}
F1_score: {round(f1_score(y_test, y_pred), 2)}
ROC_auc_score: {round(roc_auc_score(y_test, y_pred), 2)}
Precision_score: {round(precision_score(y_test, y_pred), 2)}
Recall_score: {round(recall_score(y_test, y_pred), 2)}
""")

Accuracy: 0.71
F1_score: 0.7
ROC_auc_score: 0.71
Precision_score: 0.67
Recall_score: 0.73



In [28]:
# Bagging
model = rf(max_features=None)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"""Accuracy: {round(accuracy_score(y_test, y_pred), 2)}
F1_score: {round(f1_score(y_test, y_pred), 2)}
ROC_auc_score: {round(roc_auc_score(y_test, y_pred), 2)}
Precision_score: {round(precision_score(y_test, y_pred), 2)}
Recall_score: {round(recall_score(y_test, y_pred), 2)}
""")

Accuracy: 0.82
F1_score: 0.8
ROC_auc_score: 0.82
Precision_score: 0.8
Recall_score: 0.8



In [30]:
# Random forest
model = rf(max_features='sqrt')

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"""Accuracy: {round(accuracy_score(y_test, y_pred), 2)}
F1_score: {round(f1_score(y_test, y_pred), 2)}
ROC_auc_score: {round(roc_auc_score(y_test, y_pred), 2)}
Precision_score: {round(precision_score(y_test, y_pred), 2)}
Recall_score: {round(recall_score(y_test, y_pred), 2)}
""")

Accuracy: 0.86
F1_score: 0.83
ROC_auc_score: 0.85
Precision_score: 0.89
Recall_score: 0.78



In [31]:
# Gradient boosting
model = gb()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"""Accuracy: {round(accuracy_score(y_test, y_pred), 2)}
F1_score: {round(f1_score(y_test, y_pred), 2)}
ROC_auc_score: {round(roc_auc_score(y_test, y_pred), 2)}
Precision_score: {round(precision_score(y_test, y_pred), 2)}
Recall_score: {round(recall_score(y_test, y_pred), 2)}
""")

Accuracy: 0.82
F1_score: 0.81
ROC_auc_score: 0.82
Precision_score: 0.79
Recall_score: 0.83

