# Capítulo 08 - Métodos baseados em árvore

## Comparando em problemas de classificação:

1. Árvores de decisão
2. Bagging
3. Random forest
4. Boosting - Gradient Tree Boosting


In [104]:
import os

from feature_engine.encoding import CountFrequencyEncoder, OneHotEncoder, OrdinalEncoder
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import GradientBoostingClassifier as gb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.pipeline import Pipeline


os.chdir('../')
from src.data_describe import DataDescribe as dd
os.chdir('./ch08_tree_based_methods/')

RANDOM_STATE = 42

In [88]:
df = pd.read_csv('../data/Heart.csv', index_col=0)

print(f"df.shape: {df.shape}")
df.head()

df.shape: (303, 14)


Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [89]:
df["AHD"].value_counts()

No     164
Yes    139
Name: AHD, dtype: int64

In [90]:
# Quantidade de nulos
df_copy = df.copy()
df_copy.dropna(axis=0, how='any', inplace=True)

df.isnull().sum()[df.isnull().sum()>0]

Ca      4
Thal    2
dtype: int64

In [91]:
df_cardinalidade = dd.cardinalidade(df_copy)
lst_int_to_float = df_cardinalidade[df_cardinalidade["Cardinalidade"]>20]["Atributo"].tolist()

for column in lst_int_to_float:
    df_copy[column] = df_copy[column].astype(float)
    
dd.cardinalidade(df_copy)

Unnamed: 0,Atributo,Cardinalidade,Valores
7,AHD,2,"[No, Yes]"
4,ExAng,2,"[0, 1]"
2,Fbs,2,"[0, 1]"
0,Sex,2,"[0, 1]"
3,RestECG,3,"[0, 1, 2]"
5,Slope,3,"[1, 2, 3]"
6,Thal,3,"[fixed, normal, reversable]"
1,ChestPain,4,"[asymptomatic, nonanginal, nontypical, typical]"


### Preparando os dados

#### Transformando dados categóricos em númericos.

Nesse momento, apenas substituiremos as classes de cada atributo categórico pela sua frequência de sua ocorrência.

In [92]:
df_copy["AHD_binary"] = np.where(df_copy["AHD"]=="No", 0, 1)
df_copy[["AHD_binary", "AHD"]].head()

Unnamed: 0,AHD_binary,AHD
1,0,No
2,1,Yes
3,1,Yes
4,0,No
5,0,No


In [93]:
X = df_copy.drop(labels=["AHD_binary", "AHD"], axis=1)
y = df_copy["AHD_binary"]

# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

encoder = CountFrequencyEncoder(encoding_method='frequency', variables=['Thal', 'ChestPain'])

encoder.fit(X_train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

encoder.encoder_dict_

{'Thal': {'normal': 0.5507246376811594,
  'reversable': 0.391304347826087,
  'fixed': 0.057971014492753624},
 'ChestPain': {'asymptomatic': 0.4927536231884058,
  'nonanginal': 0.28502415458937197,
  'nontypical': 0.14009661835748793,
  'typical': 0.0821256038647343}}

### Aplicando os modelos

In [94]:
lst_model = ["decision tree", "bagging", "random forest", "gradient boosting machine"]
max_depth = len(X_train.columns)/2

result = {}

for choosen_model in lst_model:
    if choosen_model == "decision tree":
        # Decision Tree
        model = dt(random_state=RANDOM_STATE, max_depth=max_depth)
    elif choosen_model == "bagging":
        # Bagging
        model = rf(max_features=None, random_state=RANDOM_STATE, max_depth=max_depth)
    elif choosen_model == "random forest":
        # Random forest
        model = rf(max_features='sqrt', random_state=RANDOM_STATE, max_depth=max_depth)
    elif choosen_model == "gradient boosting machine":
        # Gradient boosting
        model = gb(random_state=RANDOM_STATE, max_depth=max_depth)
           
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    result[choosen_model] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1_score": f1_score(y_test, y_pred),
        "ROC_auc_score": roc_auc_score(y_test, y_pred),
        "Precision_score": precision_score(y_test, y_pred),
        "Recall_score": recall_score(y_test, y_pred)
    } 
    
#     print(f"""Accuracy: {round(accuracy_score(y_test, y_pred), 2)}
#     F1_score: {round(f1_score(y_test, y_pred), 2)}
#     ROC_auc_score: {round(roc_auc_score(y_test, y_pred), 2)}
#     Precision_score: {round(precision_score(y_test, y_pred), 2)}
#     Recall_score: {round(recall_score(y_test, y_pred), 2)}
#     """)

df_result = pd.DataFrame.from_dict(result, orient='index').sort_values(by='ROC_auc_score', ascending=False)
df_result

Unnamed: 0,Accuracy,F1_score,ROC_auc_score,Precision_score,Recall_score
random forest,0.855556,0.835443,0.851419,0.868421,0.804878
bagging,0.822222,0.804878,0.820806,0.804878,0.804878
gradient boosting machine,0.8,0.775,0.796416,0.794872,0.756098
decision tree,0.733333,0.7,0.729219,0.717949,0.682927


## Comparando em problemas de regressão:

1. Árvores de decisão
2. Bagging
3. Random forest
4. Boosting - Gradient Tree Boosting

In [95]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.tree import DecisionTreeRegressor as dtr

In [96]:
df_housing = pd.DataFrame(load_boston()['data'], columns=load_boston()['feature_names'])
df_housing['target'] = load_boston()['target'].reshape(-1,1)

print(f"df_housing.shape: {df_housing.shape}")
df_housing.head()

df_housing.shape: (506, 14)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [97]:
X = df_housing.drop(labels=["target"], axis=1)
y = df_housing["target"]

# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

In [98]:
lst_model = ["decision tree", "bagging", "random forest", "gradient boosting machine"]
max_depth = len(X_train.columns)/2

result = {}

for choosen_model in lst_model:
    if choosen_model == "decision tree":
        # Decision Tree
        model = dtr(random_state=RANDOM_STATE, max_depth=max_depth)
    elif choosen_model == "bagging":
        # Bagging
        model = rfr(
            max_features=None, random_state=RANDOM_STATE, max_depth=max_depth, n_jobs=-1
            , n_estimators = 1000
        )
    elif choosen_model == "random forest":
        # Random forest
        model = rfr(
            max_features='sqrt', random_state=RANDOM_STATE, max_depth=max_depth, n_jobs=-1
            , n_estimators = 1000
        )
    elif choosen_model == "gradient boosting machine":
        # Gradient boosting
        model = gbr(
            random_state=RANDOM_STATE, max_depth=max_depth)
           
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    result[choosen_model] = {
        "RMSE": mean_squared_error(y_test, y_pred)**0.5,
        "MSE": mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
    
    cross = cross_validate(model, X.values, y, cv=10, scoring="neg_mean_squared_error", return_train_score=True)
    print("\n"+choosen_model)
    for key in cross.keys():
        print(f"""{key}: {round(-cross[key].mean(), 10)}""")
    print("-"*30)
    
#     print(f"""Accuracy: {round(accuracy_score(y_test, y_pred), 2)}
#     F1_score: {round(f1_score(y_test, y_pred), 2)}
#     ROC_auc_score: {round(roc_auc_score(y_test, y_pred), 2)}
#     Precision_score: {round(precision_score(y_test, y_pred), 2)}
#     Recall_score: {round(recall_score(y_test, y_pred), 2)}
#     """)

df_result = pd.DataFrame.from_dict(result, orient='index').sort_values(by='RMSE', ascending=True)
df_result


decision tree
fit_time: -0.0021992445
score_time: -0.0002998114
test_score: 34.2127998276
train_score: 4.4389407796
------------------------------

bagging
fit_time: -2.7474698782
score_time: -0.0868993521
test_score: 22.4051044717
train_score: 3.6548788486
------------------------------

random forest
fit_time: -0.5954256535
score_time: -0.0904506922
test_score: 22.1636221203
train_score: 4.7239062945
------------------------------

gradient boosting machine
fit_time: -0.1539128065
score_time: -0.0004955292
test_score: 27.307470045
train_score: 0.0508329155
------------------------------


Unnamed: 0,RMSE,MSE,MAE
gradient boosting machine,2.71892,7.392524,1.908958
bagging,3.162192,9.999457,2.15975
decision tree,3.224707,10.398734,2.373466
random forest,3.597804,12.944197,2.335882


## Exercício 8

Página 333

In [106]:
df_carseats = pd.read_csv('../data/Carseats.csv', index_col=0)

print(f"df_housing.shape: {df_carseats.shape}")

df_carseats.head()

df_housing.shape: (400, 11)


Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
1,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
2,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
3,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
4,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
5,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [130]:
X = df_carseats.drop(labels=["Sales"], axis=1)
y = df_carseats["Sales"]

# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [132]:
pipe = Pipeline([
    ("ohe", OneHotEncoder(variables=['Urban', 'US'], drop_last=True)),
    ("ordinal", OrdinalEncoder(encoding_method='ordered', variables=['ShelveLoc']))
])

pipe.fit(X_train, y_train)

X_train_encoded = pipe.transform(X_train)
X_test_encoded = pipe.transform(X_test)

X_train_encoded.sort_index().head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban_Yes,US_Yes
2,111,48,16,260,83,2,65,10,1,1
3,113,35,10,269,80,1,59,12,1,1
4,117,100,4,466,97,1,55,14,1,1
5,141,64,3,340,128,0,38,13,1,0
7,115,105,0,45,108,1,71,15,1,0


In [135]:
mtry=6
lst_ntree=[25, 500]
max_depth = len(X_train_encoded.columns)/2

for ntree in lst_ntree:
    model = rfr(
        max_features=mtry, random_state=RANDOM_STATE, max_depth=max_depth,
        n_jobs=-1, n_estimators=ntree
        )

    model.fit(X_train_encoded, y_train)

    y_pred = model.predict(X_test_encoded)
    
    cross = cross_validate(model, X_train_encoded.values, y_train, cv=5, scoring="neg_mean_squared_error", return_train_score=True)
    print(f"""ntree: {ntree}""")
    for key in [key for key in cross.keys() if '_time' not in key]:
        print(f"""{key}: {round(-cross[key].mean(), 5)}""")
    print("-"*30)

ntree: 25
test_score: 3.30918
train_score: 1.51677
------------------------------
ntree: 500
test_score: 3.24157
train_score: 1.41182
------------------------------
