# Решающие деревья

Авторы: Гирдюк Дмитрий, Никольская Анастасия

In [None]:
import random

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas
import pandas as pd
import yaml
from matplotlib.colors import ListedColormap
from mushrooms_data_engineering import prepare_mushrooms_data
from sklearn import metrics
from sklearn.datasets import make_circles, make_blobs, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from wdbc_data_engineering import prepare_wdbc_data


# Global experiment settings: the random seed passed to every split/model
# below and the fraction of samples that goes to the training part.
SEED = 314159
TRAIN_TEST_SPLIT = 0.80


# Dataset locations are read from a YAML config one directory up.
# The encoding is pinned so that parsing does not depend on the platform's
# default locale encoding.
with open("../config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

### Препроцессинг

#### WDBC

In [None]:
# Read the WDBC table from the path stored in the config.
df_wdbc = pd.read_csv(cfg["wdbc"])

# Target stays a one-column frame with the labels recoded: "B" -> 0, "M" -> 1.
y_wdbc = df_wdbc[["diagnosis"]].replace({"B": 0, "M": 1})
# Everything except these three columns (the target included) is a feature.
df_wdbc = df_wdbc.drop(columns=["id", "Unnamed: 32", "diagnosis"])

# Seeded hold-out split; the test share is the complement of TRAIN_TEST_SPLIT.
df_wdbc_train, df_wdbc_test, y_wdbc_train, y_wdbc_test = train_test_split(
    df_wdbc,
    y_wdbc,
    test_size=1 - TRAIN_TEST_SPLIT,
    random_state=SEED,
)

# Project-specific feature preparation applied to both parts of the split.
df_wdbc_train, df_wdbc_test = prepare_wdbc_data(
    df_train=df_wdbc_train,
    df_test=df_wdbc_test,
)

#### Mushrooms

In [None]:
# Read the mushrooms table from the path stored in the config.
df_m = pd.read_csv(cfg["mushrooms"])

# Target stays a one-column frame with the labels recoded: "e" -> 0, "p" -> 1.
y_m = df_m[["class"]].replace({"e": 0, "p": 1})
# All remaining columns are features.
df_m = df_m.drop(columns=["class"])

# Seeded hold-out split; the test share is the complement of TRAIN_TEST_SPLIT.
df_m_train, df_m_test, y_m_train, y_m_test = train_test_split(
    df_m,
    y_m,
    test_size=1 - TRAIN_TEST_SPLIT,
    random_state=SEED,
)

# Project-specific feature preparation applied to both parts of the split.
df_m_train, df_m_test = prepare_mushrooms_data(
    df_train=df_m_train,
    df_test=df_m_test,
)

### Решающее дерево для WDBC

In [None]:
# Depth-limited tree with a minimum leaf size, fitted on the WDBC training part.
dt_wdbc = DecisionTreeClassifier(
    random_state=SEED, max_depth=5, min_samples_leaf=30
).fit(df_wdbc_train, y_wdbc_train)

# ROC AUC is computed from column 1 of predict_proba (the label recoded as 1).
wdbc_test_scores = dt_wdbc.predict_proba(df_wdbc_test)[:, 1]
print(
    "Test accuracy:",
    dt_wdbc.score(df_wdbc_test, y_wdbc_test),
    "Test ROCAUC:",
    roc_auc_score(y_wdbc_test, wdbc_test_scores),
)

В модуле `sklearn.tree` реализована функция `plot_tree` для отрисовки обученного дерева:

In [None]:
# Single axes (the default 1x1 grid) for the fitted WDBC tree.
fig, ax = plt.subplots(figsize=(12, 8))
# The trailing semicolon suppresses the notebook echo of the return value.
plot_tree(dt_wdbc, ax=ax);

Рассмотрим другие значения гиперпараметров.

In [None]:
# Shallower tree (depth 3) with the Gini criterion and no leaf-size limit.
dt_wdbc_gini = DecisionTreeClassifier(random_state=SEED, max_depth=3, criterion="gini")
dt_wdbc_gini.fit(df_wdbc_train, y_wdbc_train)

fig, ax = plt.subplots(figsize=(12, 8))
plot_tree(dt_wdbc_gini, ax=ax);

In [None]:
# Accuracy of the Gini tree on both parts of the split, four decimal places.
print(f"Train accuracy: {dt_wdbc_gini.score(df_wdbc_train, y_wdbc_train):0.4f}")
print(f"Test accuracy: {dt_wdbc_gini.score(df_wdbc_test, y_wdbc_test):0.4f}")

Попробуем вместо критерия Джини использовать энтропию.

In [None]:
# Same depth-3 setup, but splits are chosen by entropy instead of Gini.
dt_wdbc_ent = DecisionTreeClassifier(random_state=SEED, max_depth=3, criterion="entropy")
dt_wdbc_ent.fit(df_wdbc_train, y_wdbc_train)

fig, ax = plt.subplots(figsize=(12, 8))
plot_tree(dt_wdbc_ent, ax=ax);

In [None]:
# Accuracy of the entropy tree on both parts of the split, four decimal places.
print(f"Train accuracy: {dt_wdbc_ent.score(df_wdbc_train, y_wdbc_train):.4f}")
print(f"Test accuracy: {dt_wdbc_ent.score(df_wdbc_test, y_wdbc_test):.4f}")

### Решающее дерево для Mushrooms

In [None]:
# Mushrooms tree with the same hyperparameters as the first WDBC tree.
dt_m = DecisionTreeClassifier(random_state=SEED, max_depth=5, min_samples_leaf=30)
dt_m.fit(df_m_train, y_m_train)

fig, ax = plt.subplots(figsize=(12, 8))
plot_tree(dt_m, ax=ax);

In [None]:
# Accuracy of the mushrooms tree on both parts of the split, four decimal places.
print(f"Train accuracy: {dt_m.score(df_m_train, y_m_train):0.4f}")
print(f"Test accuracy: {dt_m.score(df_m_test, y_m_test):0.4f}")

### Разделяющие границы

In [None]:
# Tree configurations to compare; the insertion order of this dict is the
# order in which they are iterated when the decision boundaries are drawn.
classifiers = {
    "DT_basic_3": DecisionTreeClassifier(random_state=SEED, max_depth=3),
    "DT_basic_5_no_min": DecisionTreeClassifier(random_state=SEED, max_depth=5),
    "DT_basic_5": DecisionTreeClassifier(
        random_state=SEED, max_depth=5, min_samples_leaf=30
    ),
    "DT_gini_5": DecisionTreeClassifier(
        random_state=SEED, criterion="gini", max_depth=5, min_samples_leaf=30
    ),
    "DT_entropy_5": DecisionTreeClassifier(
        random_state=SEED, criterion="entropy", max_depth=5, min_samples_leaf=30
    ),
    "DT_entropy_5_no_min": DecisionTreeClassifier(
        random_state=SEED, criterion="entropy", max_depth=5
    ),
    "DT_entropy_deep": DecisionTreeClassifier(
        random_state=SEED, criterion="entropy", min_samples_leaf=30
    ),
    "DT_gini_deep": DecisionTreeClassifier(
        random_state=SEED, criterion="gini", min_samples_leaf=30
    ),
}

# Two synthetic datasets: noisy moons and noisy concentric circles.
X_moons, y_moons = make_moons(random_state=SEED, n_samples=2000, noise=0.3)
X_moons_train, X_moons_test, y_moons_train, y_moons_test = train_test_split(
    X_moons,
    y_moons,
    test_size=1 - TRAIN_TEST_SPLIT,
    random_state=SEED,
)
X_circles, y_circles = make_circles(
    random_state=SEED, n_samples=2000, noise=0.2, factor=0.3
)
X_circles_train, X_circles_test, y_circles_train, y_circles_test = train_test_split(
    X_circles,
    y_circles,
    test_size=1 - TRAIN_TEST_SPLIT,
    random_state=SEED,
)

# Two features per real dataset, picked by position.
# NOTE(review): the names come from df_wdbc / df_m as they were before the
# prepare_* helpers ran; this assumes those helpers keep the column names.
wdbc_cols = list(df_wdbc.columns[[20, 27]])
m_cols = list(df_m.columns[[4, 10]])

# Each entry is (X_train, X_test, y_train, y_test) as NumPy arrays.
datasets = [
    tuple(
        part.to_numpy()
        for part in (
            df_wdbc_train[wdbc_cols],
            df_wdbc_test[wdbc_cols],
            y_wdbc_train,
            y_wdbc_test,
        )
    ),
    tuple(
        part.to_numpy()
        for part in (
            df_m_train[m_cols],
            df_m_test[m_cols],
            y_m_train,
            y_m_test,
        )
    ),
    (X_moons_train, X_moons_test, y_moons_train, y_moons_test),
    (X_circles_train, X_circles_test, y_circles_train, y_circles_test),
]

In [None]:
def set_grid(
    ax, i: int, j: int, x_min: float, x_max: float, y_min: float, y_max: float
) -> None:
    """Fix the view limits of one subplot and remove its ticks.

    Args:
        ax: Grid of axes, indexed as ``ax[i][j]``.
        i: Row index of the target subplot.
        j: Column index of the target subplot.
        x_min: Lower x-axis limit.
        x_max: Upper x-axis limit.
        y_min: Lower y-axis limit.
        y_max: Upper y-axis limit.
    """
    cell = ax[i][j]
    cell.set_xlim(x_min, x_max)
    cell.set_ylim(y_min, y_max)
    # Empty tick sequences clear the ticks on both axes.
    cell.set_xticks(())
    cell.set_yticks(())


def calculate_borders(X, y) -> tuple[float, float, float, float]:
    """Return plot limits that enclose the points of ``X`` with a 10% margin.

    Args:
        X: Array indexed as ``X[:, 0]`` (x coordinates) and ``X[:, 1]``
            (y coordinates).
        y: Unused; kept so existing calls of the form
            ``calculate_borders(X, y)`` keep working.

    Returns:
        ``(x_min, x_max, y_min, y_max)`` where each axis range is the data
        range widened by 10% of its span on both sides.
    """
    # Read from the argument, not from a global training array, so the
    # result always describes the data that was actually passed in.
    x_lo, x_hi = X[:, 0].min(), X[:, 0].max()
    y_lo, y_hi = X[:, 1].min(), X[:, 1].max()
    x_pad = (x_hi - x_lo) * 0.1
    y_pad = (y_hi - y_lo) * 0.1
    return x_lo - x_pad, x_hi + x_pad, y_lo - y_pad, y_hi + y_pad

In [None]:
# One row per dataset; column 0 shows the raw data, the rest one classifier each.
fig, ax = plt.subplots(len(datasets), len(classifiers) + 1, figsize=(18, 10))

# Colour maps: a diverging map for the decision surface, two solid colours
# for the points of the two classes.
cm = plt.cm.RdBu
cm_bright = ListedColormap(["#FF0000", "#0000FF"])

for dataset_idx, (X_train, X_test, y_train, y_test) in enumerate(datasets):
    x_min, x_max, y_min, y_max = calculate_borders(X_train, y_train)
    limits = {"x_min": x_min, "x_max": x_max, "y_min": y_min, "y_max": y_max}
    is_first_row = dataset_idx == 0
    row = ax[dataset_idx]

    # Show the dataset itself: train points opaque, test points translucent.
    data_ax = row[0]
    if is_first_row:
        data_ax.set_title("Input data")
    data_ax.scatter(
        X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
    )
    data_ax.scatter(
        X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, edgecolors="k", alpha=0.6
    )
    set_grid(ax, i=dataset_idx, j=0, **limits)

    # Draw the decision boundary of every classifier.
    for cls_idx, (name, estimator) in enumerate(classifiers.items(), start=1):
        panel = row[cls_idx]

        model = make_pipeline(StandardScaler(), estimator)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        DecisionBoundaryDisplay.from_estimator(
            model, X_train, ax=panel, cmap=cm, alpha=0.8, eps=0.5
        )

        # Training points are left out of the panels; uncomment to overlay them.
        # panel.scatter(
        #    X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
        # )

        # Test points
        panel.scatter(
            X_test[:, 0],
            X_test[:, 1],
            c=y_test,
            cmap=cm_bright,
            alpha=0.6,
            edgecolors="k",
        )

        set_grid(ax, i=dataset_idx, j=cls_idx, **limits)
        if is_first_row:
            panel.set_title(name, fontdict={"fontsize": 10, "fontweight": "medium"})

        # Test accuracy near the bottom-right corner, without the leading zero.
        panel.text(
            x_max - 0.3,
            y_min + 0.3,
            f"{score:.2f}".lstrip("0"),
            size=15,
            horizontalalignment="right",
        )

plt.tight_layout()
plt.show()

### Многоклассовая классификация на примере блобов

In [None]:
# Three 2D Gaussian blobs, generated without shuffling.
X, y = make_blobs(
    n_samples=5000,
    n_features=2,
    centers=[(0, 0), (0, 1), (1, 0)],
    cluster_std=0.7,
    random_state=SEED,
    shuffle=False,
)
# Keep rows 1000..3499 only and shift the labels by one.
# NOTE(review): with shuffle=False this presumably makes the classes
# imbalanced on purpose; confirm.
X = X[1000:3500]
y = y[1000:3500] + 1

dt_blobs = DecisionTreeClassifier(random_state=SEED, min_samples_leaf=10).fit(X, y)
# Accuracy on the same data the tree was fitted on (echoed by the notebook).
dt_blobs.score(X, y)

In [None]:
# Blob points coloured by label, with the tree's decision regions drawn on the same axes.
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(*X.T, c=y, edgecolors="k")
DecisionBoundaryDisplay.from_estimator(dt_blobs, X, alpha=0.4, eps=0.7, ax=ax);

In [None]:
# Confusion matrix of the blob tree on the data it was fitted on.
print(confusion_matrix(y, dt_blobs.predict(X)))

In [None]:
# Per-class precision/recall/F1 for labels 1, 2 and 3 on the fitted data.
print(classification_report(y, dt_blobs.predict(X), labels=[1, 2, 3]))