# Decision tree

## References

* https://medium.com/@penggongting/implementing-decision-tree-from-scratch-in-python-c732e7c69aea
* https://www.kdnuggets.com/2020/01/decision-tree-algorithm-explained.html

## The core algorithm aka the CART algorithm

CART = Classification And Regression Tree

Starting with a tabular dataset, we have columns (features) and rows (observations). Each row has a target value; the target values are either all continuous (regression) or all categorical (classification).

Taking a subset of the observations as a training set, the algorithm iterates:

1. select a feature
2. select a range of thresholds (e.g. the feature values in the training set)
3. for each threshold
    * create two groups of observations, one below the threshold and one above and 
    * evaluate the split score
4. select the threshold with the optimal split score (here that always means largest)
5. select the related group split 
6. continue from 1. for each group whose target values are not yet homogeneous (e.g. not all the same class, or the standard deviation is greater than zero)

In [None]:
# Load IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell is executed (see the IPython
# autoreload documentation for the exact rules and exclusions).
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn.datasets as sk_datasets

from random_tree_models.models.decisiontree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor,
)
from random_tree_models.models.decisiontree.visualize import show_tree
from random_tree_models.scoring import MetricNames

In [None]:
# Single seeded RandomState, passed as `random_state` to both dataset
# generators in this notebook.
# NOTE(review): being a stateful generator it presumably advances on every
# use, so re-running a data cell (or running cells out of order) would yield
# different data — confirm if strict reproducibility matters.
rng = np.random.RandomState(42)

## Classification

split score:
* gini
* entropy

In [None]:
# Synthetic two-class dataset with two features and no redundant ones,
# drawn with the notebook-wide `rng`.
dataset_options = dict(
    n_samples=1_000,
    n_features=2,
    n_redundant=0,
    n_classes=2,
    class_sep=2,
    random_state=rng,
)
X, y = sk_datasets.make_classification(**dataset_options)

# Feature 0 against feature 1, coloured by class label. The trailing
# semicolon keeps the notebook from echoing the returned object.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, alpha=0.3);

In [None]:
# Classification tree using `MetricNames.gini` as the split measure, with
# `max_depth=4` (presumably the maximum depth of the grown tree — confirm).
model = DecisionTreeClassifier(measure_name=MetricNames.gini, max_depth=4)

In [None]:
# Echo the feature matrix as the cell output for a quick visual check.
X

In [None]:
# Fit on the complete generated dataset; no train/test split is made here.
model.fit(X, y)

In [None]:
# Display the fitted tree using the project's `show_tree` helper.
show_tree(model)

In [None]:
# Predicted probabilities for the same points the model was fitted on.
y_prob = model.predict_proba(X)
# Peek at the first five rows.
y_prob[:5]

In [None]:
# 100 evenly spaced values per feature, spanning that feature's observed range.
x0, x1 = (np.linspace(X[:, j].min(), X[:, j].max(), 100) for j in range(2))
X0, X1 = np.meshgrid(x0, x1)
# Flatten the grid into one point per row: column 0 holds feature 0 and
# column 1 holds feature 1.
X_plot = np.vstack((X0.ravel(), X1.ravel())).T

In [None]:
# Evaluate the model on every grid point and keep only column 1 of the
# result — presumably the probability of class 1 (TODO confirm column order).
y_prob = model.predict_proba(X_plot)[:, 1]
# Peek at the first five values.
y_prob[:5]

In [None]:
fig, ax = plt.subplots()

# Background: grid predictions reshaped back to the meshgrid layout.
prob_grid = y_prob.reshape(X0.shape)
im = ax.pcolormesh(X0, X1, prob_grid, alpha=0.2)
fig.colorbar(im, ax=ax)

# Foreground: the original observations, coloured by their label.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, alpha=0.3, ax=ax)
plt.show()

## Regression

split score:

* variance

In [None]:
# Synthetic regression dataset: 1,000 samples, two features, one target,
# drawn with the notebook-wide `rng`. With `coef=True` the generator returns
# a third value, kept here as `coefs`.
X, y, coefs = sk_datasets.make_regression(
    n_samples=1_000, n_features=2, n_targets=1, coef=True, random_state=rng
)
# Target against feature 0. The trailing semicolon matches the classification
# scatter plot and stops the notebook from printing the repr of the returned
# Axes below the figure.
sns.scatterplot(x=X[:, 0], y=y, alpha=0.3);

In [None]:
# Regression tree using `MetricNames.variance` as the split measure, with
# `max_depth=2` (presumably the maximum depth of the grown tree — confirm).
model = DecisionTreeRegressor(measure_name=MetricNames.variance, max_depth=2)

In [None]:
# Fit on the complete generated dataset; no train/test split is made here.
model.fit(X, y)

In [None]:
# Display the fitted tree using the project's `show_tree` helper.
show_tree(model)

In [None]:
# One axis of 100 evenly spaced values per feature column, from that
# column's minimum to its maximum (X has exactly two columns here).
x0, x1 = (np.linspace(column.min(), column.max(), 100) for column in X.T)
X0, X1 = np.meshgrid(x0, x1)
# Grid points as rows: (feature 0, feature 1).
X_plot = np.array((X0.ravel(), X1.ravel())).T

In [None]:
# Predict the target for every grid point.
y_pred = model.predict(X_plot)
# Peek at the first five predictions.
y_pred[:5]

In [None]:
fig, axs = plt.subplots(nrows=2, figsize=(12, 6))

# One panel per feature: grid predictions plotted against that feature
# (top panel is feature 0, bottom panel is feature 1).
for feature_idx, ax in enumerate(axs):
    sns.scatterplot(
        x=X_plot[:, feature_idx],
        y=y_pred,
        ax=ax,
        alpha=0.1,
        label="prediction",
    )

plt.tight_layout()

In [None]:
fig, ax = plt.subplots()

# Background: grid predictions reshaped back to the meshgrid layout.
pred_grid = y_pred.reshape(X0.shape)
im = ax.pcolormesh(X0, X1, pred_grid, alpha=0.2)
fig.colorbar(im, ax=ax)

# Foreground: the observations in feature space, coloured by their target.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, alpha=0.3, ax=ax)
plt.show()

In [None]:
# Predictions for the points the model was fitted on.
y_pred = model.predict(X)

fig, axs = plt.subplots(nrows=2, figsize=(12, 6))

# One panel per feature, with predictions drawn first and actual targets
# drawn on top (top panel is feature 0, bottom panel is feature 1).
for feature_idx, ax in enumerate(axs):
    feature_values = X[:, feature_idx]
    sns.scatterplot(x=feature_values, y=y_pred, ax=ax, alpha=0.1, label="prediction")
    sns.scatterplot(x=feature_values, y=y, ax=ax, alpha=0.1, label="actual")

plt.tight_layout()