In [None]:
import pathlib
import requests

import graphviz
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, metrics, model_selection
from sklearn import pipeline, preprocessing, tree

# Decision Trees

[Decision Trees](https://scikit-learn.org/stable/modules/tree.html) are a non-parametric supervised learning method used for [classification](https://scikit-learn.org/stable/modules/tree.html#tree-classification) and [regression](https://scikit-learn.org/stable/modules/tree.html#tree-regression). The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. A tree can be seen as a piecewise constant approximation.

## Training and Visualizing a Decision Tree

In [None]:
# Load the iris dataset; as_frame=True is what lets the following cells call
# DataFrame methods such as .head() and .columns on the feature table.
iris_data = datasets.load_iris(as_frame=True)

In [None]:
# Pull the feature table and the label column out of the loaded dataset.
features_df, target = iris_data.data, iris_data.target

In [None]:
# Preview the first few rows of the feature table.
features_df.head()

In [None]:
# Display the labels the classifier below is trained to predict.
target

In [None]:
# max_depth=2 keeps the tree shallow (presumably so the rendered diagram stays
# readable); random_state pins the estimator's randomness so re-runs match.
# `fit` returns the estimator itself, so chaining it onto the constructor keeps
# the cell silent without binding `_` -- assigning to `_` stops IPython from
# updating its last-output variable for the rest of the session.
classifier = tree.DecisionTreeClassifier(max_depth=2, random_state=42).fit(
    features_df,
    target,
)

In [None]:
# Write the fitted tree to a Graphviz .dot file so it can be rendered below.
# The names are converted to plain lists of strings rather than passed as a
# pandas Index / NumPy array: a list is the form the scikit-learn docs have
# historically specified for these parameters, so it is the safest choice
# across library versions.
# With `out_file` set the function returns None, so there is nothing to
# suppress and no need to assign the result to `_`.
tree.export_graphviz(
    classifier,
    out_file="iris-tree.dot",
    feature_names=list(features_df.columns),
    class_names=list(iris_data.target_names),
    rounded=True,
    filled=True,
)

In [None]:
# Load the exported .dot file; the resulting Source object is the cell's
# output, which the notebook renders as the tree diagram.
graphviz.Source.from_file("iris-tree.dot")

In [None]:
# Show the built-in documentation for the fitted estimator's `tree_` attribute.
help(classifier.tree_)

## Making Predictions

In [None]:
# Predict on the same rows the classifier was fitted on, so these are
# training-set predictions, not an estimate of performance on unseen data.
classifier.predict(features_df)

## Predicting Class Probabilities

In [None]:
# Per-class probability estimates, again computed on the training rows.
classifier.predict_proba(features_df)

## Regularization

In [None]:
# IPython-only `?` suffix: displays the class's documentation (this line is
# not valid plain Python and only works inside IPython/Jupyter).
tree.DecisionTreeClassifier?

In [None]:
# List the available scorer names -- presumably the strings accepted by the
# `scoring=` argument used in the cross-validation cell below.
metrics.get_scorer_names()

In [None]:
# Fix the seed so the cross-validation scores are reproducible across re-runs
# (tree fitting permutes features at each split, so tied splits can otherwise
# resolve differently). This matches the random_state used for the first
# classifier in this notebook. All other hyperparameters keep their defaults.
classifier = tree.DecisionTreeClassifier(random_state=42)

# 5-fold cross-validation on the iris features/labels, scored with
# macro-averaged F1; n_jobs=-1 asks scikit-learn to run the folds in parallel.
cv_scores = model_selection.cross_val_score(
    classifier,
    features_df,
    target,
    cv=5,
    n_jobs=-1,
    scoring="f1_macro",
)

In [None]:
# Show the individual scores returned by the cross-validation run above.
cv_scores

In [None]:
# Average the cross-validation scores into a single summary number.
cv_scores.mean()

## Exercise

Fit a decision tree classifier to the following dataset. Select an appropriate scoring metric and evaluate the performance of your classifier using cross-validation. Is your classifier under-fitting? Over-fitting? Tune the regularization hyperparameters to improve the performance of your classifier.

In [None]:
# IPython-only `?` suffix: displays the loader's documentation (this line is
# not valid plain Python and only works inside IPython/Jupyter).
datasets.load_breast_cancer?

In [None]:
# Load the breast cancer dataset for the exercise; as_frame=True is what lets
# later cells read column names from `breast_cancer_data.data`.
breast_cancer_data = datasets.load_breast_cancer(as_frame=True)

## Understanding Feature Importance

One of the nice features of decision trees is that they provide a way to measure the importance of each feature. Understanding feature importance is a topic all unto itself. If you are interested in pulling this thread, then I recommend that you start with [SHapley Additive Explanations (SHAP)](https://shap.readthedocs.io/en/latest/index.html) and then take a look through [*Interpretable Machine Learning*](https://christophm.github.io/interpretable-ml-book/).

In [None]:
# Fit a tree on the full breast cancer dataset to inspect feature importances.
# random_state is fixed because tree fitting permutes features at each split;
# without a seed, tied splits can resolve differently and the importances
# shown below can change from run to run.
# `fit` returns the estimator, so chaining keeps the cell silent without
# binding `_` (which would stop IPython updating its last-output variable).
classifier = tree.DecisionTreeClassifier(random_state=42).fit(
    breast_cancer_data.data,
    breast_cancer_data.target,
)

In [None]:
# Label each importance value with its feature name, then rank the features
# from most to least important.
pd.Series(
    data=classifier.feature_importances_,
    index=breast_cancer_data.data.columns,
).sort_values(ascending=False)

## Exercise

Grow a forest by following these steps:

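1. Continuing the previous exercise, first hold out a test set (for example with Scikit-Learn’s `train_test_split`) if you have not already done so. Then generate 1,000 subsets of the training set, each containing 80% of the training instances selected randomly. Hint: you can use Scikit-Learn’s `ShuffleSplit` class for this.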

2. Train one decision tree on each subset, using the best hyperparameter values found in the previous exercise. Evaluate these 1,000 decision trees on the test set. Since they were trained on smaller sets, these decision trees will likely perform worse than the first decision tree, achieving only about 80% accuracy.


3. Now comes the magic. For each test set instance, generate the predictions of the 1,000 decision trees, and keep only the most frequent prediction (you can use SciPy’s mode() function for this). This approach gives you majority-vote predictions over the test set.

4. Evaluate these predictions on the test set: you should obtain a slightly higher accuracy than your first model. Congratulations, you have trained a random forest classifier!