## Prediction with CART
                                   
Topics covered:                   
   - Regression trees              
       - sample splitting          
       - visualize simple tree     
       - depth, number of leaves, etc.
       - pruning                   
       - diagnostics via variable importance plots 
   - Prediction evaluation         
     - which model gives best prediction on hold-out 
     - comparing to lin. reg       
                                   
Case studies:                     
  - CH15A Predicting used car value with regression trees 
                                   
Dataset:

    used-cars

In [None]:
import os
import random
import sys
import warnings
from collections import Counter
from io import StringIO

import numpy as np
import pandas as pd
import pydotplus
import sklearn
import statsmodels.api as sm
import statsmodels.formula.api as smf
from IPython.display import Image, display
from mizani.formatters import percent_format
from patsy import dmatrices
from plotnine import *
from skimpy import skim
from sklearn import tree
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from stargazer import stargazer
from statsmodels.tools.eval_measures import mse, rmse

# Silence every warning so that cell outputs stay uncluttered.
# NOTE(review): a blanket "ignore" also hides deprecation notices from the
# libraries used below -- consider narrowing the filter.
warnings.filterwarnings("ignore")

### Import work data

In [None]:
# Load the prepared used-cars work file; the path is relative to the
# notebook's working directory.
data = pd.read_csv("data/usedcars_cart_work.csv")

### Train-test split

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
data_train, data_test = train_test_split(data, test_size=0.3, random_state=2314)

### Regression tree
---

#### 1. Simplest case
  age is the only predictor and we allow only one split, make sure it's a single split by setting 'max_depth' to 1

In [None]:
# Summary statistics for the target (price) and the first predictor (age).
skim(data[["price","age"]])

In [None]:
# The feature input must be two-dimensional, so "age" is selected with a
# list of column names (giving a DataFrame) rather than as a single Series.
X = data_train[["age"]]
Y = data_train["price"]

# max_depth=1 restricts the tree to exactly one split on age.
cart1 = DecisionTreeRegressor(max_depth=1, random_state=20108)
cart1.fit(X, Y)


In [None]:
def jupyter_graphviz(m: sklearn.tree.DecisionTreeRegressor, **kwargs):
    """
    Render a fitted sklearn decision tree inline in the notebook as a PNG.

    Parameters
    ----------
    m
        The fitted tree estimator to draw.
    **kwargs
        Forwarded unchanged to ``export_graphviz`` (for example ``filled``,
        ``rounded``, ``feature_names``).
    """
    # With out_file=None, export_graphviz returns the DOT source as a string.
    dot_source = export_graphviz(m, out_file=None, **kwargs)
    png_bytes = pydotplus.graph_from_dot_data(dot_source).create_png()
    display(Image(png_bytes))

Display split

In [None]:
# Draw the one-split tree; "age" is the only feature it was fitted on.
jupyter_graphviz(
    cart1,
    feature_names=["age"],
    filled=True,
    rounded=True,
    special_characters=True,
)

Test RMSE

In [None]:
# Hold-out RMSE of the one-split tree.
cart1_test_pred = cart1.predict(data_test[["age"]])
rmse_cart1 = rmse(data_test["price"], cart1_test_pred)
rmse_cart1

Visualise scatterplot with step function

In [None]:
# Left edges of unit-wide age bins covering the training range. Each bin
# becomes one horizontal segment (age -> age + 1) at the tree's prediction.
plot_helper = np.arange(min(data_train["age"]), max(data_train["age"]))

plot_helper_df = pd.DataFrame({"age": plot_helper}).assign(
    xend=lambda d: d["age"] + 1,
    yend=lambda d: cart1.predict(d[["age"]]),
)

# In-sample predictions of cart1 (not referenced by the plotting cell).
pred_cart1t = cart1.predict(data_train[["age"]])


In [None]:
# Training observations with cart1's piecewise-constant prediction overlaid.
(
    ggplot(data=data_train, mapping=aes(x="age", y="price"))
    + geom_point()
    + geom_segment(
        mapping=aes(x="age", y="yend", xend="xend", yend="yend"),
        data=plot_helper_df,
        color="blue",
        size=1,
        na_rm=True,
    )
    + scale_x_continuous(
        limits=(0, 25), breaks=np.arange(0, 26, 5), expand=(0.01, 0.01)
    )
    + scale_y_continuous(
        limits=(0, 20000), breaks=np.arange(0, 20001, 2500), expand=(0.01, 0.01)
    )
    + labs(x="Age (years)", y="Price (US dollars)")
    + theme_bw()
)

Splits at two levels
    
(make sure it stops by setting "max_depth" to 2)


In [None]:
# The feature input must be two-dimensional, so "age" is selected with a
# list of column names (giving a DataFrame) rather than as a single Series.
X = data_train[["age"]]
Y = data_train["price"]

# max_depth=2 allows at most two successive splits along any branch.
cart2 = DecisionTreeRegressor(max_depth=2, random_state=2018)
cart2.fit(X, Y)


In [None]:
# Draw the depth-2 tree fitted on age.
jupyter_graphviz(
    cart2,
    feature_names=["age"],
    filled=True,
    rounded=True,
    special_characters=True,
)


In [None]:
# Attach cart2's in-sample prediction to every training row. Rows are then
# grouped by predicted value, which groups them by terminal node (assuming
# the leaf averages are distinct).
data_train["cart2_prediction"] = cart2.predict(X)

# Age range and number of observations for each predicted value.
cart2_leaves = (
    data_train.groupby("cart2_prediction")
    .agg(min_age=("age", "min"), max_age=("age", "max"), n_obs=("age", "count"))
    .reset_index()
    .round(1)
)

# Human-readable label for each group, e.g. "Age <min>-<max>".
cart2_leaves["Category"] = (
    "Age "
    + cart2_leaves["min_age"].astype(str)
    + "-"
    + cart2_leaves["max_age"].astype(str)
)

(
    cart2_leaves.rename(
        columns={"cart2_prediction": "Average price", "n_obs": "Number of obs."}
    )[["Category", "Number of obs.", "Average price"]]
    .sort_values(by=["Average price"], ascending=False)
)

Test RMSE

In [None]:
# Hold-out RMSE of the depth-2 tree.
cart2_test_pred = cart2.predict(data_test[["age"]])
rmse_cart2 = rmse(data_test["price"], cart2_test_pred)
rmse_cart2

Scatterplot with step function

In [None]:
# Left edges of unit-wide age bins covering the training range. Each bin
# becomes one horizontal segment (age -> age + 1) at cart2's prediction.
plot_helper = np.arange(min(data_train["age"]), max(data_train["age"]), 1)

plot_helper_df = pd.DataFrame({"age": plot_helper}).assign(
    xend=lambda d: d["age"] + 1,
    yend=lambda d: cart2.predict(d[["age"]]),
)

# In-sample predictions of cart2. The variable keeps the name used in the
# cart1 cell and is not referenced by the plotting cell.
pred_cart1t = cart2.predict(data_train[["age"]])

In [None]:
# Training observations with cart2's piecewise-constant prediction overlaid.
(
    ggplot(data=data_train, mapping=aes(x="age", y="price"))
    + geom_point()
    + geom_segment(
        mapping=aes(x="age", y="yend", xend="xend", yend="yend"),
        data=plot_helper_df,
        color="blue",
        size=1,
        na_rm=True,
    )
    + scale_x_continuous(
        limits=(0, 25), breaks=np.arange(0, 26, 5), expand=(0.01, 0.01)
    )
    + scale_y_continuous(
        limits=(0, 20000), breaks=np.arange(0, 20001, 2500), expand=(0.01, 0.01)
    )
    + labs(x="Age (years)", y="Price (US dollars)")
    + theme_bw()
)

Now control tree growth with `min_impurity_decrease`: a node is split only if the split reduces the impurity by at least this value.

In [None]:
# Grow the tree until no candidate split lowers the impurity by at least 50000.
# The split criterion is left at its default (squared error). The explicit
# spelling criterion="mse" was deprecated in scikit-learn 1.0 and removed in
# 1.2, so omitting it keeps the same behaviour on old and new versions.
cart3 = DecisionTreeRegressor(random_state=2018, min_impurity_decrease=50000)
# Note X should be a matrix instead of series, that's why we need double []
X = data_train[["age"]]
Y = data_train["price"]
cart3.fit(X, Y)

Test RMSE

In [None]:
# Hold-out RMSE of the impurity-threshold tree.
cart3_test_pred = cart3.predict(data_test[["age"]])
rmse_cart3 = rmse(data_test["price"], cart3_test_pred)
rmse_cart3

In [None]:
# Draw the impurity-threshold tree fitted on age.
jupyter_graphviz(
    cart3,
    feature_names=["age"],
    filled=True,
    rounded=True,
    special_characters=True,
)


Scatterplot with step function

In [None]:
# Left edges of unit-wide age bins covering the training range. Each bin
# becomes one horizontal segment (age -> age + 1) at cart3's prediction.
plot_helper = np.arange(min(data_train["age"]), max(data_train["age"]))

plot_helper_df = pd.DataFrame({"age": plot_helper}).assign(
    xend=lambda d: d["age"] + 1,
    yend=lambda d: cart3.predict(d[["age"]]),
)

# In-sample predictions of cart3. The variable keeps the name used in the
# cart1 cell and is not referenced by the plotting cell.
pred_cart1t = cart3.predict(data_train[["age"]])


In [None]:
# Training observations with cart3's piecewise-constant prediction overlaid.
# The break sequences end one past the axis limit so that the top ticks
# (20000 and 25) are drawn, matching the cart1 and cart2 plots.
(
    ggplot(data_train, aes(x="age", y="price"))
    + geom_point()
    + geom_segment(
        plot_helper_df,
        aes(x="age", y="yend", xend="xend", yend="yend"),
        color="blue",
        size=1,
        na_rm=True,
    )
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 20000), breaks=np.arange(0, 20001, 2500)
    )
    + scale_x_continuous(
        expand=(0.01, 0.01), limits=(0, 25), breaks=np.arange(0, 26, 5)
    )
    + labs(x="Age (years)", y="Price (US dollars)")
    + theme_bw()
)

#### Competing model: Age only linear regression
---

In [None]:
# Benchmark model: OLS of price on age alone, estimated on the training set.
linreg1_spec = smf.ols(formula="price~age", data=data_train)
linreg1 = linreg1_spec.fit()
linreg1.summary()


In [None]:
# Hold-out RMSE of the age-only linear regression.
ols1_test_pred = linreg1.predict(data_test[["age"]])
rmse_ols1 = rmse(data_test["price"], ols1_test_pred)
rmse_ols1

Scatterplot with predicted values

In [None]:
# In-sample fitted values of the age-only regression. geom_line refers to
# them by variable name inside aes(); presumably the name is resolved from the
# notebook namespace, since no such column is added to data_train here.
pred_linreg1t = linreg1.predict(data_train)
# The break sequences end one past the axis limit so that the top ticks
# (20000 and 25) are drawn, matching the tree plots above.
(
    ggplot(data_train)
    + geom_point(
        aes(x="age", y="price"),
        size=1,
        alpha=0.7,
        show_legend=False,
        na_rm=True,
    )
    + geom_line(aes(x="age", y="pred_linreg1t"), colour="blue", size=0.7)
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 20000), breaks=np.arange(0, 20001, 2500)
    )
    + scale_x_continuous(
        expand=(0.01, 0.01), limits=(0, 25), breaks=np.arange(0, 26, 5)
    )
    + labs(x="Age (years)", y="Price (US dollars)")
    + theme_bw()
)

### Age only with lowess

In [None]:
# Nonparametric regression of price on age, using statsmodels' KernelReg
# (aliased as `lowess`); "c" declares the single regressor as continuous.
lowess = sm.nonparametric.KernelReg
lowess_fit = lowess(endog=data_train.price, exog=data_train.age, var_type="c")

Test RMSE

In [None]:
# Evaluate the fitted regression at the test ages. Element 0 of the result
# holds the predicted values that are scored against the test prices.
lowess_pred = lowess_fit.fit(data_test["age"])
lowess_test_mean = lowess_pred[0]
rmse_lowess = rmse(lowess_test_mean, data_test["price"])
rmse_lowess

In [None]:
# Training observations with a loess smoother drawn by plotnine itself.
(
    ggplot(data=data_train, mapping=aes(x="age", y="price"))
    + geom_point(size=1)
    + geom_smooth(method="loess", colour="blue", se=False, size=1.5)
    + coord_cartesian(xlim=(0, 25), ylim=(0, 20000))
    + labs(x="Age", y="Price")
    + theme_bw()
)

### MULTIPLE PREDICTOR VARIABLES

Linear regression with multiple variables

In [None]:
# Linear specification with all predictors entered linearly. The same
# formula string is reused later to build the design matrices for the trees.
model2 = "price ~ age + odometer + LE + XLE + SE + cond_excellent + cond_good + cylind6 + dealer + chicago"
linreg2 = smf.ols(formula=model2, data=data_train).fit()
linreg2.summary()

In [None]:
# Hold-out RMSE of the multi-predictor regression. The prediction must come
# from linreg2 (the model2 fit), not from the age-only linreg1.
rmse_ols2 = rmse(data_test["price"], linreg2.predict(data_test))
rmse_ols2

Add squared for age, odometer

In [None]:
# Same predictors as model2 plus squared terms for age and odometer.
model3 = "price ~ age + agesq+ odometer+odometersq +LE + XLE + SE + cond_excellent + cond_good + cylind6 + dealer+chicago"
linreg3 = smf.ols(formula=model3, data=data_train).fit()
linreg3.summary()

In [None]:
# Hold-out RMSE of the regression with squared terms.
ols3_test_pred = linreg3.predict(data_test)
rmse_ols3 = rmse(data_test["price"], ols3_test_pred)
rmse_ols3

In [None]:
# Build the training outcome (Y) and predictor (X) matrices from the model2
# formula. These replace the age-only X and Y used by the earlier trees.
Y, X = dmatrices(model2, data=data_train)


### Multiple predictors with Trees

 Splits at four levels, for illustrative purposes
 
(make sure it stops by setting "max_depth" to 3)

In [None]:
# Depth-3 tree on the full model2 design matrix.
# The split criterion is left at its default (squared error). The explicit
# spelling criterion="mse" was deprecated in scikit-learn 1.0 and removed in
# 1.2, so omitting it keeps the same behaviour on old and new versions.
cart4 = DecisionTreeRegressor(random_state=20108, max_depth=3)
cart4.fit(X, Y)

In [None]:
# Build the test-set predictor matrix from the same formula used for
# training; the outcome matrix is not needed here.
_, X_test = dmatrices(model2, data=data_test)

# Hold-out RMSE of the depth-3 tree.
pred_cart4 = cart4.predict(X_test)
rmse_cart4 = rmse(data_test["price"], pred_cart4)
rmse_cart4

In [None]:
# Display labels for the columns of the model2 design matrix, in order.
# The list has one more entry than the formula has predictors: the leading
# column is the constant term that patsy adds by default, so it is labelled
# "Intercept" rather than "price" (price is the outcome, not a column of X).
# NOTE(review): confirm the order against X.design_info.column_names.
feature_names_model2 = [
    "Intercept",
    "age",
    "odometer",
    "LE",
    "XLE",
    "SE",
    "cond_excellent",
    "cond_good",
    "cylind6",
    "dealer",
    "chicago",
]

In [None]:
# Draw the depth-3 tree with readable column labels.
jupyter_graphviz(
    cart4,
    feature_names=feature_names_model2,
    filled=True,
    rounded=True,
    special_characters=True,
)


 An alternative that uses min_impurity_decrease instead of a depth limit; it gives roughly the same outcome


In [None]:
# Refit CART model 4 with an impurity threshold and a minimum node size in
# place of a depth limit. This overwrites the depth-3 cart4 and its RMSE.
# The split criterion is left at its default (squared error). The explicit
# spelling criterion="mse" was deprecated in scikit-learn 1.0 and removed in
# 1.2, so omitting it keeps the same behaviour on old and new versions.
cart4 = DecisionTreeRegressor(
    random_state=20108,
    min_impurity_decrease=150000,
    min_samples_split=20,
)
cart4.fit(X, Y)

# Test-set outcome and predictor matrices from the same formula.
y_test, X_test = dmatrices(model2, data_test)

pred_cart4 = cart4.predict(X_test)
rmse_cart4 = rmse(data_test["price"], pred_cart4)
rmse_cart4

In [None]:
# Draw the refitted CART model 4.
jupyter_graphviz(
    cart4,
    feature_names=feature_names_model2,
    filled=True,
    rounded=True,
    special_characters=True,
)

### CART model 5

In [None]:
# A larger tree: the lower impurity threshold (20000) admits more splits.
# The split criterion is left at its default (squared error). The explicit
# spelling criterion="mse" was deprecated in scikit-learn 1.0 and removed in
# 1.2, so omitting it keeps the same behaviour on old and new versions.
cart5 = DecisionTreeRegressor(
    random_state=20108,
    min_impurity_decrease=20000,
)
cart5.fit(X, Y)


In [None]:
# Hold-out RMSE of CART model 5, reusing the X_test matrix built earlier.
pred_cart5 = cart5.predict(X_test)
rmse_cart5 = rmse(data_test["price"], pred_cart5)
rmse_cart5


In [None]:
# Draw CART model 5.
jupyter_graphviz(
    cart5,
    feature_names=feature_names_model2,
    filled=True,
    rounded=True,
    special_characters=True,
)


### Cart Model 6
#### Build very large tree and prune it

in Python this can be done in sklearn's DecisionTreeRegressor with ccp_alpha parameter

In [None]:
# Grow a large tree (nodes with at least 4 observations may be split) and
# prune it back with cost-complexity pruning via ccp_alpha.
# The split criterion is left at its default (squared error). The explicit
# spelling criterion="mse" was deprecated in scikit-learn 1.0 and removed in
# 1.2, so omitting it keeps the same behaviour on old and new versions.
cart6 = DecisionTreeRegressor(
    random_state=20108, min_samples_split=4, ccp_alpha=100000
)
cart6.fit(X, Y)


In [None]:
# Hold-out RMSE of the pruned tree, reusing the X_test matrix built earlier.
pred_cart6 = cart6.predict(X_test)
rmse_cart6 = rmse(data_test["price"], pred_cart6)
rmse_cart6


In [None]:
# Draw the pruned tree.
jupyter_graphviz(
    cart6,
    feature_names=feature_names_model2,
    filled=True,
    rounded=True,
    special_characters=True,
)


Variable permutation importance for model 5

In [None]:
# Permutation importance of each design-matrix column for CART model 5,
# scored by R-squared on the training matrices X and Y.
perm_imp = permutation_importance(
    cart5, X, Y, scoring="r2", n_repeats=1000, max_samples=0.6, random_state=230
)

# One row per column label. The first row (the leading design-matrix column)
# is dropped and the rest are ordered from most to least important.
cart5_var_imp_df = (
    pd.DataFrame(
        {"Importance": perm_imp["importances_mean"]}, index=feature_names_model2
    )
    .iloc[1:]
    .sort_values(by="Importance", ascending=False)
    .reset_index()
)
# Rescale so the retained importances sum to one (shown as percentages).
cart5_var_imp_df["Importance"] = (
    cart5_var_imp_df["Importance"] / cart5_var_imp_df["Importance"].sum()
)

# Lollipop chart: a dot per variable with a stem back to zero.
(
    ggplot(
        data=cart5_var_imp_df,
        mapping=aes(x="reorder(index, Importance)", y="Importance"),
    )
    + geom_point(color="blue", size=2)
    + geom_segment(
        mapping=aes(x="index", xend="index", y=0, yend="Importance"),
        color="blue",
        size=1.5,
    )
    + xlab("Variable Name")
    + ylab("Importance")
    + coord_flip()
    + scale_y_continuous(labels=percent_format(), expand=(0.01, 0.01))
    + theme_bw()
)

Simple variable importance for model 5

In [None]:
# Importances as reported by the fitted tree itself (feature_importances_).
# The first row (the leading design-matrix column) is dropped and the rest
# are ordered from most to least important.
cart5_var_imp_df = (
    pd.DataFrame(
        {"Importance": cart5.feature_importances_}, index=feature_names_model2
    )
    .iloc[1:]
    .sort_values(by="Importance", ascending=False)
    .reset_index()
)

# Lollipop chart: a dot per variable with a stem back to zero.
(
    ggplot(
        data=cart5_var_imp_df,
        mapping=aes(x="reorder(index,Importance)", y="Importance"),
    )
    + geom_point(color="blue", size=2)
    + geom_segment(
        mapping=aes(x="index", xend="index", y=0, yend="Importance"),
        color="blue",
        size=1.5,
    )
    + xlab("Variable Name")
    + ylab("Importance")
    + coord_flip()
    + scale_y_continuous(labels=percent_format(), expand=(0.01, 0.01))
    + theme_bw()
)


Summary table of results

In [None]:
# One record per model: (name, number of variables, details, test RMSE).
# Keeping each model's attributes on one line prevents the columns from
# drifting out of step. The details mirror the hyper-parameters actually
# passed when the models were fitted above: CART M3 uses age only, CART M4
# uses min_impurity_decrease=150000, and CART M6 uses ccp_alpha=100000.
summary_rows = [
    ("CART M1", 1, "2 levels", rmse_cart1),
    ("CART M2", 1, "3 levels", rmse_cart2),
    ("CART M3", 1, "min_impurity_decrease=50000", rmse_cart3),
    (
        "CART M4",
        7,
        "min_impurity_decrease=150000 & min_samples_split=20",
        rmse_cart4,
    ),
    ("CART M5", 7, "min_impurity_decrease=20000", rmse_cart5),
    ("CART M6", 7, "ccp_alpha=100000 & min_samples_split=4", rmse_cart6),
    ("OLS M1", 1, "linear", rmse_ols1),
    ("LOWESS", 1, "lowess", rmse_lowess),
    ("OLS M2", 7, "linear", rmse_ols2),
    ("OLS M3", 7, "w/ polynomial terms", rmse_ols3),
]

(
    pd.DataFrame(
        summary_rows,
        columns=["Model", "Number of variables", "Model details", "test RMSE"],
    )
    .set_index("Model")
    .round(0)
)