# Demo notebook for the conditioning bias in decision trees

## Requirements:

* install: `numpy`, `scikit-learn` (imported as `sklearn`), `scipy`, `common-datasets`

In [30]:
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold, cross_validate
from scipy.stats import wilcoxon

from common_datasets.regression import load_cpu_performance

In [31]:
def wilcoxon_p(arg0, arg1, **kwargs):
    """Return the p-value of the Wilcoxon signed-rank test on two paired samples.

    Parameters
    ----------
    arg0, arg1 : array-like
        The paired samples, forwarded positionally to ``scipy.stats.wilcoxon``.
    **kwargs
        Any further keyword argument of ``scipy.stats.wilcoxon`` (for example
        ``alternative``). ``zero_method`` defaults to ``'zsplit'`` when the
        caller does not provide it.

    Returns
    -------
    float
        The ``pvalue`` attribute of the test result.
    """
    # 'zsplit' is only a default: passing it as a fixed keyword next to
    # **kwargs made an explicit zero_method raise "multiple values" TypeError
    kwargs.setdefault('zero_method', 'zsplit')
    return wilcoxon(arg0, arg1, **kwargs).pvalue

In [32]:
# load the CPU performance regression problem and pull out
# its features and its target
dataset = load_cpu_performance()
X, y = dataset['data'], dataset['target']

# a cross-validation wrapper to simplify the code
def cv_rf(X, y, regressor=RandomForestRegressor, *, max_depth=11,
          n_splits=5, n_repeats=400, random_state=5, n_jobs=None):
    """Cross-validate a forest regressor and return its test r2 scores.

    Parameters
    ----------
    X, y
        Features and target, passed unchanged to ``cross_validate``.
    regressor : callable
        Estimator class (or factory) called as
        ``regressor(max_depth=..., random_state=...)``.
    max_depth : int
        Depth limit handed to the regressor.
    n_splits, n_repeats : int
        Configuration of the ``RepeatedKFold`` splitter.
    random_state : int
        Seed shared by the regressor and the splitter.
    n_jobs : int or None
        Forwarded to ``cross_validate``; ``None`` keeps its default behaviour.

    Returns
    -------
    The ``'test_score'`` entry of the ``cross_validate`` result, i.e. one
    r2 score per fold and repetition.
    """
    estimator = regressor(max_depth=max_depth, random_state=random_state)
    splitter = RepeatedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )

    results = cross_validate(
        estimator=estimator,
        X=X, y=y,
        cv=splitter,
        scoring='r2',
        n_jobs=n_jobs,
    )
    return results['test_score']

# identical protocol on the original features and on their mirrored
# (negated) counterpart
r2_original, r2_mirrored = cv_rf(X, y), cv_rf(-X, y)

In [48]:
# mean r2 of both runs and the p-value of the paired Wilcoxon test
# between their per-fold scores
print(
    f'original r2: {np.mean(r2_original):.4f}',
    f'mirrored r2: {np.mean(r2_mirrored):.4f}',
    f'p-value: {wilcoxon_p(r2_original, r2_mirrored):.4e}',
    sep='\n',
)

original r2: 0.8611
mirrored r2: 0.8595
p-value: 6.2667e-04


In [34]:
from sklearn.base import BaseEstimator, RegressorMixin

# the mixin has to precede BaseEstimator in the bases; BaseEstimator supplies
# the estimator plumbing (cloning hooks, tags, repr) scikit-learn tools rely on
class UnbiasedRandomForestRegressor(RegressorMixin, BaseEstimator):
    """Random forest built from two subforests with mirrored conditioning.

    Half of the trees are fitted on ``X`` (conditioning with ``<=``) and the
    other half on ``-X`` (which acts as conditioning with ``<``); predictions
    of all trees are averaged.

    Parameters
    ----------
    **kwargs
        Keyword arguments of ``RandomForestRegressor``. ``n_estimators``
        (default 100) is the overall number of trees and is split between the
        two subforests; every other argument is passed to both unchanged.
        With ``n_estimators`` below 2 the ``<=`` subforest would get no tree.
    """

    def __init__(self, **kwargs):
        # determining the number of estimators used in the
        # two subforests (with the same overall number of trees)
        self.n_estimators = kwargs.get('n_estimators', 100)

        n_leq = self.n_estimators // 2 # conditioning with <= (leq)
        n_l = self.n_estimators - n_leq # conditioning with < (less)

        # instantiating the subforests
        self.rf_leq = RandomForestRegressor(**(kwargs | {'n_estimators': n_leq}))
        self.rf_l = RandomForestRegressor(**(kwargs | {'n_estimators': n_l}))

    def fit(self, X, y, sample_weight=None):
        """Fit one subforest on ``X`` and the other on ``-X``; returns ``self``."""
        self.rf_leq.fit(X, y, sample_weight=sample_weight)
        self.rf_l.fit(-X, y, sample_weight=sample_weight)

        return self

    def predict(self, X):
        """Return the tree-count-weighted average of the subforest predictions."""
        n_leq = self.rf_leq.n_estimators
        n_l = self.rf_l.n_estimators
        n_total = n_leq + n_l

        # weighting by the number of trees lets every tree contribute equally
        # when n_estimators is odd; for an even total both weights are exactly
        # 0.5, which equals the plain mean of the two predictions
        return (
            (n_leq / n_total) * self.rf_leq.predict(X)
            + (n_l / n_total) * self.rf_l.predict(-X)
        )

    def get_params(self, deep=False):
        """Return the subforest parameters with the overall ``n_estimators``."""
        return self.rf_leq.get_params(deep) | {'n_estimators': self.n_estimators}

    def set_params(self, **params):
        """Update parameters by rebuilding both subforests; returns ``self``.

        The subforests are re-created, so a previously fitted model has to be
        fitted again afterwards.
        """
        if params:
            # the inherited implementation would only set attributes on the
            # wrapper and never reach rf_leq / rf_l
            self.__init__(**(self.get_params() | params))
        return self

In [35]:
# same cross-validation protocol, now with the two-subforest regressor
r2_unbiased = cv_rf(X, y, regressor=UnbiasedRandomForestRegressor)

In [36]:
# mean r2 of the three runs side by side
print(
    f'original r2: {np.mean(r2_original):.4f}',
    f'mirrored r2: {np.mean(r2_mirrored):.4f}',
    f'unbiased r2: {np.mean(r2_unbiased):.4f}',
    sep='\n',
)

original r2: 0.8611
mirrored r2: 0.8595
unbiased r2: 0.8608


In [45]:
# one-sided paired tests, unbiased vs. original scores; going through the
# notebook's wilcoxon_p helper keeps the zero_method setting in one place
p_less = wilcoxon_p(r2_unbiased, r2_original, alternative='less')
p_greater = wilcoxon_p(r2_unbiased, r2_original, alternative='greater')
p_less, p_greater

(0.5743498351031389, 0.4256501648968612)

In [47]:
# one-sided paired tests, unbiased vs. mirrored scores; going through the
# notebook's wilcoxon_p helper keeps the zero_method setting in one place
p_less = wilcoxon_p(r2_unbiased, r2_mirrored, alternative='less')
p_greater = wilcoxon_p(r2_unbiased, r2_mirrored, alternative='greater')
p_less, p_greater

(0.9999212953780328, 7.870462196726495e-05)

In [46]:
# both one-sided p-values (unbiased vs. original) in a single pass
p_unb_orig_less, p_unb_orig_great = (
    wilcoxon_p(r2_unbiased, r2_original, alternative=side)
    for side in ("less", "greater")
)

print(f'{p_unb_orig_less:.4f}, {p_unb_orig_great:.4f}')

0.5743, 0.4257


In [41]:
# display the unrounded one-sided p-values (unbiased vs. original)
p_unb_orig_less, p_unb_orig_great

(0.5743498351031389, 0.4256501648968612)

In [39]:
# one-sided p-values for unbiased vs. mirrored: "less" first, then "greater"
print(
    f'p-value: {wilcoxon_p(r2_unbiased, r2_mirrored, alternative="less"):.4f}',
    f'p-value: {wilcoxon_p(r2_unbiased, r2_mirrored, alternative="greater"):.4f}',
    sep='\n',
)

p-value: 0.9999
p-value: 0.0001
