In [2]:
# robust_cart.py
import numpy as np
from collections import Counter, defaultdict
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification, load_breast_cancer, load_wine, load_digits
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import pandas as pd

class RobustCART:
    """CART-style classifier whose splits are picked by repeated hold-out scoring.

    At each node the data is repeatedly partitioned at random into a training
    part and a held-out part. A clone of ``base_estimator`` is fitted on the
    training part, the ``(feature, threshold)`` of its root node is read from
    ``tree_``, and the estimator's misclassification rate on the held-out part
    is recorded under that pair. The pair with the lowest mean held-out loss
    becomes the node's split.

    Parameters
    ----------
    base_estimator : estimator or None
        Estimator cloned for every resample. After ``fit`` it must expose
        ``tree_.feature`` and ``tree_.threshold``. ``None`` selects
        ``DecisionTreeClassifier(max_depth=1)``.
    n_subsamples : int
        Number of random train/hold-out partitions drawn per node.
    max_depth : int
        Maximum depth of the grown tree; nodes at this depth become leaves.
    random_state : None, int or RNG-like object
        Controls the subsample draws only. ``None`` draws from NumPy's global
        RNG, an int seeds a fresh ``RandomState`` on every ``fit`` call, and
        any other object is used as-is and must provide
        ``choice(n, size=..., replace=False)``. Randomness inside
        ``base_estimator`` is governed by that estimator's own settings.

    Attributes
    ----------
    tree_ : dict, label or None
        ``None`` until ``fit`` is called. Internal nodes are dicts with keys
        ``'feature'``, ``'threshold'``, ``'left'`` and ``'right'``; a leaf is
        the bare class label.
    """

    def __init__(self, base_estimator=None, n_subsamples=100, max_depth=5,
                 random_state=None):
        # Explicit None check: the truthiness of an arbitrary estimator object
        # is not a reliable "was an estimator passed?" signal.
        self.base_estimator = (
            DecisionTreeClassifier(max_depth=1)
            if base_estimator is None
            else base_estimator
        )
        self.n_subsamples = n_subsamples
        self.max_depth = max_depth
        self.random_state = random_state
        self.tree_ = None

    def _resolve_rng(self):
        """Turn ``random_state`` into an object that provides ``choice``."""
        seed = self.random_state
        if seed is None:
            # The module-level functions share NumPy's global RandomState.
            return np.random
        if isinstance(seed, (int, np.integer)):
            return np.random.RandomState(seed)
        return seed

    def _adaptive_subsample_frac(self, n_samples):
        """Return the training fraction for a node holding ``n_samples`` rows.

        The fraction is ``200 / n_samples`` clamped to ``[0.3, 0.7]``.
        ``n_samples`` must be positive.
        """
        return min(0.7, max(0.3, 200 / n_samples))

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (first seen wins ties)."""
        return Counter(y).most_common(1)[0][0]

    def _best_stable_split_cv(self, X, y, rng=None):
        """Return the ``(feature, threshold)`` with the lowest mean hold-out loss.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        y : ndarray of shape (n_samples,)
        rng : object providing ``choice``, optional
            Source of the subsample draws; defaults to NumPy's global RNG.

        Returns
        -------
        tuple or None
            ``(int feature index, float threshold)``, or ``None`` when no
            resample produced a usable split.
        """
        if rng is None:
            rng = np.random

        n_samples = len(X)
        n_sub = int(n_samples * self._adaptive_subsample_frac(n_samples))
        all_idx = np.arange(n_samples)  # loop-invariant, built once
        split_losses = defaultdict(list)

        for _ in range(self.n_subsamples):
            idx_train = rng.choice(n_samples, size=n_sub, replace=False)
            idx_test = np.setdiff1d(all_idx, idx_train)
            y_train, y_test = y[idx_train], y[idx_test]

            # Skip partitions in which either part holds a single class.
            if len(np.unique(y_train)) < 2 or len(np.unique(y_test)) < 2:
                continue

            stump = clone(self.base_estimator)
            stump.fit(X[idx_train], y_train)

            feat = stump.tree_.feature[0]
            if feat < 0:
                # A negative root feature index means the root was not split.
                continue
            thresh = stump.tree_.threshold[0]

            # The loss comes from the estimator's own predictions, so with a
            # base estimator deeper than a stump it reflects the whole tree.
            loss = np.mean(stump.predict(X[idx_test]) != y_test)
            split_losses[(int(feat), float(thresh))].append(loss)

        if not split_losses:
            return None

        mean_loss = {split: np.mean(losses) for split, losses in split_losses.items()}
        return min(mean_loss, key=mean_loss.get)

    def _grow_tree(self, X, y, depth, rng=None):
        """Recursively build the subtree for the rows in ``X`` / ``y``.

        Returns a leaf label when the depth limit is reached, the node is
        pure, no split was found, or the chosen split would leave one side
        empty; otherwise returns an internal-node dict.
        """
        if depth >= self.max_depth or len(set(y)) == 1:
            return self._majority_label(y)

        split = self._best_stable_split_cv(X, y, rng)
        if split is None:
            return self._majority_label(y)

        feat, thresh = split
        go_left = X[:, feat] <= thresh

        # The threshold was learned on a subsample, so it may fail to
        # separate the node's full data; fall back to a leaf in that case.
        if not go_left.any() or go_left.all():
            return self._majority_label(y)

        go_right = ~go_left
        return {
            'feature': feat,
            'threshold': thresh,
            'left': self._grow_tree(X[go_left], y[go_left], depth + 1, rng),
            'right': self._grow_tree(X[go_right], y[go_right], depth + 1, rng),
        }

    def fit(self, X, y):
        """Grow the tree from training data and return ``self``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from
            ``len(y)``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )

        self.tree_ = self._grow_tree(X, y, depth=0, rng=self._resolve_rng())
        return self

    def _predict_one(self, x, node):
        """Route one sample ``x`` from ``node`` down to a leaf; return its label."""
        while isinstance(node, dict):
            branch = 'left' if x[node['feature']] <= node['threshold'] else 'right'
            node = node[branch]
        return node

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        ``X`` is converted with ``np.asarray`` first, so array-likes such as
        lists and pandas DataFrames are iterated row by row.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if self.tree_ is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This RobustCART instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )
        rows = np.asarray(X)
        return np.array([self._predict_one(row, self.tree_) for row in rows])

# Evaluation script
def evaluate():
    """Compare RobustCART with scikit-learn's CART on seven datasets.

    Four synthetic problems with increasing label noise and three bundled
    scikit-learn datasets are each shuffled, split 70/30, and used to train
    both models at ``max_depth=5``. RobustCART is built without a seed and
    draws its subsamples from NumPy's global RNG, so its scores can vary
    between runs unless the caller seeds ``np.random``.

    Returns
    -------
    pandas.DataFrame
        Two rows per dataset (Robust CART first) with the columns
        ``"Dataset"``, ``"Model"`` and ``"OOS Accuracy"``.
    """
    # (label, n_informative, flip_y, generator seed)
    synthetic_specs = (
        ("Low Noise", 7, 0.01, 0),
        ("Moderate Noise", 5, 0.15, 1),
        ("Heavy Noise", 3, 0.3, 2),
        ("Very Heavy Noise", 2, 0.45, 4),
    )
    datasets = {
        label: make_classification(
            n_samples=1000,
            n_features=10,
            n_informative=n_informative,
            n_redundant=2,
            flip_y=noise,
            random_state=seed,
        )
        for label, n_informative, noise, seed in synthetic_specs
    }
    datasets["Breast Cancer"] = load_breast_cancer(return_X_y=True)
    datasets["Wine"] = load_wine(return_X_y=True)
    datasets["Digits"] = load_digits(return_X_y=True)

    # Fresh, unfitted models are built for every dataset.
    model_factories = (
        ("Robust CART", lambda: RobustCART(max_depth=5)),
        ("Sklearn CART", lambda: DecisionTreeClassifier(max_depth=5, random_state=42)),
    )

    rows = []
    for dataset_name, (features, labels) in datasets.items():
        features, labels = shuffle(features, labels, random_state=42)
        X_fit, X_holdout, y_fit, y_holdout = train_test_split(
            features, labels, test_size=0.3, random_state=42
        )

        for model_name, build_model in model_factories:
            model = build_model()
            model.fit(X_fit, y_fit)
            predictions = model.predict(X_holdout)
            rows.append({
                "Dataset": dataset_name,
                "Model": model_name,
                "OOS Accuracy": accuracy_score(y_holdout, predictions),
            })

    return pd.DataFrame(rows)

# Script entry point: run the benchmark and print the accuracy table without
# the DataFrame's index column.
if __name__ == "__main__":
    results_df = evaluate()
    print(results_df.to_string(index=False))

         Dataset        Model  OOS Accuracy
       Low Noise  Robust CART      0.770000
       Low Noise Sklearn CART      0.756667
  Moderate Noise  Robust CART      0.753333
  Moderate Noise Sklearn CART      0.776667
     Heavy Noise  Robust CART      0.816667
     Heavy Noise Sklearn CART      0.776667
Very Heavy Noise  Robust CART      0.713333
Very Heavy Noise Sklearn CART      0.700000
   Breast Cancer  Robust CART      0.918129
   Breast Cancer Sklearn CART      0.900585
            Wine  Robust CART      0.962963
            Wine Sklearn CART      0.981481
          Digits  Robust CART      0.755556
          Digits Sklearn CART      0.679630
