In [2]:
import numpy as np
from typing import Any, List, Tuple
from collections import Counter
from DecisionTree import DecisionTree

Vector = List[Any]
Matrix = List[Vector]

In [5]:
class RandomForest:
    """Bagging ensemble of ``DecisionTree`` models combined by majority vote.

    ``fit`` trains ``n_trees`` trees, each on its own bootstrap sample (drawn
    with replacement) of the training data. ``predict`` asks every tree for a
    prediction and returns, per input row, the most frequently predicted label.

    Parameters
    ----------
    min_samples_split:
        Forwarded to every ``DecisionTree`` as ``min_samples_split``.
    max_depth:
        Forwarded to every ``DecisionTree`` as ``max_depth``.
    n_features:
        Stored as ``n_features_to_use``.
        NOTE(review): this value is not forwarded to the trees, so it currently
        has no effect -- confirm whether ``DecisionTree`` accepts a feature
        count and wire it through if so.
    n_trees:
        Number of trees trained by ``fit``.
    subset_size:
        Size of each bootstrap sample as a fraction of the training-set size.
        Must be positive; at least one row is always drawn.
    random_state:
        Optional seed for the bootstrap sampling. ``None`` (the default) draws
        from NumPy's global random state, so ``np.random.seed`` still controls
        the result.
    """

    def __init__(
        self,
        min_samples_split: int = 2,
        max_depth: int = 10,
        n_features: int | None = None,
        n_trees: int = 50,
        subset_size: float = 0.25,
        random_state: int | None = None,
    ):
        self.min_samples_split: int = min_samples_split
        self.max_depth: int = max_depth
        self.n_features_to_use: int | None = n_features
        self.n_trees: int = n_trees
        self.subset_size: float = subset_size
        self.random_state: int | None = random_state
        self.trees: List[DecisionTree] = []

    def fit(self, xs: Matrix, ys: Vector) -> None:
        """Train a fresh forest on ``xs`` / ``ys``.

        Any trees from a previous ``fit`` call are discarded, so the forest
        always holds exactly ``n_trees`` trees afterwards.

        Raises
        ------
        ValueError
            If ``xs`` and ``ys`` have different lengths, if the data is empty,
            or if ``subset_size`` is not positive.
        """
        # Convert once so list-of-lists input can be indexed with an index array.
        xs_arr = np.asarray(xs)
        ys_arr = np.asarray(ys)
        if len(xs_arr) != len(ys_arr):
            raise ValueError(
                f"xs and ys must have the same length, got {len(xs_arr)} and {len(ys_arr)}"
            )

        # A dedicated RandomState makes each fit reproducible when a seed is
        # given; otherwise keep using the global NumPy RNG.
        rng = (
            np.random
            if self.random_state is None
            else np.random.RandomState(self.random_state)
        )

        # Build into a local list and swap it in only once every tree trained,
        # so a failed refit does not leave a half-populated forest behind.
        trees: List[DecisionTree] = []
        for _ in range(self.n_trees):
            tree = DecisionTree(
                max_depth=self.max_depth, min_samples_split=self.min_samples_split
            )
            xs_subset, ys_subset = self._bootstrap_samples(
                xs_arr, ys_arr, self.subset_size, rng=rng
            )
            tree.fit(xs_subset, ys_subset)
            trees.append(tree)
        self.trees = trees

    def _bootstrap_samples(
        self, xs: Matrix, ys: Vector, subset_size: float, rng: Any = None
    ) -> Tuple[Matrix, Vector]:
        """Return a random sample of rows, drawn with replacement.

        The sample holds ``int(subset_size * len(ys))`` rows, but never fewer
        than one. ``rng`` is any object exposing a NumPy-style ``choice``
        method; ``None`` means NumPy's global random state.

        Raises
        ------
        ValueError
            If the data is empty or ``subset_size`` is not positive.
        """
        xs_arr = np.asarray(xs)
        ys_arr = np.asarray(ys)
        n_samples = len(ys_arr)
        if n_samples == 0:
            raise ValueError("cannot draw a bootstrap sample from empty data")
        if subset_size <= 0:
            raise ValueError(f"subset_size must be positive, got {subset_size}")

        # int() truncates, so small datasets could otherwise yield zero rows.
        n_draws = max(1, int(subset_size * n_samples))
        source = np.random if rng is None else rng
        subset_idx = source.choice(n_samples, n_draws, replace=True)
        return xs_arr[subset_idx], ys_arr[subset_idx]

    def _majority_vote(self, labels: Vector) -> Any:
        """Return the most common label; ties go to whichever label
        ``Counter.most_common`` lists first.
        """
        return Counter(labels).most_common(1)[0][0]

    def predict(self, xs: Matrix) -> Vector:
        """Predict one label per row of ``xs`` by majority vote over all trees.

        Raises
        ------
        ValueError
            If the forest has no trained trees (``fit`` was not called).
        """
        if not self.trees:
            raise ValueError(
                "RandomForest has no trained trees; call fit() before predict()"
            )
        # Rows = trees, columns = input rows.
        votes_by_tree = np.array([tree.predict(xs) for tree in self.trees])
        # Rows = input rows, columns = trees.
        votes_by_sample = np.swapaxes(votes_by_tree, 0, 1)
        return np.array([self._majority_vote(votes) for votes in votes_by_sample])

In [6]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# One seed for both the train/test split and the forest's bootstrap sampling,
# so the accuracy below is reproducible under Restart & Run All.
SEED = 42

data = load_breast_cancer()
xs, ys = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    xs, ys, test_size=0.32, random_state=SEED
)

# RandomForest draws its bootstrap samples from NumPy's global RNG by default,
# so seed that RNG immediately before fitting.
np.random.seed(SEED)
rf = RandomForest()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Accuracy: fraction of test rows whose predicted label equals the true label.
acc = np.mean(y_pred == y_test)
acc

0.9398907103825137