import math
import numpy as np
from utils import LoggingMixin
from utils import bootstrapped_sample
class RandomForestClassifier(LoggingMixin):
    """
    A convenience wrapper on top of decision tree learning function.

    Builds an ensemble of trees and provides methods to make predictions.
    Methods signatures and computed attributes names follow scikit-learn
    naming convention.

    Parameters:
    -----------
        tree_funcs (build_fn, predict_fn):
            Two functions that train a single decision tree on
            provided dataset and make predictions on it. Extra arguments
            could be passed to specify maximal depth, split sizes,
            considered features, etc.

        n_trees (int):
            Number of trees in ensemble.

        feature_subset_size {int, str, None}:
            Size of features subset to be considered on each tree split.
            Should be an integer, None (if all attributes should be taken
            into account), or 'sqrt' to take a square root of total number of
            dataset features.

        max_depth (int):
            Maximum depth of a single tree.

        min_split_size (int):
            Minimum number of observations in a node to split the node
            into two new nodes.

        min_leaf_size (int):
            Minimum number of observations in decision tree leafs.

        log:
            An instance of logging class.

    Attributes (available after `fit`):
    -----------------------------------
        feature_subset_size_: resolved number of features per split.
        ensemble_: list of trained trees, as returned by `build_fn`.
        classes_: sorted array of unique labels seen in `y`.
        n_classes_: number of unique labels.
    """

    # NOTE(review): the `log` parameter was missing from the damaged source;
    # it is restored because the body assigns `self.log = log`. The default
    # of None is an assumption -- confirm against LoggingMixin.
    def __init__(self, tree_funcs, n_trees: int=10,
                 feature_subset_size: str='sqrt', max_depth: int=5,
                 min_split_size: int=10, min_leaf_size: int=None,
                 log=None):

        if n_trees < 1:
            raise ValueError(f'cannot build an ensemble of {n_trees:d} trees')

        self.build_fn, self.predict_fn = tree_funcs
        self.n_trees = n_trees
        self.feature_subset_size = feature_subset_size
        self.max_depth = max_depth
        self.min_split_size = min_split_size
        self.min_leaf_size = min_leaf_size
        self.log = log

        # Computed during fit(); None means "not fitted yet".
        self.feature_subset_size_ = None
        self.ensemble_ = None
        self.classes_ = None
        self.n_classes_ = None

    def fit(self, X, y):
        """
        Trains `n_trees` decision trees, each one on its own bootstrapped
        sample of the dataset, and stores them in `ensemble_`.

        Returns the estimator itself to allow call chaining.
        """
        m = _validate_subset_size(X, self.feature_subset_size)
        n = self.n_trees

        # NOTE(review): the logging method name was lost in the damaged
        # source; `self.info` is assumed by analogy with `self.debug` below.
        self.info(f'Started building an ensemble of {n} decision trees')
        self.info(f'Training dataset shape: {X.shape}')
        self.info(f'Maximal tree depth: {self.max_depth}')
        self.info('Minimal number of samples per node '
                  f'to make a split: {self.min_split_size}')
        self.info('Minimal number of samples '
                  f'to create a leaf: {self.min_leaf_size}')
        self.info('Number of random features considered '
                  f'per each tree split: {m}')

        # Width of the tree counter in the progress message.
        string_length = len(str(n))
        ensemble = []

        for i in range(1, n + 1):
            self.debug(f'Building tree %{string_length}d of %d', i, n)
            index = bootstrapped_sample(X.shape[0])
            # NOTE(review): everything after `X=..., y=...` was lost in the
            # damaged source. The keyword names below mirror this class's
            # attribute names -- confirm they match the build_fn signature.
            tree = self.build_fn(
                X=X[index], y=y[index],
                feature_subset_size=m,
                max_depth=self.max_depth,
                min_split_size=self.min_split_size,
                min_leaf_size=self.min_leaf_size)
            ensemble.append(tree)

        classes = np.unique(y)

        self.feature_subset_size_ = m
        self.ensemble_ = ensemble
        self.classes_ = classes
        self.n_classes_ = len(self.classes_)
        return self

    def predict_decisions(self, X, n_trees=None):
        """
        Returns matrix with predicted classes for each
        instance for each of trees in ensemble.

        At most `n_trees` first trees are used; None (or a number greater
        than the ensemble size) means the whole ensemble.

        Raises RuntimeError if the model is not fitted, and ValueError if
        `n_trees` is less than one.
        """
        if self.ensemble_ is None:
            raise RuntimeError('fit method should be called first')

        available = len(self.ensemble_)
        if n_trees is None or n_trees > available:
            n_trees = available
        elif n_trees < 1:
            # Zero trees would yield 0/0 class ratios downstream.
            raise ValueError(f'cannot make predictions with {n_trees:d} trees')

        # Same dtype as the training labels, so non-integer labels survive.
        predictions = np.empty(
            (X.shape[0], n_trees), dtype=self.classes_.dtype)
        for tree_index, tree in enumerate(self.ensemble_[:n_trees]):
            predictions[:, tree_index] = self.predict_fn(tree, X)
        return predictions

    def predict_proba(self, X, **params):
        """
        Returns matrix with probabilities per instance per class.

        Column k holds the share of trees that voted for `classes_[k]`.
        Extra keyword arguments are forwarded to `predict_decisions`.
        """
        predictions = self.predict_decisions(X, **params)

        # Translate labels into column positions; `classes_` is sorted
        # (np.unique), and trees can only predict labels seen in training.
        class_index = np.searchsorted(self.classes_, predictions)

        probabilities = np.zeros((X.shape[0], self.n_classes_), dtype=float)
        for k in range(self.n_classes_):
            probabilities[:, k] = (class_index == k).mean(axis=1)
        return probabilities

    def predict(self, X, **params):
        """
        Returns a vector with class predictions.

        Each instance gets the label with the highest share of tree votes;
        ties are resolved in favour of the first label in `classes_`.
        """
        probabilities = self.predict_proba(X, **params)
        labels = self.classes_[probabilities.argmax(axis=1)]
        return labels
def _validate_subset_size(X, size):
Checks if features subset size is equal to one of valid values.
if size is None:
return X.shape[1]
if isinstance(size, int) and size > X.shape[1]:
raise ValueError(
f'the dataset has only {X.shape[1]:d} features, '
f'but feature subset size is equal to {size:d}')
if size == 'sqrt':
return int(math.sqrt(X.shape[1]))
if not isinstance(size, int):
raise TypeError(f'unexpected value for feature subset size: {size}')
return size