-
Notifications
You must be signed in to change notification settings - Fork 0
/
ensemble.py
166 lines (131 loc) · 5.44 KB
/
ensemble.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import math
import numpy as np
from utils import LoggingMixin
from utils import bootstrapped_sample
class RandomForestClassifier(LoggingMixin):
    """
    A convenience wrapper on top of decision tree learning function.

    Builds an ensemble of trees and provides methods to make predictions.
    Methods signatures and computed attributes names follow scikit-learn
    naming convention.

    Arguments:
        tree_funcs (build_fn, predict_fn):
            Two functions that train a single decision tree on
            provided dataset and make predictions on it. Extra arguments
            could be passed to specify maximal depth, split sizes,
            considered features, etc.
        n_trees:
            Number of trees in ensemble.
        feature_subset_size {int, str, None}:
            Size of features subset to be considered on each tree split.
            Should be an integer, None (if all attributes should be taken
            into account), or 'sqrt' to take a square root of total number of
            dataset features.
        max_depth:
            Maximum depth of a single tree.
        min_split_size:
            Minimum number of observations in a node to split the node
            into two new nodes.
        min_leaf_size:
            Minimum number of observations in decision tree leafs.
        log:
            An instance of logging class.

    Computed attributes (populated by `fit`):
        feature_subset_size_: Resolved integer subset size.
        ensemble_: List of trained trees.
        classes_: Sorted array of unique class labels.
        n_classes_: Number of distinct classes.
    """
    def __init__(self, tree_funcs, n_trees: int=10,
                 feature_subset_size: str='sqrt', max_depth: int=5,
                 min_split_size: int=10, min_leaf_size: int=None,
                 log=None):
        if n_trees < 1:
            raise ValueError(f'cannot build an ensemble of {n_trees:d} trees')
        self.build_fn, self.predict_fn = tree_funcs
        self.n_trees = n_trees
        self.feature_subset_size = feature_subset_size
        self.max_depth = max_depth
        self.min_split_size = min_split_size
        self.min_leaf_size = min_leaf_size
        self.log = log
        # computed attributes, filled in by fit()
        self.feature_subset_size_ = None
        self.ensemble_ = None
        self.classes_ = None
        self.n_classes_ = None

    def fit(self, X, y):
        """
        Builds the ensemble of decision trees from the training dataset.

        Arguments:
            X: Observations matrix of shape (n_samples, n_features).
            y: Vector of class labels; values are assumed to be integer
                indexes in the range [0, n_classes) — the prediction
                methods rely on this (TODO confirm against build_fn).

        Returns:
            The fitted estimator (self).
        """
        m = _validate_subset_size(X, self.feature_subset_size)
        n = self.n_trees
        self.info(f'Started building an ensemble of {n} decision trees')
        self.info(f'Training dataset shape: {X.shape}')
        self.info(f'Maximal tree depth: {self.max_depth}')
        self.info(f'Minimal number of samples per node '
                  f'to make a split: {self.min_split_size}')
        self.info(f'Minimal number of samples '
                  f'to create a leaf: {self.min_leaf_size}')
        self.info(f'Number of random features considered '
                  f'per each tree split: {m}')
        string_length = len(str(n))
        ensemble = []
        for i in range(1, n + 1):
            # the f-string only bakes the field width into the template;
            # the actual values are passed lazily, logging-style
            self.debug(f'Building tree %{string_length}d of %d', i, n)
            # each tree is trained on a bootstrapped (sampled with
            # replacement) subset of the training observations
            index = bootstrapped_sample(X.shape[0])
            tree = self.build_fn(
                X=X[index], y=y[index],
                max_depth=self.max_depth,
                min_split_size=self.min_split_size,
                min_leaf_size=self.min_leaf_size,
                features_subset_size=m)
            ensemble.append(tree)
        self.ensemble_ = ensemble
        # np.unique already returns sorted values — no extra sort needed
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        # bug fix: the resolved subset size was computed but never stored
        self.feature_subset_size_ = m
        return self

    def predict_decisions(self, X, n_trees=None):
        """
        Returns matrix with predicted classes for each
        instance for each of trees in ensemble.

        Arguments:
            X: Observations matrix of shape (n_samples, n_features).
            n_trees: Number of trees to use; defaults to the whole
                ensemble and is clipped to the ensemble size.

        Returns:
            Integer matrix of shape (n_samples, n_trees) with
            per-tree class predictions.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.ensemble_ is None:
            raise RuntimeError('fit method should be called first')
        # a single clamp covers both the default and the oversized case
        if n_trees is None or n_trees > self.n_trees:
            n_trees = self.n_trees
        predictions = np.zeros((X.shape[0], n_trees), dtype=int)
        for tree_index, tree in enumerate(self.ensemble_[:n_trees]):
            predictions[:, tree_index] = self.predict_fn(tree, X)
        return predictions

    def predict_proba(self, X, **params):
        """
        Returns matrix with probabilities per instance per class.

        The probability of a class is the fraction of trees that
        voted for it.
        """
        predictions = self.predict_decisions(X, **params)
        probabilities = np.zeros((X.shape[0], self.n_classes_), dtype=float)
        for sample_index, votes in enumerate(predictions):
            # per-class vote counts; vote values are assumed to be
            # indexes into [0, n_classes) — same assumption the original
            # hand-rolled counting loop made
            counts = np.bincount(votes, minlength=self.n_classes_)
            probabilities[sample_index, :] = counts / counts.sum()
        return probabilities

    def predict(self, X, **params):
        """
        Returns a vector with class predictions (index of the class
        with the highest vote share per instance).
        """
        probabilities = self.predict_proba(X, **params)
        return probabilities.argmax(axis=1)
def _validate_subset_size(X, size):
    """
    Resolves the feature subset size into a concrete integer value.

    Arguments:
        X: Training dataset; only its number of columns is used.
        size: One of: None (use all features), the string 'sqrt'
            (use the integer square root of the number of features),
            or a positive integer not exceeding the number of features.

    Returns:
        The number of features to consider on each tree split.

    Raises:
        ValueError: If an integer size is non-positive or exceeds
            the number of dataset features.
        TypeError: If size is of an unsupported type/value.
    """
    n_features = X.shape[1]
    if size is None:
        return n_features
    if size == 'sqrt':
        return int(math.sqrt(n_features))
    if not isinstance(size, int):
        raise TypeError(f'unexpected value for feature subset size: {size}')
    # bug fix: non-positive sizes previously slipped through validation
    if size < 1:
        raise ValueError(
            f'feature subset size should be positive, '
            f'but is equal to {size:d}')
    if size > n_features:
        raise ValueError(
            f'the dataset has only {n_features:d} features, '
            f'but feature subset size is equal to {size:d}')
    return size