-
Notifications
You must be signed in to change notification settings - Fork 171
/
features.py
243 lines (204 loc) · 10.5 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
"""Feature extraction from curves."""
# License: GNU AGPLv3
from copy import deepcopy
from types import FunctionType
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted, check_array
from ._functions import _AVAILABLE_FUNCTIONS, _implemented_function_recipes, \
_parallel_featurization
from ..utils._docs import adapt_fit_transform_docs
from ..utils.validation import validate_params
@adapt_fit_transform_docs
class StandardFeatures(BaseEstimator, TransformerMixin):
    """Standard features from multi-channel curves.

    A multi-channel (integer sampled) curve is a 2D array of shape
    ``(n_channels, n_bins)``, where each row represents the y-values in one of
    the channels. This transformer applies scalar or vector-valued functions
    channel-wise to extract features from each multi-channel curve in a
    collection. The output is always a 2D array such that row ``i`` is the
    concatenation of the outputs of the chosen functions on the channels in the
    ``i``-th (multi-)curve in the collection.

    Parameters
    ----------
    function : string, callable, list or tuple, optional, default: ``"max"``
        Function or list/tuple of functions to apply to each channel of each
        multi-channel curve. Functions can map to scalars or to 1D arrays. If a
        string (see below) or a callable, then the same function is applied to
        all channels. Otherwise, `function` is a list/tuple of the same length
        as the number of entries along axis 1 in the collection passed to
        :meth:`fit`. Lists/tuples may contain allowed strings (see below),
        callables, and ``None`` in some positions to indicate that no feature
        should be extracted from the corresponding channel. Available strings
        are ``"identity"``, ``"argmin"``, ``"argmax"``, ``"min"``, ``"max"``,
        ``"mean"``, ``"std"``, ``"median"`` and ``"average"``.

    function_params : dict, None, list or tuple, optional, default: ``None``
        Additional keyword arguments for the function or functions in
        `function`. Passing ``None`` is equivalent to passing no arguments.
        Otherwise, if `function` is a single string or callable then
        `function_params` must be a dictionary. For functions encoded by
        allowed strings, the dictionary keys are as follows:

        - If ``function == "average"``, the only key is ``"weights"``
          (np.ndarray or None, default: ``None``).
        - Otherwise, there are no allowed keys.

        If `function` is a list or tuple, `function_params` must be a list or
        tuple of dictionaries (or ``None``) as above, of the same length as
        `function`.

    n_jobs : int or None, optional, default: ``None``
        The number of jobs to use for the computation. ``None`` means 1 unless
        in a :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. Ignored if `function` is one of the allowed string options.

    Attributes
    ----------
    n_channels_ : int
        Number of channels present in the 3D array passed to :meth:`fit`. Must
        match the number of channels in the 3D array passed to
        :meth:`transform`.

    effective_function_ : callable or tuple
        Callable, or tuple of callables or ``None``, describing the function(s)
        used to compute features in each available channel. It is a single
        callable only when `function` was passed as a string.

    effective_function_params_ : dict or tuple
        Dictionary or tuple of dictionaries containing all information present
        in `function_params` as well as relevant quantities computed in
        :meth:`fit`. It is a single dict only when `function` was passed as a
        string. ``None``s are converted to empty dictionaries.

    """

    # Reference specification consumed by ``validate_params``. The "in"
    # constraints only make sense for string entries; see ``_validate_params``
    # for how callables are exempted from them.
    _hyperparameters = {
        "function": {"type": (str, FunctionType, list, tuple),
                     "in": tuple(_AVAILABLE_FUNCTIONS.keys()),
                     "of": {"type": (str, FunctionType, type(None)),
                            "in": tuple(_AVAILABLE_FUNCTIONS.keys())}},
        "function_params": {"type": (dict, type(None), list, tuple)},
        }

    def __init__(self, function="max", function_params=None, n_jobs=None):
        self.function = function
        self.function_params = function_params
        self.n_jobs = n_jobs

    def _validate_params(self):
        """Validate `function` and `function_params` against
        :attr:`_hyperparameters` and check that they are mutually compatible.

        Raises
        ------
        ValueError
            Re-raised from ``validate_params`` when the failure is not the
            tolerated "callable inside a list/tuple" case described in the
            body.

        TypeError
            If `function` is a list/tuple while `function_params` is a dict,
            or if `function` is a string/callable while `function_params` is
            a list/tuple.

        """
        params = self.get_params().copy()
        # Work on a copy so that the class-level specification is not mutated.
        _hyperparameters = deepcopy(self._hyperparameters)
        if not isinstance(self.function, str):
            # The top-level membership check only applies to string names.
            _hyperparameters["function"].pop("in")
        try:
            validate_params(params, _hyperparameters, exclude=["n_jobs"])
        # Another go if we fail because function is a list/tuple containing
        # instances of FunctionType and the "in" key checks fail
        except ValueError as ve:
            end_string = f"which is not in " \
                         f"{tuple(_AVAILABLE_FUNCTIONS.keys())}."
            function = params["function"]
            if ve.args[0].endswith(end_string) \
                    and isinstance(function, (list, tuple)):
                # Retry with the string entries only, so that genuinely
                # unknown string names are still reported.
                params["function"] = [f for f in function
                                      if isinstance(f, str)]
                validate_params(params, _hyperparameters, exclude=["n_jobs"])
            else:
                raise

        if isinstance(self.function, (list, tuple)) \
                and isinstance(self.function_params, dict):
            raise TypeError("If `function` is a list/tuple then "
                            "`function_params` must be a list/tuple of dict, "
                            "or None.")
        elif isinstance(self.function, (str, FunctionType)) \
                and isinstance(self.function_params, (list, tuple)):
            raise TypeError("If `function` is a string or a callable "
                            "function then `function_params` must be a dict "
                            "or None.")

    def fit(self, X, y=None):
        """Compute :attr:`n_channels_` and :attr:`effective_function_params_`.
        Then, return the estimator.

        This function is here to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_channels, n_bins)
            Input data. Collection of multi-channel curves.

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        self : object

        """
        # Keep the validated array: array-likes such as nested lists have no
        # ``ndim``/``shape``, so the converted result must be the one that is
        # inspected below (this mirrors what ``transform`` does).
        X = check_array(X, ensure_2d=False, allow_nd=True)
        if X.ndim != 3:
            raise ValueError("Input must be 3-dimensional.")
        self._validate_params()

        self.n_channels_ = X.shape[1]

        if isinstance(self.function, str):
            # Single named function: one recipe and one parameter dict.
            self.effective_function_ = \
                _implemented_function_recipes[self.function]

            if self.function_params is None:
                self.effective_function_params_ = {}
            else:
                validate_params(self.function_params,
                                _AVAILABLE_FUNCTIONS[self.function])
                self.effective_function_params_ = self.function_params.copy()

        elif isinstance(self.function, FunctionType):
            # Single callable: replicated once per channel.
            self.effective_function_ = \
                tuple([self.function] * self.n_channels_)

            if self.function_params is None:
                self.effective_function_params_ = \
                    tuple([{}] * self.n_channels_)
            else:
                self.effective_function_params_ = \
                    tuple([self.function_params.copy()] * self.n_channels_)

        else:
            # One entry (string, callable or None) per channel.
            n_functions = len(self.function)
            if n_functions != self.n_channels_:
                raise ValueError(
                    f"`function` has length {n_functions} while curves in `X` "
                    f"have {self.n_channels_} channels."
                    )

            # Scratch value kept local: only trailing-underscore attributes
            # are stored on the estimator during fit.
            if self.function_params is None:
                function_params = [{}] * self.n_channels_
            else:
                function_params = self.function_params
                n_function_params = len(function_params)
                if n_function_params != self.n_channels_:
                    raise ValueError(f"`function_params` has length "
                                     f"{n_function_params} while curves in "
                                     f"`X` have {self.n_channels_} channels.")

            effective_function = []
            effective_function_params = []
            for f, p in zip(self.function, function_params):
                # ``None`` means "no arguments"; normalise it before
                # validation, as done in the single-string branch above.
                p = {} if p is None else p
                if isinstance(f, str):
                    validate_params(p, _AVAILABLE_FUNCTIONS[f])
                    effective_function.append(
                        _implemented_function_recipes[f]
                        )
                else:
                    effective_function.append(f)
                # Copy so that later changes to the user's dicts do not leak
                # into the fitted state.
                effective_function_params.append(p.copy())

            self.effective_function_ = tuple(effective_function)
            self.effective_function_params_ = \
                tuple(effective_function_params)

        return self

    def transform(self, X, y=None):
        """Compute features of multi-channel curves.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_channels, n_bins)
            Input collection of multi-channel curves.

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_features)
            Output collection of features of multi-channel curves.
            ``n_features`` is the sum of the number of features output by the
            (non-``None``) functions on their respective channels.

        """
        check_is_fitted(self)
        Xt = check_array(X, ensure_2d=False, allow_nd=True)
        if Xt.ndim != 3:
            raise ValueError("Input must be 3-dimensional.")
        if Xt.shape[1] != self.n_channels_:
            raise ValueError(f"Number of channels must be the same as in "
                             f"`fit`. Passed {Xt.shape[1]}, expected "
                             f"{self.n_channels_}.")

        Xt = _parallel_featurization(Xt, self.effective_function_,
                                     self.effective_function_params_,
                                     self.n_jobs)

        return Xt