'''
=========================================
Simple SegmentedColumnTransformer Example
=========================================

Shows how the SegmentedColumnTransformer applies a separate FeatureRep to
selected columns of already segmented time series data.
Note that contextual data is not supported.
'''

# Author: Matthias Gazzari
# License: BSD

from seglearn.transform import SegmentXY, FeatureRep, SegmentedColumnTransformer
from seglearn.feature_functions import minimum
from seglearn.base import TS_Data

import numpy as np

# a single time series with 8 samples of 2 variables: [[0, 1], [2, 3], ..., [14, 15]]
X = [np.arange(16).reshape(8, 2)]
y = [np.array([True, False, False, True, False, True, False, True])]

segmenter = SegmentXY(width=4, overlap=1)
X, y, _ = segmenter.fit_transform(X, y)

print('After segmentation:')
for segmented in (X, y):
    print(segmented, segmented.shape)

# Each entry is (name, transformer, column specification).
min_of_first = ('a', FeatureRep(features={'min_0': minimum}), 0)
min_of_second = ('b', FeatureRep(features={'min_1': minimum}), 1)
min_of_both = ('c', FeatureRep(features={'min_all': minimum}), [0, 1])
# Alternative column specifications for the last entry:
#   lambda x: [0, 1]    (callable returning the columns)
#   slice(0, 2)         (slice over the columns)
#   [True, True]        (boolean mask over the columns)
column_transformer = SegmentedColumnTransformer([min_of_first, min_of_second, min_of_both])

print('After column-wise feature extraction:')
X = column_transformer.fit_transform(X, y)
print(X, X.shape)
def test_columntransformer():
    '''Check the output shape of a one-column SegmentedColumnTransformer.

    A single "mean" FeatureRep is applied to column 0 of segmented data; the
    transformed output is expected to have shape (len(ys), 1) for both
    multivariate and univariate input without context data.
    '''
    n_samples = 100
    n_vars = 5
    segmenter = transform.SegmentXY(width=5)
    mean_rep = transform.FeatureRep(features={"mean": mean})
    col_trans = transform.SegmentedColumnTransformer(transformers=[("a", mean_rep, 0)])

    def check_single_feature(X, y):
        # segment, then fit and transform in separate steps, and verify the shape
        segmenter.fit(X, y)
        Xs, ys, _ = segmenter.transform(X, y)
        col_trans.fit(Xs)
        Xf = col_trans.transform(Xs)
        assert Xf.shape == (len(ys), 1)

    # multivariate ts data without context data
    X = [np.random.rand(n_samples, n_vars) for _ in range(3)]
    y = [np.random.rand(n_samples) for _ in range(3)]
    check_single_feature(X, y)

    # univariate ts data (series of different lengths) without context data
    X = [np.random.rand(k * n_samples) for k in (1, 2, 3)]
    y = [np.random.rand(k * n_samples) for k in (1, 2, 3)]
    check_single_feature(X, y)
import numpy as np
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.utils import Parallel, delayed


# TODO: Add support for contextual data.
class SegmentedColumnTransformer(ColumnTransformer):
    '''
    Apply specified transformers to columns of a numpy array of segmented time series data.

    EXPERIMENTAL: This transformer is based on the sklearn ColumnTransformer which may change
    without deprecation warnings between releases. It requires scikit-learn >= 0.20, the
    release that provides ``sklearn.compose.ColumnTransformer``.

    This transformer allows the application of specified groups of feature functions (using
    FeatureRep) to a subset of columns, e.g. when dealing with heterogeneous data. The order of
    the final output is determined by the transformers list. Non-specified columns are dropped.

    Columns are selected along the third axis of ``np.atleast_3d(X)``, so 2D input is treated
    as having a single column (index 0).

    Parameters
    ----------
    transformers : list of tuples (name, transformer, column(s))
        The column specification is used to index the third axis of the data. A callable
        specification is called with ``X`` and must return such an index.
    sparse_threshold : float (default=0.3)
        threshold value to switch between dense and sparse output
    n_jobs : int or None (default=None)
        number of jobs to run in parallel
    transformer_weights : dict (default=None)
        multiplicative weight per transformer, keyed by transformer name

    Consult the ColumnTransformer documentation for more detailed information on the parameters.

    Attributes
    ----------
    transformers_ : list
        fitted transformers as tuples of (name, fitted_transformer, column)
    named_transformers_ : Bunch
        (read-only) fitted transformers indexed by their names
    sparse_output_ : boolean
        indicates whether the output is a sparse matrix or a dense numpy array

    Consult the ColumnTransformer documentation for more detailed information on the attributes.
    '''

    def __init__(self, transformers, sparse_threshold=0.3, n_jobs=None, transformer_weights=None):
        # changes to original: remove the remainder parameter
        super(SegmentedColumnTransformer, self).__init__(
            transformers=transformers,
            sparse_threshold=sparse_threshold,
            n_jobs=n_jobs,
            transformer_weights=transformer_weights
        )

    def _validate_remainder(self, X):
        '''Disable remainder handling (changes to original): no remainder transformer is set.'''
        self._remainder = ('remainder', None, None)

    def _fit_transform(self, X, y, func, fitted=False):
        '''
        Run ``func`` for every transformer on its selected columns.

        Changes to the sklearn original:
        - replace _get_column(X, column) with an index into the third axis of np.atleast_3d(X)
        - replace_strings=False (disable 'passthrough' and 'drop' handling)
        - remove 2D data specific exception handling

        Parameters
        ----------
        X : array-like
            segmented data; promoted with np.atleast_3d before column selection
        y : passed through unchanged to ``func``
        func : callable
            invoked as ``func(transformer, X_columns, y, weight)``
        fitted : bool (default=False)
            if False each transformer is cloned before use, otherwise it is used as is

        Returns
        -------
        list with the result of ``func`` for each transformer
        '''
        # promote once instead of once per transformer
        X_3d = np.atleast_3d(X)

        def select(column):
            # a callable specification cannot be used as an index, so resolve it first
            if callable(column):
                column = column(X)
            return X_3d[:, :, column]

        return Parallel(n_jobs=self.n_jobs)(
            delayed(func)(trans if fitted else clone(trans), select(column), y, weight)
            for _, trans, column, weight in self._iter(fitted=fitted, replace_strings=False)
        )