From 1cbb0d9eddda704dd664fd5f116478df12eeb3b7 Mon Sep 17 00:00:00 2001 From: Matthias Gazzari Date: Wed, 28 Nov 2018 02:52:36 +0100 Subject: [PATCH 1/3] transform: Add SegmentedColumnTransformer The main use case for this transformer is to enable the application of specified groups of feature functions to specified columns of data, e.g. when dealing with heterogeneous data. The SegmentedColumnTransformer is derived from the sklearn ColumnTransformer and adapted to be used inside a Pype object after a segment transformation. The adaption mainly consists of: - adapt the notation of a column (ColumnTransformer iterates over the second dimension, segmented data must be iterated over the third dimension). - disable "drop" and "passthrough" transform options for simplicity and drop non-specified columns by default Note: SegmentedColumnTransformer does not support contextual data. --- requirements.txt | 2 +- seglearn/__init__.py | 5 ++-- seglearn/transform.py | 63 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8299ec7..4a94fbe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ numpy scipy -scikit-learn>=0.19 +scikit-learn>=0.20 diff --git a/seglearn/__init__.py b/seglearn/__init__.py index b11b6f5..690fb44 100644 --- a/seglearn/__init__.py +++ b/seglearn/__init__.py @@ -13,8 +13,9 @@ from . 
import transform, pipe, util, split, datasets, feature_functions __all__ = ['TS_Data', 'FeatureRep', 'PadTrunc', 'Interp', 'Pype', 'SegmentX', 'SegmentXY', - 'SegmentXYForecast', 'TemporalKFold', 'temporal_split', 'check_ts_data', 'ts_stats', - 'get_ts_data_parts', 'all_features', 'base_features', 'load_watch', '__version__'] + 'SegmentXYForecast', 'SegmentedColumnTransformer', 'TemporalKFold', 'temporal_split', + 'check_ts_data', 'ts_stats', 'get_ts_data_parts', 'all_features', 'base_features', + 'load_watch', '__version__'] __author__ = 'David Burns david.mo.burns@gmail.com' diff --git a/seglearn/transform.py b/seglearn/transform.py index 4c85999..b403e65 100644 --- a/seglearn/transform.py +++ b/seglearn/transform.py @@ -5,16 +5,18 @@ # License: BSD import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils import check_random_state, check_array +from sklearn.base import BaseEstimator, TransformerMixin, clone +from sklearn.utils import check_random_state, check_array, Parallel, delayed from sklearn.exceptions import NotFittedError +from sklearn.compose import ColumnTransformer from scipy.interpolate import interp1d from .feature_functions import base_features from .base import TS_Data from .util import get_ts_data_parts, check_ts_data -__all__ = ['SegmentX', 'SegmentXY', 'SegmentXYForecast', 'PadTrunc', 'Interp', 'FeatureRep'] +__all__ = ['SegmentX', 'SegmentXY', 'SegmentXYForecast', 'PadTrunc', 'Interp', 'FeatureRep', + 'SegmentedColumnTransformer'] class XyTransformerMixin(object): @@ -915,3 +917,58 @@ def _generate_feature_labels(self, X): f_labels += s_labels return f_labels + + +# TODO: Add support for contextual data. +class SegmentedColumnTransformer(ColumnTransformer): + ''' + Apply specified transformers to columns of a numpy array of segmented time series data. + + EXPERIMENTAL: This transformer is based on the sklearn ColumnTransformer which may change + without deprecation warnings between releases. 
+ + This transformer allows the application of specified groups of feature functions (using + FeatureRep) to a subset of columns, e.g. when dealing with heterogeneous data. The order of the + final output is determined by the transformers list. Non-specified columns are dropped. + + Parameters + ---------- + transformers : list of tuples (name, transformer, column(s)) + sparse_threshold : float (default=0.3) threshold value to switch between dense and sparse output + n_jobs : int or None (default=None) to specify the number of jobs to run in parallel + transformer_weights : dict (default=None) to specify the multiplicative weight of a transformer + + Consult the ColumnTransformer documentation for more detailed information on the parameters. + + Attributes + ---------- + transformers_ : list of fitted transformers as tuples of (name, fitted_transformer, column) + named_transformers_ : (read-only) Bunch object of fitted transformers indexed by their names + sparse_output_ : boolean indicating whether the output is a sparse matrix or a dense numpy array + + Consult the ColumnTransformer documentation for more detailed information on the attributes. 
+ ''' + + def __init__(self, transformers, sparse_threshold=0.3, n_jobs=None, transformer_weights=None): + # changes to original: remove the remainder parameter + super(SegmentedColumnTransformer, self).__init__( + transformers=transformers, + sparse_threshold=sparse_threshold, + n_jobs=n_jobs, + transformer_weights=transformer_weights + ) + + def _validate_remainder(self, X): + # changes to original: disable remainder handling + self._remainder = ('remainder', None, None) + + def _fit_transform(self, X, y, func, fitted=False): + # changes to original: + # - replace _get_column(X, column) with np.atleast_3d(X)[:, :, column] + # - replace_strings=False (disable 'passthrough' and 'drop' handling) + # - remove 2D data specific exception handling + return Parallel(n_jobs=self.n_jobs)( + delayed(func)( + clone(trans) if not fitted else trans, np.atleast_3d(X)[:, :, column], y, weight + ) for _, trans, column, weight in self._iter(fitted=fitted, replace_strings=False) + ) From fc53756ad3a111b70282e334b62f32b9383206d6 Mon Sep 17 00:00:00 2001 From: Matthias Gazzari Date: Wed, 28 Nov 2018 04:55:21 +0100 Subject: [PATCH 2/3] tests: Add test for SegmentedColumnTransformer --- seglearn/tests/test_transform.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/seglearn/tests/test_transform.py b/seglearn/tests/test_transform.py index 8e51b2b..63e19ff 100644 --- a/seglearn/tests/test_transform.py +++ b/seglearn/tests/test_transform.py @@ -371,3 +371,33 @@ def test_interp(): assert len(Xc[0]) == N / 5 assert len(yc[0]) == N / 5 assert np.all(np.isin(yc, np.arange(6))) + + +def test_columntransformer(): + Nt = 100 + width = 5 + nvars = 5 + seg = transform.SegmentXY(width=width) + colTrans = transform.SegmentedColumnTransformer(transformers=[ + ("a", transform.FeatureRep(features={"mean": mean}), 0) + ]) + + # multivariate ts data without context data + X = [np.random.rand(Nt, nvars), np.random.rand(Nt, nvars), np.random.rand(Nt, nvars)] + y = 
[np.random.rand(Nt), np.random.rand(Nt), np.random.rand(Nt)] + seg.fit(X, y) + Xs, ys, _ = seg.transform(X, y) + colTrans.fit(Xs) + Xf = colTrans.transform(Xs) + N = len(ys) + assert Xf.shape == (N, 1) + + # univariate ts data without context data + X = [np.random.rand(Nt), np.random.rand(2 * Nt), np.random.rand(3 * Nt)] + y = [np.random.rand(Nt), np.random.rand(2 * Nt), np.random.rand(3 * Nt)] + seg.fit(X, y) + Xs, ys, _ = seg.transform(X, y) + colTrans.fit(Xs) + Xf = colTrans.transform(Xs) + N = len(ys) + assert Xf.shape == (N, 1) From e1a9df1b78ea0d970730d712b87281230c9a35dd Mon Sep 17 00:00:00 2001 From: Matthias Gazzari Date: Wed, 28 Nov 2018 04:59:34 +0100 Subject: [PATCH 3/3] examples: Add SegmentedColumnTransformer example --- examples/column_transformer.py | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 examples/column_transformer.py diff --git a/examples/column_transformer.py b/examples/column_transformer.py new file mode 100644 index 0000000..ed02d8d --- /dev/null +++ b/examples/column_transformer.py @@ -0,0 +1,41 @@ +''' +========================================= +Simple SegmentedColumnTransformer Example +========================================= + +This example demonstrates how to use the SegmentedColumnTransformer on segmented data. +Note that contextual data is not supported. 
+''' + +# Author: Matthias Gazzari +# License: BSD + +from seglearn.transform import SegmentXY, FeatureRep, SegmentedColumnTransformer +from seglearn.feature_functions import minimum +from seglearn.base import TS_Data + +import numpy as np + +X = [np.array([[0,1], [2,3], [4,5], [6,7], [8,9], [10,11], [12,13], [14,15]])] +y = [np.array([True, False, False, True, False, True, False, True])] + +segment = SegmentXY(width=4, overlap=1) +X, y, _ = segment.fit_transform(X, y) + +print('After segmentation:') +print(X, X.shape) +print(y, y.shape) + +col_trans = SegmentedColumnTransformer([ + ('a', FeatureRep(features={'min_0': minimum}), 0), + ('b', FeatureRep(features={'min_1': minimum}), 1), + ('c', FeatureRep(features={'min_all': minimum}), [0,1]), + # alternative column specifications: + #('c', FeatureRep(features={'min_all': minimum}), lambda x: [0,1]), + #('c', FeatureRep(features={'min_all': minimum}), slice(0,2)), + #('c', FeatureRep(features={'min_all': minimum}), [True, True]), +]) + +print('After column-wise feature extraction:') +X = col_trans.fit_transform(X, y) +print(X, X.shape)