Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Column transformer for segmented data #9

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
41 changes: 41 additions & 0 deletions examples/column_transformer.py
@@ -0,0 +1,41 @@
'''
=========================================
Simple SegmentedColumnTransformer Example
=========================================

This example demonstrates how to use the SegmentedColumnTransformer on segmented data.
Note that contextual data is not supported.
'''

# Author: Matthias Gazzari
# License: BSD

from seglearn.transform import SegmentXY, FeatureRep, SegmentedColumnTransformer
from seglearn.feature_functions import minimum
from seglearn.base import TS_Data

import numpy as np

# A single multivariate time series (8 time steps, 2 variables) with one
# boolean target value per time step.
X = [np.arange(16).reshape(8, 2)]
y = [np.array([True, False, False, True, False, True, False, True])]

# Slice the series into overlapping windows of width 4.
segment = SegmentXY(width=4, overlap=1)
X, y, _ = segment.fit_transform(X, y)

print('After segmentation:')
print(X, X.shape)
print(y, y.shape)

# Apply a separate FeatureRep to each column subset; every tuple is
# (name, transformer, column specification).
col_trans = SegmentedColumnTransformer([
    ('a', FeatureRep(features={'min_0': minimum}), 0),
    ('b', FeatureRep(features={'min_1': minimum}), 1),
    ('c', FeatureRep(features={'min_all': minimum}), [0, 1]),
    # alternative column specifications:
    #('c', FeatureRep(features={'min_all': minimum}), lambda x: [0,1]),
    #('c', FeatureRep(features={'min_all': minimum}), slice(0,2)),
    #('c', FeatureRep(features={'min_all': minimum}), [True, True]),
])

print('After column-wise feature extraction:')
X = col_trans.fit_transform(X, y)
print(X, X.shape)
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,3 +1,3 @@
numpy
scipy
scikit-learn>=0.19
scikit-learn>=0.20
5 changes: 3 additions & 2 deletions seglearn/__init__.py
Expand Up @@ -13,8 +13,9 @@
from . import transform, pipe, util, split, datasets, feature_functions

__all__ = ['TS_Data', 'FeatureRep', 'PadTrunc', 'Interp', 'Pype', 'SegmentX', 'SegmentXY',
'SegmentXYForecast', 'TemporalKFold', 'temporal_split', 'check_ts_data', 'ts_stats',
'get_ts_data_parts', 'all_features', 'base_features', 'load_watch', '__version__']
'SegmentXYForecast', 'SegmentedColumnTransformer', 'TemporalKFold', 'temporal_split',
'check_ts_data', 'ts_stats', 'get_ts_data_parts', 'all_features', 'base_features',
'load_watch', '__version__']

__author__ = 'David Burns david.mo.burns@gmail.com'

30 changes: 30 additions & 0 deletions seglearn/tests/test_transform.py
Expand Up @@ -371,3 +371,33 @@ def test_interp():
assert len(Xc[0]) == N / 5
assert len(yc[0]) == N / 5
assert np.all(np.isin(yc, np.arange(6)))


def test_columntransformer():
    """A single-column transformer produces exactly one feature per segment."""
    n_samples = 100
    seg_width = 5
    n_vars = 5
    seg = transform.SegmentXY(width=seg_width)
    col_trans = transform.SegmentedColumnTransformer(transformers=[
        ("a", transform.FeatureRep(features={"mean": mean}), 0)
    ])

    # multivariate ts data without context data
    X = [np.random.rand(n_samples, n_vars) for _ in range(3)]
    y = [np.random.rand(n_samples) for _ in range(3)]
    seg.fit(X, y)
    Xs, ys, _ = seg.transform(X, y)
    col_trans.fit(Xs)
    Xf = col_trans.transform(Xs)
    assert Xf.shape == (len(ys), 1)

    # univariate ts data without context data (series of differing lengths)
    X = [np.random.rand(k * n_samples) for k in (1, 2, 3)]
    y = [np.random.rand(k * n_samples) for k in (1, 2, 3)]
    seg.fit(X, y)
    Xs, ys, _ = seg.transform(X, y)
    col_trans.fit(Xs)
    Xf = col_trans.transform(Xs)
    assert Xf.shape == (len(ys), 1)
63 changes: 60 additions & 3 deletions seglearn/transform.py
Expand Up @@ -5,16 +5,18 @@
# License: BSD

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_random_state, check_array
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.utils import check_random_state, check_array, Parallel, delayed
from sklearn.exceptions import NotFittedError
from sklearn.compose import ColumnTransformer
from scipy.interpolate import interp1d

from .feature_functions import base_features
from .base import TS_Data
from .util import get_ts_data_parts, check_ts_data

__all__ = ['SegmentX', 'SegmentXY', 'SegmentXYForecast', 'PadTrunc', 'Interp', 'FeatureRep']
__all__ = ['SegmentX', 'SegmentXY', 'SegmentXYForecast', 'PadTrunc', 'Interp', 'FeatureRep',
'SegmentedColumnTransformer']


class XyTransformerMixin(object):
Expand Down Expand Up @@ -915,3 +917,58 @@ def _generate_feature_labels(self, X):
f_labels += s_labels

return f_labels


# TODO: Add support for contextual data.
class SegmentedColumnTransformer(ColumnTransformer):
    '''
    Apply specified transformers to columns of a numpy array of segmented time series data.

    EXPERIMENTAL: This transformer is based on the sklearn ColumnTransformer which may change
    without deprecation warnings between releases.

    This transformer allows the application of specified groups of feature functions (using
    FeatureRep) to a subset of columns, e.g. when dealing with heterogeneous data. The order of the
    final output is determined by the transformers list. Non-specified columns are dropped.

    NOTE(review): the overridden methods below mirror private sklearn internals
    (``_validate_remainder``, ``_fit_transform``, ``_iter``) as they exist in
    scikit-learn 0.20 (see requirements bump) -- verify against the installed
    sklearn version before upgrading, since these hooks are not a stable API.

    Parameters
    ----------
    transformers : list of tuples (name, transformer, column(s))
    sparse_threshold : float (default=0.3) threshold value to switch between dense and sparse output
    n_jobs : int or None (default=None) to specify the number of jobs to run in parallel
    transformer_weights : dict (default=None) to specify the multiplicative weight of a transformer

    Consult the ColumnTransformer documentation for more detailed information on the parameters.

    Attributes
    ----------
    transformers_ : list of fitted transformers as tuples of (name, fitted_transformer, column)
    named_transformers_ : (read-only) Bunch object of fitted transformers indexed by their names
    sparse_output : boolean indicating whether the output is a sparse matrix or a dense numpy array

    Consult the ColumnTransformer documentation for more detailed information on the attributes.
    '''

    def __init__(self, transformers, sparse_threshold=0.3, n_jobs=None, transformer_weights=None):
        # changes to original: remove the remainder parameter
        # (remainder handling is disabled entirely in _validate_remainder below,
        # so non-specified columns are always dropped)
        super(SegmentedColumnTransformer, self).__init__(
            transformers=transformers,
            sparse_threshold=sparse_threshold,
            n_jobs=n_jobs,
            transformer_weights=transformer_weights
        )

    def _validate_remainder(self, X):
        # changes to original: disable remainder handling
        # Setting a (name, None, None) placeholder keeps the attribute the parent
        # class expects while ensuring no 'passthrough'/'drop' remainder logic runs.
        self._remainder = ('remainder', None, None)

    def _fit_transform(self, X, y, func, fitted=False):
        # changes to original:
        # - replace _get_column(X, column) with np.atleast_3d(X)[:, :, column]
        # - replace_strings=False (disable 'passthrough' and 'drop' handling)
        # - remove 2D data specific exception handling
        #
        # np.atleast_3d lifts univariate segment arrays of shape
        # (n_segments, width) to (n_segments, width, 1) so that column
        # selection on the last (variable) axis works uniformly; multivariate
        # input (n_segments, width, n_vars) passes through unchanged.
        # NOTE(review): assumes X is a dense segmented array -- contextual
        # TS_Data is not supported here (see class-level TODO).
        return Parallel(n_jobs=self.n_jobs)(
            delayed(func)(
                clone(trans) if not fitted else trans, np.atleast_3d(X)[:, :, column], y, weight
            ) for _, trans, column, weight in self._iter(fitted=fitted, replace_strings=False)
        )