Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Column transformer for segmented data #9

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
41 changes: 41 additions & 0 deletions examples/column_transformer.py
@@ -0,0 +1,41 @@
'''
=========================================
Simple SegmentedColumnTransformer Example
=========================================

This example demonstrates how to use the SegmentedColumnTransformer on segmented data.
Note that contextual data is not supported.
'''

# Author: Matthias Gazzari
# License: BSD

from seglearn.transform import SegmentXY, FeatureRep, SegmentedColumnTransformer
from seglearn.feature_functions import minimum
from seglearn.base import TS_Data

import numpy as np

# A single multivariate time series (8 time steps, 2 variables) with one
# boolean target value per time step.
X = [np.arange(16).reshape(8, 2)]
y = [np.array([True, False, False, True, False, True, False, True])]

# Slice the series into overlapping windows of width 4.
segment = SegmentXY(width=4, overlap=1)
X, y, _ = segment.fit_transform(X, y)

print('After segmentation:')
print(X, X.shape)
print(y, y.shape)

# Apply a separate FeatureRep to each column subset; every tuple is
# (name, transformer, column specification).
col_trans = SegmentedColumnTransformer([
    ('a', FeatureRep(features={'min_0': minimum}), 0),
    ('b', FeatureRep(features={'min_1': minimum}), 1),
    ('c', FeatureRep(features={'min_all': minimum}), [0, 1]),
    # alternative column specifications:
    #('c', FeatureRep(features={'min_all': minimum}), lambda x: [0,1]),
    #('c', FeatureRep(features={'min_all': minimum}), slice(0,2)),
    #('c', FeatureRep(features={'min_all': minimum}), [True, True]),
])

print('After column-wise feature extraction:')
X = col_trans.fit_transform(X, y)
print(X, X.shape)
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,3 +1,3 @@
numpy
scipy
scikit-learn>=0.19
scikit-learn>=0.20
5 changes: 3 additions & 2 deletions seglearn/__init__.py
Expand Up @@ -13,8 +13,9 @@
from . import transform, pipe, util, split, datasets, feature_functions

__all__ = ['TS_Data', 'FeatureRep', 'PadTrunc', 'Interp', 'Pype', 'SegmentX', 'SegmentXY',
'SegmentXYForecast', 'TemporalKFold', 'temporal_split', 'check_ts_data', 'ts_stats',
'get_ts_data_parts', 'all_features', 'base_features', 'load_watch', '__version__']
'SegmentXYForecast', 'SegmentedColumnTransformer', 'TemporalKFold', 'temporal_split',
'check_ts_data', 'ts_stats', 'get_ts_data_parts', 'all_features', 'base_features',
'load_watch', '__version__']

__author__ = 'David Burns david.mo.burns@gmail.com'

30 changes: 30 additions & 0 deletions seglearn/tests/test_transform.py
Expand Up @@ -371,3 +371,33 @@ def test_interp():
assert len(Xc[0]) == N / 5
assert len(yc[0]) == N / 5
assert np.all(np.isin(yc, np.arange(6)))


def test_columntransformer():
    """A single-column transformer produces exactly one feature per segment."""
    n_samples = 100
    seg_width = 5
    n_vars = 5
    seg = transform.SegmentXY(width=seg_width)
    col_trans = transform.SegmentedColumnTransformer(transformers=[
        ("a", transform.FeatureRep(features={"mean": mean}), 0)
    ])

    # multivariate ts data without context data
    X = [np.random.rand(n_samples, n_vars) for _ in range(3)]
    y = [np.random.rand(n_samples) for _ in range(3)]
    seg.fit(X, y)
    Xs, ys, _ = seg.transform(X, y)
    col_trans.fit(Xs)
    Xf = col_trans.transform(Xs)
    assert Xf.shape == (len(ys), 1)

    # univariate ts data without context data (series of differing lengths)
    X = [np.random.rand(k * n_samples) for k in (1, 2, 3)]
    y = [np.random.rand(k * n_samples) for k in (1, 2, 3)]
    seg.fit(X, y)
    Xs, ys, _ = seg.transform(X, y)
    col_trans.fit(Xs)
    Xf = col_trans.transform(Xs)
    assert Xf.shape == (len(ys), 1)
63 changes: 60 additions & 3 deletions seglearn/transform.py
Expand Up @@ -5,16 +5,18 @@
# License: BSD

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_random_state, check_array
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.utils import check_random_state, check_array, Parallel, delayed
from sklearn.exceptions import NotFittedError
from sklearn.compose import ColumnTransformer
from scipy.interpolate import interp1d

from .feature_functions import base_features
from .base import TS_Data
from .util import get_ts_data_parts, check_ts_data

__all__ = ['SegmentX', 'SegmentXY', 'SegmentXYForecast', 'PadTrunc', 'Interp', 'FeatureRep']
__all__ = ['SegmentX', 'SegmentXY', 'SegmentXYForecast', 'PadTrunc', 'Interp', 'FeatureRep',
'SegmentedColumnTransformer']


class XyTransformerMixin(object):
Expand Down Expand Up @@ -915,3 +917,58 @@ def _generate_feature_labels(self, X):
f_labels += s_labels

return f_labels


# TODO: Add support for contextual data.
class SegmentedColumnTransformer(ColumnTransformer):
    '''
    Apply specified transformers to columns of a numpy array of segmented time series data.

    EXPERIMENTAL: This transformer is based on the sklearn ColumnTransformer which may change
    without deprecation warnings between releases.

    This transformer allows the application of specified groups of feature functions (using
    FeatureRep) to a subset of columns, e.g. when dealing with heterogeneous data. The order of the
    final output is determined by the transformers list. Non-specified columns are dropped.

    NOTE(review): the overridden methods below mirror private sklearn internals
    (``_validate_remainder``, ``_fit_transform``, ``_iter``) as they exist in
    scikit-learn 0.20 (see requirements bump) -- verify against the installed
    sklearn version before upgrading, since these hooks are not a stable API.

    Parameters
    ----------
    transformers : list of tuples (name, transformer, column(s))
    sparse_threshold : float (default=0.3) threshold value to switch between dense and sparse output
    n_jobs : int or None (default=None) to specify the number of jobs to run in parallel
    transformer_weights : dict (default=None) to specify the multiplicative weight of a transformer

    Consult the ColumnTransformer documentation for more detailed information on the parameters.

    Attributes
    ----------
    transformers_ : list of fitted transformers as tuples of (name, fitted_transformer, column)
    named_transformers_ : (read-only) Bunch object of fitted transformers indexed by their names
    sparse_output : boolean indicating whether the output is a sparse matrix or a dense numpy array

    Consult the ColumnTransformer documentation for more detailed information on the attributes.
    '''

    def __init__(self, transformers, sparse_threshold=0.3, n_jobs=None, transformer_weights=None):
        # changes to original: remove the remainder parameter
        # (remainder handling is disabled entirely in _validate_remainder below,
        # so non-specified columns are always dropped)
        super(SegmentedColumnTransformer, self).__init__(
            transformers=transformers,
            sparse_threshold=sparse_threshold,
            n_jobs=n_jobs,
            transformer_weights=transformer_weights
        )

    def _validate_remainder(self, X):
        # changes to original: disable remainder handling
        # Setting a (name, None, None) placeholder keeps the attribute the parent
        # class expects while ensuring no 'passthrough'/'drop' remainder logic runs.
        self._remainder = ('remainder', None, None)

    def _fit_transform(self, X, y, func, fitted=False):
        # changes to original:
        # - replace _get_column(X, column) with np.atleast_3d(X)[:, :, column]
        # - replace_strings=False (disable 'passthrough' and 'drop' handling)
        # - remove 2D data specific exception handling
        #
        # np.atleast_3d lifts univariate segment arrays of shape
        # (n_segments, width) to (n_segments, width, 1) so that column
        # selection on the last (variable) axis works uniformly; multivariate
        # input (n_segments, width, n_vars) passes through unchanged.
        # NOTE(review): assumes X is a dense segmented array -- contextual
        # TS_Data is not supported here (see class-level TODO).
        return Parallel(n_jobs=self.n_jobs)(
            delayed(func)(
                clone(trans) if not fitted else trans, np.atleast_3d(X)[:, :, column], y, weight
            ) for _, trans, column, weight in self._iter(fitted=fitted, replace_strings=False)
        )