Merge a61f90f into d8d7039

dmbee · Nov 7, 2019 · 5555a02 · 5555a02
2 parents d8d7039 + a61f90f
commit 5555a02
Show file tree

Hide file tree

Showing 21 changed files with 711 additions and 149 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -4,13 +4,13 @@ os:
   - linux
 
 python:
-  - "2.7"
   - "3.5"
   - "3.6"
 
 install:
   - pip install numpy
   - pip install scipy
+  - pip install pandas
   - pip install -r requirements.txt
   - pip install .
   - pip install 'pytest>=3.6'

diff --git a/README.rst b/README.rst
@@ -38,12 +38,12 @@ documentation_.
 Dependencies
 ~~~~~~~~~~~~
 
-seglearn is tested to work under Python 2.7 and Python 3.5.
+seglearn is tested to work under Python 3.5 and 3.6.
 The dependency requirements are based on the last scikit-learn release:
 
-* scipy(>=0.13.3)
-* numpy(>=1.8.2)
-* scikit-learn(>=0.19.0)
+* scipy(>=0.17.0)
+* numpy(>=1.11.0)
+* scikit-learn(>=0.21.3)
 
 Additionally, to run the examples, you need:
 

diff --git a/appveyor.yml b/appveyor.yml
@@ -2,19 +2,12 @@ build: false
 
 environment:
   matrix:
-    - PYTHON: "C:\\Miniconda-x64"
-      PYTHON_VERSION: "2.7.x"
-      PYTHON_ARCH: "64"
-      NUMPY_VERSION: "1.13.1"
-      SCIPY_VERSION: "0.19.1"
-      SKLEARN_VERSION: "0.19.1"
-
     - PYTHON: "C:\\Miniconda3-x64"
       PYTHON_VERSION: "3.5.x"
       PYTHON_ARCH: "64"
       NUMPY_VERSION: "1.13.1"
       SCIPY_VERSION: "0.19.1"
-      SKLEARN_VERSION: "0.19.1"
+      SKLEARN_VERSION: "0.21.3"
 
     - PYTHON: "C:\\Miniconda3-x64"
       PYTHON_VERSION: "3.6.x"

diff --git a/doc/change_log.rst b/doc/change_log.rst
@@ -1,6 +1,18 @@
 Change Log
 ==========
 
+Version 1.0.9
+
+* allows for vector targets, to support one-hot or multi-label encoding
+* supports unsorted time series data
+* supports time series data with duplicate time stamps (fixing scipy interp sorting)
+* fixed numpy deprecation warning for test_base
+
+Version 1.0.8
+
+* added function for creating TS_Data from pandas
+* minor bug fixes
+
 Version 1.0.7
 
 * step parameter defined for Segment transformers, which can be used to specify the sliding

diff --git a/doc/install.rst b/doc/install.rst
@@ -5,12 +5,12 @@ Install and contribution
 Dependencies
 ============
 
-Seglearn is tested to work under Python 2.7 and Python 3.5.
+Seglearn is tested to work under Python 3.5 and 3.6.
 The dependency requirements are based on the last scikit-learn release:
 
-* scipy(>=0.13.3)
-* numpy(>=1.8.2)
-* scikit-learn(>=0.19.0)
+* scipy(>=0.17.0)
+* numpy(>=1.11.0)
+* scikit-learn(>=0.21.3)
 
 Additionally, to run the examples, you need:
 

diff --git a/doc/user_guide.rst b/doc/user_guide.rst
@@ -15,7 +15,7 @@ Learning multivariate sequential data with the sliding window method is useful i
 Time Series Data
 ----------------
 
-Sequence and time series data have a general formulation as sequence pairs :math:`\{(\mathbf{X}_i,\mathbf{y}_i)\}_{i=1}^{N}`, where each :math:`\mathbf{X}_i` is a multivariate sequence with :math:`T_i` samples :math:`\langle \mathbf{x}_{i,1}, \mathbf{x}_{i,2},...,\mathbf{x}_{i,T_i} \rangle` and each :math:`\mathbf{y}_i` target is a univariate sequence with :math:`T_i` samples :math:`\langle \mathbf{x}_{i,1}, \mathbf{x}_{i,2},...,\mathbf{x}_{i,T_i} \rangle` and each :math:`\mathbf{y}_i` target is a univariate sequence with :math:`T_i` samples :math:`\langle y_{i,1}, y_{i,2},..., y_{i,T_i} \rangle`. The targets :math:`\mathbf{y}_i` can either be sequences of categorical class labels (for classification problems), or sequences of continuous data (for regression problems). The number of samples :math:`T_i` varies between the sequence pairs in the data set. Time series' with a regular sampling period may be treated equivalently to sequences. Irregularly sampled time series are formulated with an additional sequence variable :math:`\mathbf{t}_i` that increases monotonically and indicates the timing of samples in the data set :math:`\{(\mathbf{t}_i, \mathbf{X}_i,\mathbf{y}_i)\}_{i=1}^{N}`.
+Sequence and time series data have a general formulation as sequence pairs :math:`\{(\mathbf{X}_i,\mathbf{y}_i)\}_{i=1}^{N}`, where each :math:`\mathbf{X}_i` is a multivariate sequence with :math:`T_i` samples :math:`\langle \mathbf{x}_{i,1}, \mathbf{x}_{i,2},...,\mathbf{x}_{i,T_i} \rangle` and each :math:`\mathbf{y}_i` target is a univariate sequence with :math:`T_i` samples :math:`\langle y_{i,1}, y_{i,2},..., y_{i,T_i} \rangle`. The targets :math:`\mathbf{y}_i` can either be sequences of categorical class labels (for classification problems), or sequences of continuous data (for regression problems). The number of samples :math:`T_i` varies between the sequence pairs in the data set. Time series with a regular sampling period may be treated equivalently to sequences. Irregularly sampled time series are formulated with an additional sequence variable :math:`\mathbf{t}_i` that increases monotonically and indicates the timing of samples in the data set :math:`\{(\mathbf{t}_i, \mathbf{X}_i,\mathbf{y}_i)\}_{i=1}^{N}`.
 
 Important sub-classes of the general sequence learning problem are sequence classification and sequence prediction. In sequence classification problems (eg song genre classification), the target for each sequence is a fixed class label :math:`y_i` and the data takes the form :math:`\{(\mathbf{X}_i, y_i)\}_{i=1}^{N}`. Sequence prediction involves predicting a future value of the target :math:`(y_{i,t+f})` or future values :math:`\langle y_{i,t+1}, y_{i,t+2},..., y_{i,t+f} \rangle`, given :math:`\langle \mathbf{x}_{i,1}, \mathbf{x}_{i,2},...,\mathbf{x}_{i,t} \rangle, \langle y_{i,1}, y_{i,2},..., y_{i,t} \rangle`, and sometimes also :math:`\langle \mathbf{x}_{i,t+1}, \mathbf{x}_{i,t+2},...,\mathbf{x}_{i,t+f} \rangle`.
 
@@ -88,6 +88,13 @@ The ``TS_Data`` class is provided as an indexable / iterable that can store time
     >>> Xc = rand(3,2)
     >>> X = TS_Data(Xt, Xc)
 
+``TS_Data`` can be initialized from a pandas DataFrame whose 'ts_data' column holds the time series; the remaining columns are used as the context data::
+
+    >>> import pandas as pd
+    >>> df = pd.DataFrame(Xc)
+    >>> df['ts_data'] = Xt
+    >>> X = TS_Data.from_df(df)
+
 There is a caveat for datasets that are a single time series. For compatibility with the seglearn segmenter classes, they need to be represented as a list::
 
     >>> X = [rand(1000,10)]

diff --git a/examples/plot_imblearn.py b/examples/plot_imblearn.py
@@ -0,0 +1,84 @@
+'''
+===============================
+Simple imbalanced-learn example
+===============================
+
+This example demonstrates how to use imbalanced-learn resample transforms inside a seglearn Pype.
+'''
+
+# Author: Matthias Gazzari
+# License: BSD
+
+import numpy as np
+
+from sklearn.dummy import DummyClassifier
+
+from seglearn.pipe import Pype
+from seglearn.transform import SegmentXY, patch_sampler, FeatureRep
+from seglearn.feature_functions import minimum
+from seglearn.split import temporal_split
+
+
+from imblearn.under_sampling import RandomUnderSampler
+
+# Single time series with 10 samples and 2 variables per sample
+X = [np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5,6], [6, 7], [7, 8], [8, 9], [9, 10]])]
+# Time series target (imbalanced towards False)
+y = [np.array([True, False, False, False, False, False, True, False, False, False])]
+
+print("Implementation details: transform and fit_transform methods:")
+
+pipe = Pype([
+    ('segment', SegmentXY(width=1, overlap=0)),
+    ('resample', patch_sampler(RandomUnderSampler)()),
+])
+print("Pipeline:", pipe)
+
+print("Calling a transform on the data does not change it ...")
+Xf, yf = pipe.transform(X, y)
+print("X (flattened):", Xf.flatten())
+print("y", yf)
+
+print("... but calling fit_transform resamples the data.")
+Xf, yf = pipe.fit_transform(X, y)
+print("X (flattened):", Xf.flatten())
+print("y", yf)
+
+print()
+print("VerboseDummyClassifier example:")
+print()
+
+class VerboseDummyClassifier(DummyClassifier):
+    def fit(self, X, y, sample_weight=None):
+        print("Fitting X (flattened):", X.flatten(), "on y:", y)
+        return super(VerboseDummyClassifier, self).fit(X, y, sample_weight)
+    def predict(self, X):
+        print("Predicting X (flattened):", X.flatten())
+        return super(VerboseDummyClassifier, self).predict(X)
+    def score(self, X, y, sample_weight=None):
+        print("Scoring X (flattened):", X.flatten(), "on y:", y)
+        return super(VerboseDummyClassifier, self).score(X, y, sample_weight)
+
+pipe = Pype([
+    ('segment', SegmentXY(width=1, overlap=0)),
+    ('resample', patch_sampler(RandomUnderSampler)(shuffle=True)),
+    ('feature', FeatureRep(features={"min":minimum})),
+    ('estimator', VerboseDummyClassifier(strategy="constant", constant=True)),
+])
+print("Pipeline:", pipe)
+
+print("Split the data into half training and half test data:")
+X_train, X_test, y_train, y_test = temporal_split(X, y, 0.5)
+print("X_train:", X_train)
+print("y_train:", y_train)
+print("X_test:", X_test)
+print("y_test:", y_test)
+print()
+
+print("Fit on the training data (this includes resampling):")
+pipe.fit(X_train, y_train)
+print()
+
+print("Score the fitted estimator on test data (this excludes resampling):")
+score = pipe.score(X_test, y_test)
+print("Score: ", score)
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,3 @@
 numpy
 scipy
-scikit-learn>=0.19
+scikit-learn>=0.21.3
diff --git a/seglearn/__init__.py b/seglearn/__init__.py
@@ -9,13 +9,14 @@
 from .pipe import Pype
 from .preprocessing import TargetRunLengthEncoder
 from .split import TemporalKFold, temporal_split
-from .transform import SegmentX, SegmentXY, SegmentXYForecast, PadTrunc, Interp, FeatureRep
+from .transform import SegmentX, SegmentXY, SegmentXYForecast, PadTrunc, Interp, InterpLongToWide, FeatureRep, \
+    FeatureRepMix, FunctionTransformer
 from .util import check_ts_data, check_ts_data_with_ts_target, ts_stats, get_ts_data_parts
 
-__all__ = ['TS_Data', 'FeatureRep', 'FeatureRepMix', 'PadTrunc', 'Interp', 'Pype', 'SegmentX',
+__all__ = ['TS_Data', 'FeatureRep', 'FeatureRepMix', 'PadTrunc', 'Interp', 'InterpLongToWide', 'Pype', 'SegmentX',
            'SegmentXY', 'SegmentXYForecast', 'TemporalKFold', 'temporal_split', 'check_ts_data',
            'check_ts_data_with_ts_target', 'ts_stats', 'get_ts_data_parts', 'all_features',
            'base_features', 'load_watch', 'TargetRunLengthEncoder', 'FunctionTransformer',
-           '__version__']
+           'patch_sampler', '__version__']
 
 __author__ = 'David Burns david.mo.burns@gmail.com'
diff --git a/seglearn/_version.py b/seglearn/_version.py
@@ -1 +1 @@
-__version__ = "1.0.7"
+__version__ = "1.0.10"
diff --git a/seglearn/base.py b/seglearn/base.py
@@ -32,6 +32,10 @@ def __init__(self, ts_data, context_data):
         self.N = N
         self.shape = [N]  # need for safe_indexing with sklearn
 
+    @classmethod
+    def from_df(cls, df):
+        return cls(np.array(df['ts_data']), np.array(df.drop(columns=['ts_data'])))
+
     def __iter__(self):
         return self
 
@@ -46,3 +50,4 @@ def __next__(self):
 
     def __len__(self):
         return self.N
+
diff --git a/seglearn/datasets.py b/seglearn/datasets.py
@@ -42,5 +42,5 @@ def load_watch():
     >>> print(data.keys())
     '''
     module_path = dirname(__file__)
-    data = np.load(module_path + "/data/watch_dataset.npy").item()
+    data = np.load(module_path + "/data/watch_dataset.npy", allow_pickle=True).item()
     return data
diff --git a/seglearn/feature_functions.py b/seglearn/feature_functions.py
@@ -65,6 +65,7 @@ def all_features():
                 'abs_energy': abs_energy,
                 'std': std,
                 'var': var,
+                'mad': median_absolute_deviation,
                 'variation': variation,
                 'min': minimum,
                 'max': maximum,
@@ -156,6 +157,11 @@ def var(X):
     return np.var(X, axis=1)
 
 
+def median_absolute_deviation(X):
+    ''' median absolute deviation for each variable in a segmented time series '''
+    return stats.median_absolute_deviation(X, axis=1)
+
+
 def variation(X):
     ''' coefficient of variation '''
     return stats.variation(X, axis=1)

diff --git a/seglearn/pipe.py b/seglearn/pipe.py
@@ -7,7 +7,6 @@
 
 from sklearn.base import BaseEstimator
 from sklearn.pipeline import Pipeline
-from sklearn.externals import six
 
 from .transform import XyTransformerMixin
 
@@ -104,7 +103,7 @@ def _fit(self, X, y=None, **fit_params):
 
         fit_params_steps = dict((name, {}) for name, step in self.steps
                                 if step is not None)
-        for pname, pval in six.iteritems(fit_params):
+        for pname, pval in fit_params.items():
             step, param = pname.split('__', 1)
             fit_params_steps[step][param] = pval
 
@@ -353,7 +352,7 @@ def set_params(self, **params):
         items = self.steps
         names, _ = zip(*items)
 
-        keys = list(six.iterkeys(params))
+        keys = list(params.keys())
 
         for name in keys:
             if '__' not in name and name in names:

diff --git a/seglearn/tests/test_base.py b/seglearn/tests/test_base.py
@@ -2,6 +2,7 @@
 # License: BSD
 
 import numpy as np
+import pandas as pd
 
 from seglearn.datasets import load_watch
 from seglearn.base import TS_Data
@@ -14,8 +15,7 @@ def test_ts_data():
     data = TS_Data(ts, c)
 
     assert np.array_equal(data.context_data, c)
-    assert np.array_equal(data.ts_data, ts)
-
+    assert np.all([np.array_equal(data.ts_data[i], ts[i]) for i in range(len(ts))])
     assert isinstance(data[1], TS_Data)
     assert np.array_equal(data[1].ts_data, ts[1])
     assert np.array_equal(data[1].context_data, c[1])
@@ -43,3 +43,15 @@ def test_watch():
     df = load_watch()
     data = TS_Data(df['X'], df['side'])
     assert isinstance(data, TS_Data)
+
+def test_pd():
+    ts = np.array([np.random.rand(100, 10), np.random.rand(200, 10), np.random.rand(20, 10)])
+    c = np.random.rand(3, 10)
+
+    df = pd.DataFrame(c)
+    df['ts_data'] = ts
+    data = TS_Data.from_df(df)
+
+    assert np.all([np.array_equal(data.ts_data[i], ts[i]) for i in range(len(ts))])
+    assert np.array_equal(data.context_data, c)
+