Skip to content

Commit

Permalink
first working seq-seq prediction
Browse files Browse the repository at this point in the history
  • Loading branch information
dmbee committed May 29, 2019
1 parent 0a421fc commit dbc7ae0
Show file tree
Hide file tree
Showing 7 changed files with 166 additions and 81 deletions.
55 changes: 55 additions & 0 deletions examples/plot_predict_series.py
@@ -0,0 +1,55 @@
'''
===================
Predict Time Series
===================
This example demonstrates quasi sequence-to-sequence prediction with the
``Pype`` pipeline: the series is segmented, one target is taken per segment,
and the per-segment predictions are returned with their timestamps.
'''
# Author: David Burns
# License: BSD


import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

from seglearn.pipe import Pype
from seglearn.split import temporal_split
from seglearn.transform import FeatureRep, SegmentXY, last
from seglearn.base import TS_Data

# a single continuous time series has to be wrapped in a list
series = np.arange(10000) / 100.
X = [series]
y = [np.sin(series) * series * 3 + series * series]
t = [np.arange(len(y[0]))]

X = TS_Data(X, timestamps=t)

# with only one series, splitting along the time axis is the only option
X_train, X_test, y_train, y_test = temporal_split(X, y)

# SegmentXY segments both X and y (as the name implies)
# y_func=last picks the final value of each y segment as the target;
# transform.middle is another option, or supply your own callable
# (see the API documentation for further details)

clf = Pype([('seg', SegmentXY(width=200, overlap=0.5, y_func=last)),
            ('features', FeatureRep()),
            ('lin', LinearRegression())])

# fit the pipeline on the training portion
clf.fit(X_train, y_train)

# Pype.predict_series() provides timestamps in addition to the predictions themselves
t_pred, y_pred = clf.predict_series(X_test)

# plot the sequence prediction against train/test data
plt.plot(X_train.timestamps[0], y_train[0], '.', label="train")
plt.plot(X_test.timestamps[0], y_test[0], '.', label="test")
plt.plot(t_pred[0], y_pred[0], label="predict")
plt.xlabel("Time")
plt.ylabel("Target")
plt.legend()
plt.show()

15 changes: 5 additions & 10 deletions seglearn/base.py
Expand Up @@ -24,13 +24,12 @@ class TS_Data(object):
'''

def __init__(self, ts_data, context_data, timestamps=None, sernum=None):
def __init__(self, ts_data, context_data=None, timestamps=None):
N = len(ts_data)
self.ts_data = np.atleast_1d(ts_data)
self.context_data = np.atleast_1d(context_data) if context_data is not None else None
self.timestamps = np.atleast_1d(timestamps) if timestamps is not None \
else np.array([np.arange(len(ts_data[i])) for i in np.arange(N)])
self.sernum = np.atleast_1d(sernum) if sernum is not None else np.arange(N)
self.index = 0
self.N = N
self.shape = [N] # need for safe_indexing with sklearn
Expand All @@ -39,13 +38,10 @@ def __init__(self, ts_data, context_data, timestamps=None, sernum=None):
def from_df(cls, df):
ts_data = np.array(df['ts_data'])
timestamp = np.array(df['timestamps']) if 'timestamps' in df else None
sernum = np.array(df['sernum']) if 'sernum' in df else None
clabs = np.array(['ts_data', 'timestamps', 'sernum'])

context_data = df.drop(columns=clabs[np.isin(clabs, df.columns)])
context_data = df.drop(columns=['ts_data', 'timestamps'], errors='ignore')
context_data = np.array(context_data) if not context_data.empty else None

return cls(ts_data, context_data, timestamp, sernum)
return cls(ts_data, context_data, timestamp)

def concat(self):
pass
Expand All @@ -57,10 +53,9 @@ def __getitem__(self, indices):
ts_data = self.ts_data[indices]
context_data = self.context_data[indices] if self.context_data is not None else None
timestamps = self.timestamps[indices]
sernum = self.sernum[indices]

# return TS_Data(ts_data, context_data, timestamps, sernum)
return ts_data
return TS_Data(ts_data, context_data, timestamps)
# return ts_data

def __next__(self):
if self.index == self.N:
Expand Down
55 changes: 45 additions & 10 deletions seglearn/pipe.py
Expand Up @@ -5,11 +5,14 @@
# Author: David Burns
# License: BSD

import numpy as np
# from six import iteritems
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.externals import six


from .transform import XyTransformerMixin
from .base import TS_Data


class Pype(Pipeline):
Expand Down Expand Up @@ -104,7 +107,8 @@ def _fit(self, X, y=None, **fit_params):

fit_params_steps = dict((name, {}) for name, step in self.steps
if step is not None)
for pname, pval in six.iteritems(fit_params):

for pname, pval in fit_params.items():
step, param = pname.split('__', 1)
fit_params_steps[step][param] = pval

Expand All @@ -131,14 +135,17 @@ def _transform(self, X, y=None, sample_weight=None):
Xt = X
yt = y
swt = sample_weight
ts = None

for name, transformer in self.steps[:-1]: # iterate through all but last
if isinstance(transformer, XyTransformerMixin):
Xt, yt, swt = transformer.transform(Xt, yt, swt)
if isinstance(Xt, TS_Data):
ts = Xt.timestamps
else:
Xt = transformer.transform(Xt)

return Xt, yt, swt
return Xt, yt, ts, swt

def transform(self, X, y=None):
"""
Expand All @@ -161,7 +168,7 @@ def transform(self, X, y=None):
yt : array-like, shape = [n_samples]
Transformed target
"""
Xt, yt, _ = self._transform(X, y)
Xt, yt, _, _ = self._transform(X, y)

if isinstance(self._final_estimator, XyTransformerMixin):
Xt, yt, _ = self._final_estimator.transform(Xt, yt)
Expand Down Expand Up @@ -228,9 +235,37 @@ def predict(self, X):
yp : array-like
Predicted transformed target
"""
Xt, _, _ = self._transform(X)
Xt, _, _, _ = self._transform(X)
return self._final_estimator.predict(Xt)

def predict_series(self, X):
    """
    Apply transforms to the data and predict with the final estimator,
    returning the predictions together with their timestamps, grouped
    per input series and sorted in time order.

    Parameters
    ----------
    X : TS_Data or array-like
        Time series data (and optional contextual data). A plain
        array-like is wrapped in a TS_Data, which infers default
        integer timestamps.

    Returns
    -------
    t : list of arrays
        Timestamps for each predicted series, in time order.
    y : list of arrays
        Predictions for each series, aligned with ``t``.

    Raises
    ------
    ValueError
        If the transform steps do not yield 2d segment timestamps
        (series id, time), which predict_series requires.
    """
    import warnings

    Xt = X

    if not isinstance(Xt, TS_Data):
        # bug fix: the original built a Warning instance without raising
        # or emitting it, which is a silent no-op
        warnings.warn("Creating TS_Data object - inferring time stamps")
        Xt = TS_Data(Xt)

    Xt, _, ts, _ = self._transform(Xt)

    # ts is None when no transform step produced a TS_Data with
    # timestamps; guard before touching .ndim to avoid AttributeError.
    # Expected shape is (n_segments, 2): column 0 = series id,
    # column 1 = time, matching the indexing below.
    if ts is None or ts.ndim != 2:
        raise ValueError("timestamps not available for predict_series")

    yp = self._final_estimator.predict(Xt)

    y = []
    t = []

    # regroup segment predictions by originating series and restore
    # temporal order (segmenters may shuffle segments)
    for s in np.unique(ts[:, 0]):  # todo: remove shuffle option from segmenters
        idx = ts[:, 0] == s
        ti = ts[idx, 1]
        yi = yp[idx]
        isx = np.argsort(ti)
        y.append(yi[isx])
        t.append(ti[isx])

    return t, y


def transform_predict(self, X, y):
"""
Apply transforms to the data, and predict with the final estimator.
Expand All @@ -251,7 +286,7 @@ def transform_predict(self, X, y):
yp : array-like
Predicted transformed target
"""
Xt, yt, _ = self._transform(X, y)
Xt, yt, _, _ = self._transform(X, y)
yp = self._final_estimator.predict(Xt)
return yt, yp

Expand All @@ -276,7 +311,7 @@ def score(self, X, y=None, sample_weight=None):
score : float
"""

Xt, yt, swt = self._transform(X, y, sample_weight)
Xt, yt, _, swt = self._transform(X, y, sample_weight)

self.N_test = len(yt)

Expand Down Expand Up @@ -304,7 +339,7 @@ def predict_proba(self, X):
y_proba : array-like, shape = [n_samples, n_classes]
Predicted probability of each class
"""
Xt, _, _ = self._transform(X)
Xt, _, _, _ = self._transform(X)
return self._final_estimator.predict_proba(Xt)

def decision_function(self, X):
Expand All @@ -321,7 +356,7 @@ def decision_function(self, X):
-------
y_score : array-like, shape = [n_samples, n_classes]
"""
Xt, _, _ = self._transform(X)
Xt, _, _, _ = self._transform(X)
return self._final_estimator.decision_function(Xt)

def predict_log_proba(self, X):
Expand All @@ -338,7 +373,7 @@ def predict_log_proba(self, X):
-------
y_score : array-like, shape = [n_samples, n_classes]
"""
Xt, _, _ = self._transform(X)
Xt, _, _, _ = self._transform(X)
return self._final_estimator.predict_log_proba(Xt)

def set_params(self, **params):
Expand Down
18 changes: 12 additions & 6 deletions seglearn/split.py
Expand Up @@ -16,11 +16,11 @@
import numpy as np
from sklearn.model_selection._split import _build_repr

from .util import check_ts_data, get_ts_data_parts
from .util import check_ts_data, get_ts_data_parts, get_ts_parts
from .base import TS_Data


class TemporalKFold(object):
class TemporalKFold(object): # todo: fix for new TS_Data
'''
K-fold iterator variant for temporal splitting of time series data
Expand Down Expand Up @@ -158,7 +158,7 @@ def temporal_split(X, y, test_size=0.25):

Ns = len(y) # number of series
check_ts_data(X, y)
Xt, Xc = get_ts_data_parts(X)
Xt, Xc, ts = get_ts_parts(X)

train_size = 1. - test_size

Expand All @@ -168,9 +168,15 @@ def temporal_split(X, y, test_size=0.25):
X_train = [Xt[i][train_ind[i]] for i in range(Ns)]
X_test = [Xt[i][test_ind[i]] for i in range(Ns)]

if Xc is not None:
X_train = TS_Data(X_train, Xc)
X_test = TS_Data(X_test, Xc)
if ts is not None:
t_train = [ts[i][train_ind[i]] for i in range(Ns)]
t_test = [ts[i][test_ind[i]] for i in range(Ns)]
else:
t_train, t_test = None, None

if isinstance(X, TS_Data):
X_train = TS_Data(X_train, Xc, t_train)
X_test = TS_Data(X_test, Xc, t_test)

if len(np.atleast_1d(y[0])) == len(Xt[0]):
# y is a time series
Expand Down
61 changes: 31 additions & 30 deletions seglearn/tests/test_base.py
Expand Up @@ -8,36 +8,37 @@
from seglearn.base import TS_Data


# def test_ts_data():
# # time series data
# ts = np.array([np.random.rand(100, 10), np.random.rand(200, 10), np.random.rand(20, 10)])
# c = np.random.rand(3, 10)
# data = TS_Data(ts, c)
#
# assert np.array_equal(data.context_data, c)
# assert np.array_equal(data.ts_data, ts)
#
# assert isinstance(data[1], TS_Data)
# assert np.array_equal(data[1].ts_data, ts[1])
# assert np.array_equal(data[1].context_data, c[1])
#
# # segmented time series data
#
# sts = np.random.rand(100, 10, 6)
# c = np.random.rand(100, 6)
#
# data = TS_Data(sts, c)
# assert isinstance(data[4:10], TS_Data)
# assert np.array_equal(data[4:10].ts_data, sts[4:10])
# assert np.array_equal(data[4:10].context_data, c[4:10])
#
# sts = np.random.rand(100, 10)
# c = np.random.rand(100)
#
# data = TS_Data(sts, c)
# assert isinstance(data[4:10], TS_Data)
# assert np.array_equal(data[4:10].ts_data, sts[4:10])
# assert np.array_equal(data[4:10].context_data, c[4:10])
def test_ts_data():
    """TS_Data construction, attribute round-trip, indexing, and slicing."""
    # ragged time series data: must be a plain list — np.array() over
    # sequences of unequal length raises ValueError on NumPy >= 1.24
    ts = [np.random.rand(100, 10), np.random.rand(200, 10), np.random.rand(20, 10)]
    c = np.random.rand(3, 10)
    data = TS_Data(ts, c)

    assert np.array_equal(data.context_data, c)
    # element-wise comparison because the series have different lengths
    for i in range(len(ts)):
        assert np.array_equal(data.ts_data[i], ts[i])

    # integer indexing preserves the TS_Data type and pairs ts/context
    assert isinstance(data[1], TS_Data)
    assert np.array_equal(data[1].ts_data, ts[1])
    assert np.array_equal(data[1].context_data, c[1])

    # segmented (rectangular) time series data

    sts = np.random.rand(100, 10, 6)
    c = np.random.rand(100, 6)

    data = TS_Data(sts, c)
    assert isinstance(data[4:10], TS_Data)
    assert np.array_equal(data[4:10].ts_data, sts[4:10])
    assert np.array_equal(data[4:10].context_data, c[4:10])

    sts = np.random.rand(100, 10)
    c = np.random.rand(100)

    data = TS_Data(sts, c)
    assert isinstance(data[4:10], TS_Data)
    assert np.array_equal(data[4:10].ts_data, sts[4:10])
    assert np.array_equal(data[4:10].context_data, c[4:10])


def test_watch():
Expand Down

0 comments on commit dbc7ae0

Please sign in to comment.