In [1]:
import l2f_tda as tda
import numpy as np
import pandas as pd
import gudhi as gd
import sklearn as sk
import dask_ml as daml
from sklearn_tda.hera_wasserstein import wasserstein

import matplotlib.pyplot as plt
%matplotlib inline

import sklearn.utils as skutils
from sklearn.model_selection import TimeSeriesSplit
import sklearn.preprocessing as skprep
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from keras.models import Sequential
import keras.layers as klayers
import keras.optimizers as koptimizers
from keras.wrappers.scikit_learn import KerasRegressor

Using TensorFlow backend.


In [2]:
import os

# Expose no CUDA device ("-1") so the TensorFlow backend is kept off the GPU.
# NOTE(review): keras was already imported by the first cell; this presumably
# still takes effect because no session exists yet — confirm.
os.environ.update(CUDA_VISIBLE_DEVICES="-1")

In [3]:
import keras
import tensorflow as tf

# Session configuration: `device_count` is keyed by device type.
# GPU variant kept for reference:
# config = tf.ConfigProto(device_count={'GPU': 1})
config = tf.ConfigProto(device_count=dict(CPU=48))

# Build a session from that configuration and hand it to the Keras backend.
sess = tf.Session(config=config)
keras.backend.set_session(sess)

In [4]:
# Ordered (name, estimator) stages. Each name is the prefix used for that
# stage's `<name>__<parameter>` keys when a parameter grid is built.
pipeline_steps = [
    ('sampling', tda.Sampler(removeWeekends=False, samplingPeriod='1h')),
    ('embedding', tda.TakensEmbedder()),
    ('labelling', tda.LorenzLabeller()),
    ('diagram', tda.VietorisRipsDiagram()),
    ('centroids', tda.CentroidsDistance()),
    ('scaling', tda.ScalerWrapper(copy=True)),
    ('formulation', tda.FormulationTransformer()),
    ('regression', tda.KerasRegressorWrapper(numberFeatures=1)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [5]:
# Display the pipeline's parameters: the step list plus each step with its
# current settings, as shown in the recorded output.
pipeline.get_params()

{'memory': None,
 'steps': [('sampling',
   Sampler(removeWeekends=False, samplingPeriod='1h', samplingType='periodic')),
  ('embedding', TakensEmbedder(innerWindowDuration=5, innerWindowStride=1,
           outerWindowDuration=20, outerWindowStride=2)),
  ('labelling', LorenzLabeller()),
  ('diagram', VietorisRipsDiagram(dataType='points', homologyDimensions=[0, 1],
             maxEdgeLength=inf)),
  ('centroids', CentroidsDistance(distance_kwargs={})),
  ('scaling', ScalerWrapper(copy=True, feature_range=(0, 1))),
  ('formulation',
   FormulationTransformer(numberStepsInPast=10, stepInFuture=1)),
  ('regression',
   <l2f_tda.KerasWrapper.KerasRegressorWrapper at 0x7f79484765f8>)],
 'sampling': Sampler(removeWeekends=False, samplingPeriod='1h', samplingType='periodic'),
 'embedding': TakensEmbedder(innerWindowDuration=5, innerWindowStride=1,
         outerWindowDuration=20, outerWindowStride=2),
 'labelling': LorenzLabeller(),
 'diagram': VietorisRipsDiagram(dataType='points', homolo

In [6]:
def _with_step_prefix(step_name, step_params):
    """Return a new dict whose keys are `<step_name>__<key>` for each key of `step_params`."""
    return {step_name + '__' + name: values for name, values in step_params.items()}


# Sampling: no parameter is searched for this step.
sampling_param = {}
sampling_param_grid = _with_step_prefix('sampling', sampling_param)

# Embedding: candidate window durations and strides.
embedding_param = {
    'outerWindowDuration': [100, 200],
    'outerWindowStride': [20, 30],
    'innerWindowDuration': [20, 30],
    'innerWindowStride': [1, 2],
}
embedding_param_grid = _with_step_prefix('embedding', embedding_param)

# Diagram: candidate lists of homology dimensions.
diagram_param = {'homologyDimensions': [[0], [0, 1], [1]]}
diagram_param_grid = _with_step_prefix('diagram', diagram_param)

# Centroids: one `distance_kwargs` candidate per metric callable.
centroids_param = {
    'distance_kwargs': [{'metric': metric}
                        for metric in (gd.bottleneck_distance, wasserstein)],
}
centroids_param_grid = _with_step_prefix('centroids', centroids_param)

# Scaling: no parameter is searched for this step.
scaling_param = {}
scaling_param_grid = _with_step_prefix('scaling', scaling_param)

# Regression: single-layer LSTM candidates plus fixed training settings.
regression_param = {
    'modelSteps': [
        [{'layerClass': klayers.LSTM, 'units': units, 'activation': 'tanh'}]
        for units in (2, 4)
    ],
    'optimizer_kwargs': [{'lr': lr} for lr in (0.01,)],
    'batch_size': [100],
    'epochs': [3000],
}
regression_param_grid = _with_step_prefix('regression', regression_param)

# Merge the per-step grids, in pipeline order, into the grid given to the search.
param_grid = {}
for step_grid in (sampling_param_grid, embedding_param_grid, diagram_param_grid,
                  centroids_param_grid, scaling_param_grid, regression_param_grid):
    param_grid.update(step_grid)

In [7]:
import pickle as pkl

# Number of leading rows used for training; the remaining rows form the test set.
numberTrain = 6000

# Load the series and its regime columns. The `with` blocks close each file
# deterministically instead of leaving the handle to the garbage collector.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
with open('example_time_series.pkl', 'rb') as series_file:
    data = pkl.load(series_file)
with open('example_regime.pkl', 'rb') as regime_file:
    regime = pkl.load(regime_file)
print(data.shape)

# Put the series and regime columns side by side, then relabel the columns 0..2.
X_train = pd.concat([data[:numberTrain], regime[:numberTrain]], axis=1)
X_train.columns = range(3)

# Placeholder target with one row per sample. Zeros instead of `np.empty` so the
# array never holds uninitialised (possibly NaN/inf, run-dependent) memory.
# NOTE(review): presumably the real targets are derived inside the pipeline — confirm.
y_train = np.zeros((X_train.shape[0], 1))

# Show a short preview rather than the whole frame.
X_train.head(10)

(50000, 1)


Unnamed: 0,0,1,2
1970-01-01 00:00:00+00:00,10.000000,0,0
1970-01-01 01:00:00+00:00,9.633333,0,0
1970-01-01 02:00:00+00:00,9.386394,0,0
1970-01-01 03:00:00+00:00,9.242918,0,0
1970-01-01 04:00:00+00:00,9.186662,0,0
1970-01-01 05:00:00+00:00,9.202042,0,0
1970-01-01 06:00:00+00:00,9.274441,0,0
1970-01-01 07:00:00+00:00,9.390337,0,0
1970-01-01 08:00:00+00:00,9.537337,0,0
1970-01-01 09:00:00+00:00,9.704184,0,0


In [8]:
# Test set: every row after the first `numberTrain`, columns relabelled 0..2.
X_test = pd.concat([data[numberTrain:], regime[numberTrain:]], axis=1)
X_test.columns = range(3)

# Placeholder target with one row per sample. Zeros instead of `np.empty` so the
# array never holds uninitialised (possibly NaN/inf, run-dependent) memory.
y_test = np.zeros((X_test.shape[0], 1))

# Show a short preview rather than the whole frame.
X_test.head(10)

Unnamed: 0,0,1,2
1970-09-08 00:00:00+00:00,3.758295,0,0
1970-09-08 01:00:00+00:00,3.759955,0,0
1970-09-08 02:00:00+00:00,3.761996,0,0
1970-09-08 03:00:00+00:00,3.764409,0,0
1970-09-08 04:00:00+00:00,3.767187,0,0
1970-09-08 05:00:00+00:00,3.770319,0,0
1970-09-08 06:00:00+00:00,3.773795,0,0
1970-09-08 07:00:00+00:00,3.777607,0,0
1970-09-08 08:00:00+00:00,3.781741,0,0
1970-09-08 09:00:00+00:00,3.786188,0,0


In [9]:
# NOTE: both imports bind the same name. The sklearn import runs last, so
# `GridSearchCV` refers to scikit-learn's class from here on and the dask_ml
# one is shadowed. Swap the order (or drop one line) to switch implementation.
from dask_ml.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Disabled parallel-backend experiments, kept for reference:
# from dask.distributed import Client
# client = Client()
# skutils.parallel_backend(backend='multiprocessing')

In [10]:
# Fit the pipeline once, with the parameters it was constructed with.
# NOTE(review): the output recorded for this cell ends in a ValueError (LSTM input
# expected shape (10, 4) but received (10, 2995)); the cell does not survive
# Restart & Run All until that shape mismatch is resolved.
pipeline.fit(X_train, y_train)

labels: [[0. 0.]
 [0. 1.]
 [1. 0.]
 [1. 1.]]
numberCentroids:  4
(4, 2995)
numberFeatures:  4


ValueError: Error when checking input: expected lstm_1_input to have shape (10, 4) but got array with shape (10, 2995)

In [None]:
%%time

# Time-series cross-validation with two splits.
cv = TimeSeriesSplit(n_splits=2)

# Search settings gathered in one place; `error_score='raise'` surfaces fit errors.
# Open question kept from the original author: iid=False ???
search_options = dict(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    n_jobs=16,
    verbose=100,
    error_score='raise',
)
grid = GridSearchCV(**search_options)
grid_result = grid.fit(X_train, y_train)

In [None]:
# Report the best score with its parameter combination, then pull out the
# per-candidate statistics for further inspection.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
# Per-candidate listing (disabled):
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# The result of predict on the training data is unpacked into two values;
# judging by the names, the predictions and the matching true values — confirm.
y_predict_train, y_true_train = grid.predict(X_train)

In [None]:
# Overlay predictions (x markers) on the true values, both against sample index.
# Title, axis label and legend let the figure stand alone; the trailing `;`
# suppresses the repr of the last call in the cell output.
plt.plot(y_predict_train, marker='x', label='predicted')
plt.plot(y_true_train, label='true')
plt.title('Training set: predicted vs. true')
plt.xlabel('sample index')
plt.legend();

In [None]:
# The result of predict on the held-out data is unpacked into two values;
# judging by the names, the predictions and the matching true values — confirm.
y_predict_test, y_true_test = grid.predict(X_test)

In [None]:
# Overlay predictions (x markers) on the true values, both against sample index.
# Title, axis label and legend let the figure stand alone; the trailing `;`
# suppresses the repr of the last call in the cell output.
plt.plot(y_predict_test, marker='x', label='predicted')
plt.plot(y_true_test, label='true')
plt.title('Test set: predicted vs. true')
plt.xlabel('sample index')
plt.legend();