# Can Topological Divergences Help Predict the Largest Lyapunov Exponent?

## Overview

This notebook generates dynamical-system trajectory data and then analyses multiple features for supervised learning of the largest Lyapunov exponent (both classification and regression). Classical numerical methods, TDA-based methods, horizontal visibility graph methods, and our newly introduced topological divergences are compared:

- classic neighbour-tracing estimators from Rosenstein, Eckmann, and Kantz
- ordinal partition network embedded persistence measures from Myers
- $k$-nearest neighbour graph embedded persistence measures from Myers
- Betti vector norms on embedded trajectories from Güzel
- topological divergences (the main contribution)

Topological divergences are scalar- or vector-valued measures of the difference between the sublevel and superlevel filtrations of a scalar function.

In [1]:
# collect imports for cells below

import numpy as np
import numpy.ma as ma
import pandas as pd
from scipy import stats
from functools import partial
from TimeSeriesMergeTreeSimple import TimeSeriesMergeTree as TSMT
from ipyparallel import require
import ipyparallel as ipp
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from tree_offset_divergence import get_offset_divergences
from tree_offset_divergence import div_names as merge_tree_divergence_names
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.pipeline import Pipeline
from trajectories import generate_trajectories



In [2]:
# Provide clients to an ipyparallel cluster for faster parallel processing
# NOTE(review): presumably requires an ipyparallel cluster to be running already — confirm before "Run All"

clients = ipp.Client()
dv = clients.direct_view()  # not referenced by any other cell visible in this section
lbv = clients.load_balanced_view()  # handed to feature_scoring, which maps feature functions over trajectories

In [3]:
# Preprocessing function to remove scale bias in supervised learning

def scale(ts):
    """Min-max rescale the series ``ts`` so its values fall between 0 and 1.

    ``ts`` must support ``reshape`` (e.g. a NumPy array); a flat array of the
    rescaled values is returned.
    """
    # present the series as a single feature column for the scaler
    as_column = ts.reshape(-1, 1)
    rescaled = MinMaxScaler().fit_transform(as_column)
    return rescaled.flatten()


## Chaotic system data

In [35]:
# Trajectory generation settings used by the data-generation cell below

SEED = 54_321  # consistent random number generation
SAMPLES = 5_000  # number of trajectories
LENGTH = 2_000  # number of points per trajectory

In [36]:
# Generate the actual system data to analyse (cached on disk between runs)

import pickle
import os

filename_train_data = os.path.join("outputs/white_noise", "__".join(map(str, [SEED, LENGTH, SAMPLES])) + "__train_data.pkl")
if not os.path.exists(filename_train_data):
    # Generate BEFORE opening the cache file: opening it first would leave an
    # empty file behind if generation failed, and every later run would then
    # try to unpickle that empty file instead of regenerating the data.
    data_ = generate_trajectories(RANDOM_SEED=SEED, TS_LENGTH=LENGTH, CONTROL_PARAM_SAMPLES=SAMPLES)
    os.makedirs(os.path.dirname(filename_train_data), exist_ok=True)
    with open(filename_train_data, "wb") as file:
        pickle.dump(data_, file)

# NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook.
with open(filename_train_data, "rb") as file:
    system_training_data = pickle.load(file)


In [37]:
# Remove relative scale (amplitude) as a feature that could be used in supervised learning

for system, system_record in system_training_data.items():
    # rescale every trajectory of this system independently
    trajectories = [scale(trajectory) for trajectory in system_record["trajectories"]]
    system_record["trajectories"] = trajectories


## Supervised learning

In [38]:
# Define machine learning models to train on the Lyapunov estimates

def _tune_and_cross_validate(pipeline, param_grid, X, y, scoring, cv):
    """Grid-search ``pipeline`` over ``param_grid``, then cross-validate the winner.

    Returns a tuple ``(search, scores)``: the fitted ``GridSearchCV`` object
    (refit on all of ``X``/``y``) and the ``cross_val_score`` results of its
    ``best_estimator_`` under the same ``scoring`` and ``cv``.
    """
    search = GridSearchCV(pipeline, param_grid, scoring=scoring, n_jobs=-2, refit=True, cv=cv)
    search.fit(X, y)
    scores = cross_val_score(search.best_estimator_, X, y, scoring=scoring, cv=cv, n_jobs=-2)
    return search, scores


def score_features_train(feature_names, features, y_true, cv=5, n_repeats=5, ML_SEED=123):
    """Score various supervised ML models on supplied features given a ground truth.

    For classification, assumes ground truth y_true>0 is the positive class.

    Parameters
    ----------
    feature_names : sequence of str
        One name per feature; ``feature_names[i]`` labels ``features[:, i]``.
    features : array-like
        Either ``(n_samples, n_features)`` for scalar features or
        ``(n_samples, n_features, feature_vector_length)`` for vector features.
    y_true : array-like
        Ground-truth target, one value per sample (lists are accepted).
    cv, n_repeats : int
        Number of folds and repeats for the repeated K-fold splitters.
    ML_SEED : int
        Seed for the CV splitters and the seeded estimators.

    Yields
    ------
    dict
        One single-key dict per feature:
        ``{feature_name: {"scores": {...}, "models": {...}}}`` where "scores"
        holds the cross-validation score arrays and "models" the fitted
        ``GridSearchCV`` objects, both keyed by
        "SVC", "SVR", "MLPC", "MLPR", "KNC", "KNR".

    NOTE(review): the reported CV scores reuse the folds that selected the
    hyper-parameters, so they are presumably optimistic — confirm whether
    nested CV is wanted.
    """

    # assume vectorial features; if scalar, add an extra dimension
    features = np.array(features)
    if features.ndim == 2:
        features = features[..., np.newaxis]
    n_samples, n_features, _ = features.shape

    CLASSIFIER_CV = RepeatedStratifiedKFold(n_splits=cv, random_state=ML_SEED, n_repeats=n_repeats)
    REGRESSOR_CV = RepeatedKFold(n_splits=cv, random_state=ML_SEED*2, n_repeats=n_repeats)

    # convert once so plain Python lists work with the comparison below
    y = np.asarray(y_true)
    y_classes = y > 0

    classification_scorer = "f1"
    regression_scorer = "neg_mean_squared_error"

    # hyper-parameter grids shared by the classifier and regressor variants
    svm_C_values = [0.01, 0.1, 1, 10, 100]
    neighbour_counts = [5, 10, 15, 20, 25, 30]
    mlp_grid = {'mlp__alpha': [0.00001, 0.0001, 0.001, 0.01], 'mlp__hidden_layer_sizes': [(10,), (20,), (10,5,), (20,5)]}

    for i in range(n_features):
        feature_name = feature_names[i]
        X = features[:, i, :].reshape(n_samples, -1)

        # --- classifiers: predict whether the target is positive ---
        SVC_clf, SVC_scores = _tune_and_cross_validate(
            Pipeline([('scaler', StandardScaler()), ('svc', SVC(random_state=ML_SEED))]),
            {'svc__C': svm_C_values},
            X, y_classes, classification_scorer, CLASSIFIER_CV,
        )
        KNC_clf, KNC_scores = _tune_and_cross_validate(
            Pipeline([('scaler', StandardScaler()), ('knc', KNeighborsClassifier())]),
            {'knc__n_neighbors': neighbour_counts},
            X, y_classes, classification_scorer, CLASSIFIER_CV,
        )
        MLPC_clf, MLPC_scores = _tune_and_cross_validate(
            Pipeline([('scaler', StandardScaler()), ('mlp', MLPClassifier(learning_rate='adaptive', random_state=ML_SEED, max_iter=400))]),
            mlp_grid,
            X, y_classes, classification_scorer, CLASSIFIER_CV,
        )

        # --- regressors: predict the target value on all samples ---
        KNR_all_clf, KNR_all_scores = _tune_and_cross_validate(
            Pipeline([('scaler', StandardScaler()), ('knr', KNeighborsRegressor(weights='distance'))]),
            {'knr__n_neighbors': neighbour_counts},
            X, y, regression_scorer, REGRESSOR_CV,
        )
        SVR_all_clf, SVR_all_scores = _tune_and_cross_validate(
            Pipeline([('scaler', StandardScaler()), ('svr', SVR())]),
            {'svr__C': svm_C_values},
            X, y, regression_scorer, REGRESSOR_CV,
        )
        MLPR_all_clf, MLPR_all_scores = _tune_and_cross_validate(
            Pipeline([('scaler', StandardScaler()), ('mlp', MLPRegressor(learning_rate='adaptive', random_state=ML_SEED, max_iter=400))]),
            mlp_grid,
            X, y, regression_scorer, REGRESSOR_CV,
        )

        # Disabled experiment: "chaos-only" regressors (KNR_chaos, SVR_chaos,
        # MLPR_chaos) were the same three regressors tuned and scored on the
        # positive subset X[y_classes], y[y_classes], each tagged with
        # setattr(model, "chaos", True) so test-time code could tell them apart.

        yield {
            feature_name: {
                "scores": {
                    "SVC": SVC_scores,
                    "SVR": SVR_all_scores,
                    "MLPC": MLPC_scores,
                    "MLPR": MLPR_all_scores,
                    "KNC": KNC_scores,
                    "KNR": KNR_all_scores,
                },
                "models": {
                    "SVC": SVC_clf,
                    "SVR": SVR_all_clf,
                    "MLPC": MLPC_clf,
                    "MLPR": MLPR_all_clf,
                    "KNC": KNC_clf,
                    "KNR": KNR_all_clf,
                }
            }
        }


In [39]:
# Apply trained machine models to features from new unseen data

def score_features_test(feature_names, features, y_true, trained_models):
    """Predict using features as input to trained models and score against ground truth.

    For classification, assumes ground truth y_true>0 is the positive class.

    Parameters
    ----------
    feature_names : sequence of str
        One name per feature; ``feature_names[i]`` labels ``features[:, i]``.
    features : array-like
        Either ``(n_samples, n_features)`` for scalar features or
        ``(n_samples, n_features, feature_vector_length)`` for vector features.
    y_true : array-like
        Ground-truth target, one value per sample (lists are accepted).
    trained_models : dict
        ``{feature_name: {model_name: fitted_model}}``. Each model must offer
        ``predict(X)`` and ``score(X, y)``; models exposing a ``classes_``
        attribute are treated as classifiers and scored against ``y_true > 0``.

    Yields
    ------
    dict
        One single-key dict per feature:
        ``{feature_name: {"predictions": {...}, "r2_scores": {...}}}``, both
        inner dicts keyed by model name.

    NOTE(review): despite its name, "r2_scores" holds whatever
    ``trained_model.score`` returns; for GridSearchCV objects built with an
    explicit ``scoring`` this is presumably that metric rather than R² —
    confirm against the scikit-learn documentation.
    """

    # assume vectorial features; if scalar, add an extra dimension
    features = np.array(features)
    if features.ndim == 2:
        features = features[..., np.newaxis]
    n_samples, n_features, _ = features.shape

    def is_classifier(clf):
        return hasattr(clf, "classes_")

    # convert once so plain Python lists work with the comparison below
    y_true = np.asarray(y_true)
    pos_mask = y_true > 0

    for i in range(n_features):
        feature_name = feature_names[i]
        X = features[:, i, :].reshape(n_samples, -1)
        models_for_feature = trained_models[feature_name]

        # Disabled experiment: models tagged with a truthy "chaos" attribute
        # were predicted and scored on X[pos_mask] / y_true[pos_mask] only.

        yield {
            feature_name: {
                "predictions": {
                    model_name: trained_model.predict(X)
                    for model_name, trained_model in models_for_feature.items()
                },
                "r2_scores": {
                    model_name: trained_model.score(X, (pos_mask if is_classifier(trained_model) else y_true))
                    for model_name, trained_model in models_for_feature.items()
                },
            }
        }

        

#### Test Data

In [40]:
# Test-set generation settings, derived from the training settings
TEST_SEED = 2 * SEED  # twice the training seed
TEST_LENGTH = LENGTH  # same number of points per trajectory as the training data
TEST_SAMPLES = 1_001  # number of test trajectories


In [41]:
# Generate the test system data to analyse (cached on disk between runs)

import pickle
import os

filename_test_data = os.path.join("outputs/white_noise", "__".join(map(str, [SEED, LENGTH, SAMPLES, TEST_SEED, TEST_LENGTH, TEST_SAMPLES])) + "__test_data.pkl")
if not os.path.exists(filename_test_data):
    # Generate BEFORE opening the cache file: opening it first would leave an
    # empty file behind if generation failed, and every later run would then
    # try to unpickle that empty file instead of regenerating the data.
    data_ = generate_trajectories(RANDOM_SEED=TEST_SEED, TS_LENGTH=TEST_LENGTH, CONTROL_PARAM_SAMPLES=TEST_SAMPLES)
    os.makedirs(os.path.dirname(filename_test_data), exist_ok=True)
    with open(filename_test_data, "wb") as file:
        pickle.dump(data_, file)

# NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook.
with open(filename_test_data, "rb") as file:
    system_test_data = pickle.load(file)


In [42]:
# Remove relative scale (amplitude) as a feature that could be used in supervised learning

for system, system_record in system_test_data.items():
    # rescale every trajectory of this system independently
    trajectories = [scale(trajectory) for trajectory in system_record["trajectories"]]
    system_record["trajectories"] = trajectories


In [43]:
# define utility functions

def make_inf_column_finite(arr):
    """Replace the non-finite entries of a 2-D array so every value is finite.

    Each ``-inf``/``+inf`` entry is overwritten with the smallest finite value
    found in its column. NaN entries are then set to ``-1e-12``; this covers
    NaNs already present in ``arr`` as well as columns that have no finite
    minimum to substitute.

    The NaN replacement now runs unconditionally. Previously it sat inside the
    per-cell inf branch, so NaNs were only cleaned up when the array also
    happened to contain an infinity.

    ``arr`` is modified in place and also returned.
    """

    inf_mask = np.isinf(arr)

    if inf_mask.any():
        # column-wise minimum over the finite (unmasked) entries only
        col_mins = np.min(ma.masked_array(arr, mask=inf_mask), axis=0)
        # a column made up solely of infinities has a masked minimum; turn it
        # into NaN so the replacement below maps it to -1e-12
        col_fill = ma.filled(col_mins.astype(float), np.nan)
        # boolean-mask assignment and np.nonzero both visit cells in row-major
        # order, so each inf cell lines up with the minimum of its own column
        inf_cols = np.nonzero(inf_mask)[1]
        arr[inf_mask] = col_fill[inf_cols]

    arr[np.isnan(arr)] = -1e-12

    return arr

def get_scores_from_predictions(y_pred, y_true=None):
    """Score raw predictions against the ground truth.

    Returns a dict with the keys "F1" (F1 of ``y_pred > 0`` against
    ``y_true > 0``), "MSE" (the negated mean squared error), "Spearman" and
    "Pearson" (correlation coefficients between predictions and truth).
    """
    from sklearn.metrics import f1_score
    from sklearn.metrics import mean_squared_error

    truth = np.array(y_true)
    predicted = np.array(y_pred)
    is_positive = truth > 0

    neg_mse = -mean_squared_error(truth, predicted)
    f1 = f1_score(is_positive, predicted > 0)
    spearman = stats.spearmanr(predicted, truth)[0]
    pearson = stats.pearsonr(predicted, truth)[0]

    # Disabled: the same MSE / Spearman / Pearson metrics restricted to the
    # positive subset (truth[is_positive], predicted[is_positive]), formerly
    # reported as "mse_chaos", "spearmanr_chaos" and "pearsonr_chaos".

    return {
        "F1": f1,
        "MSE": neg_mse,
        "Spearman": spearman,
        "Pearson": pearson,
    }


## $\lambda_{\max}$ Estimator Pipeline

A generic pipeline to compute a set of features and apply them to predicting $\lambda_{\max}$.

In [44]:
# import the feature function and the list of names of features
from hvg_estimates import get_hvg_estimates, hvg_names
from tree_offset_divergence import get_offset_divergences_vec, div_names
from crocker_estimates import get_crocker_estimates, crocker_names

# set the system for analysis: the key used to select entries of
# system_training_data and system_test_data in the scoring cells below
SYSTEM = "logistic"

In [47]:
def feature_scoring(feature_func, feature_names, trajectories_train, trajectories_test, y_train, y_test, lbv, plot=True):
    """Compute features for train/test trajectories, fit models, and score them.

    Parameters
    ----------
    feature_func : callable
        Called once per trajectory through ``lbv.map_sync``.
    feature_names : sequence of str
        Names of the features produced by ``feature_func``.
    trajectories_train, trajectories_test : sequence
        Trajectories to compute features for; must support ``len`` and slicing.
    y_train, y_test : array-like
        Ground-truth targets for the training and test trajectories.
    lbv : object with ``map_sync(func, iterable)``
        Parallel view used to evaluate ``feature_func``.
    plot : bool
        Currently unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(training_scores_df, test_scores_df, test_correlations_df,
        raw_scores_df, test_predictions)``.
    """

    # features are computed in batches of this many trajectories per map_sync call
    batch_size = 1000

    def compute_features(trajectories):
        """Evaluate feature_func over trajectories, one batch per map_sync call."""
        collected = []
        for batch_start_idx in range(0, len(trajectories), batch_size):
            batch_end_idx = min(batch_start_idx + batch_size, len(trajectories))
            # map ONLY the current slice; mapping the whole sequence on every
            # iteration would duplicate features once there is more than one batch
            collected += list(lbv.map_sync(feature_func, trajectories[batch_start_idx:batch_end_idx]))
        return np.array(collected)

    # compute features for training and test data sets
    train_data_features = compute_features(trajectories_train)
    test_data_features = compute_features(trajectories_test)

    # train the models and gather the results
    training_results = {}
    for result in score_features_train(feature_names, train_data_features, y_train):
        training_results |= result

    # extract scores and trained models
    training_scores = {k:v["scores"] for k,v in training_results.items()}
    trained_models = {k:v["models"] for k,v in training_results.items()}

    # average the scores for each feature and model over all cross validation runs
    # (column-wise Series.map instead of the deprecated DataFrame.applymap)
    training_scores_df = pd.DataFrame(training_scores)
    training_scores_df = training_scores_df.apply(lambda column: column.map(lambda scores: np.mean(scores))).T

    # apply the trained models to new data and gather the results
    test_results = {}
    for result in score_features_test(feature_names, test_data_features, y_test, trained_models):
        test_results |= result

    # extract scores, predictions, and correlations on the test data
    test_scores = {k:v["r2_scores"] for k,v in test_results.items()}
    test_predictions = {k:v["predictions"] for k,v in test_results.items()}
    test_correlations = {
        k: {
            "SVR Spearman": stats.spearmanr(v["SVR"], y_test)[0],
            "SVR Pearson": stats.pearsonr(v["SVR"], y_test)[0],
            # "MLPR Spearman": stats.spearmanr(v["MLPR"], y_test)[0],
            # "MLPR Pearson": stats.pearsonr(v["MLPR"], y_test)[0],
        }
        for k,v in test_predictions.items()
    }

    # get scores for each feature and model as a dataframe
    test_scores_df = pd.DataFrame(test_scores).T
    test_correlations_df = pd.DataFrame(test_correlations).T

    # also get correlations and scoring metrics for the raw feature values
    # NOTE(review): this branch looks unreachable — a 1-D feature array would
    # already have failed the three-way shape unpacking in score_features_train.
    # `== 2` (scalar features, one column per feature) may have been intended;
    # confirm before enabling, since raw values may contain inf/NaN.
    if len(test_data_features.shape) == 1:
        raw_scores = map(partial(get_scores_from_predictions, y_true=y_test), test_data_features.T)
        raw_scores_df = pd.DataFrame(raw_scores)
        raw_scores_df.index = feature_names
    else:
        raw_scores_df = pd.DataFrame()

    return training_scores_df, test_scores_df, test_correlations_df, raw_scores_df, test_predictions

In [68]:
# Score the horizontal-visibility-graph estimates on the selected system
results = feature_scoring(
    feature_func=get_hvg_estimates,
    feature_names=hvg_names,
    trajectories_train=system_training_data[SYSTEM]["trajectories"],
    trajectories_test=system_test_data[SYSTEM]["trajectories"],
    y_train=system_training_data[SYSTEM]["lces"],
    y_test=system_test_data[SYSTEM]["lces"],
    lbv=lbv,
    plot=False,
)


In [46]:
# Score the CROCKER estimates on the selected system
# NOTE: the saved output of this cell records engines dying while running get_crocker_estimates
results = feature_scoring(
    feature_func=get_crocker_estimates,
    feature_names=crocker_names,
    trajectories_train=system_training_data[SYSTEM]["trajectories"],
    trajectories_test=system_test_data[SYSTEM]["trajectories"],
    y_train=system_training_data[SYSTEM]["lces"],
    y_test=system_test_data[SYSTEM]["lces"],
    lbv=lbv,
    plot=False,
)


CompositeError: one or more exceptions raised in: get_crocker_estimates
[Engine Exception]EngineError: Engine b'6bf98cd1-44db25901e72b7f254595aef' died while running task 'a68c70c0-ab84aa0e6dfd7b1764fd5488_3630286_4932'
[Engine Exception]EngineError: Engine b'7e8ec939-8dc618a065123a0c8046268b' died while running task 'a68c70c0-ab84aa0e6dfd7b1764fd5488_3630286_5240'
[Engine Exception]EngineError: Engine b'acdefa09-024e4a671011214140af1965' died while running task 'a68c70c0-ab84aa0e6dfd7b1764fd5488_3630286_6048'
[Engine Exception]EngineError: Engine b'7ff97a31-b0c70e5c50a3d22ac0889df0' died while running task 'a68c70c0-ab84aa0e6dfd7b1764fd5488_3630286_6052'
.... 24 more exceptions ...

In [76]:
# Bind the divergence settings so the resulting function needs only a trajectory
discrete = True
offsets = range(1, 252, 50)  # 1, 51, 101, 151, 201, 251
get_offset_divergences_vec_func = partial(
    get_offset_divergences_vec,
    offsets=offsets,
    discrete=discrete,
)


In [77]:

# Score the topological offset divergences on the selected system
results = feature_scoring(
    feature_func=get_offset_divergences_vec_func,
    feature_names=div_names,
    trajectories_train=system_training_data[SYSTEM]["trajectories"],
    trajectories_test=system_test_data[SYSTEM]["trajectories"],
    y_train=system_training_data[SYSTEM]["lces"],
    y_test=system_test_data[SYSTEM]["lces"],
    lbv=lbv,
    plot=False,
)




In [17]:
# third element of the tuple returned by feature_scoring: test_correlations_df
results[2]

NameError: name 'results' is not defined