# Analyse topological divergences and the baselines for Lyapunov estimation

In [1]:
import pandas as pd
import os
import pickle

RESULTS_DIR = "outputs/data/divergence_results"

# Only files whose names end in one of these suffixes are loaded.
RESULT_SUFFIXES = ("_500.pkl", "_750.pkl", "_1000.pkl")

# os.listdir returns entries in arbitrary order; sorting makes the order in
# which files are loaded (and hence the row order of the merged table and of
# every preview of it) reproducible across machines and re-runs.
pkl_files = sorted(
    name for name in os.listdir(RESULTS_DIR) if name.endswith(RESULT_SUFFIXES)
)

# One DataFrame per results file; they are stacked together after the loop.
data_frames = []

for pkl_file in pkl_files:
    # NOTE(review): pickle.load can execute arbitrary code while loading, so
    # RESULTS_DIR should only ever contain files produced by a trusted run.
    with open(f"{RESULTS_DIR}/{pkl_file}", "rb") as file:
        data_dict = pickle.load(file)

    # One row per dictionary entry: the key lands in "Keys", the value in "Values"
    data_frame = pd.DataFrame(list(data_dict.items()), columns=["Keys", "Values"])

    # Take the tuple keys out of the frame and expand them into three columns
    tuple_keys = data_frame.pop("Keys")
    data_frame[["Measure", "System", "Length"]] = pd.DataFrame(
        tuple_keys.tolist(), index=data_frame.index
    )

    data_frames.append(data_frame)

# Stack the per-file frames (each frame keeps its own row labels)
merged_data = pd.concat(data_frames)

# Expand the per-row "Values" dictionaries into one column per key, and swap
# those columns in for the original "Values" column
value_columns = merged_data["Values"].apply(pd.Series)
merged_data = pd.concat(
    [merged_data.drop(columns=["Values"]), value_columns],
    axis=1,
)

# Each correlation column packs a coefficient and a p-value together;
# give each part its own named column
correlation_columns = {
    "spearmanr": ["Spearman Rho (All)", "Spearman p (All)"],
    "pos_spearmanr": ["Spearman Rho (Chaos)", "Spearman p (Chaos)"],
}
for packed_column, unpacked_names in correlation_columns.items():
    merged_data[unpacked_names] = merged_data[packed_column].apply(pd.Series)

# The packed correlation columns have been split, so discard them
merged_data = merged_data.drop(columns=list(correlation_columns))

# Measures that are excluded from the rest of the analysis
ignore_measures = (
    "interleaving_length_0.5_None divergence",
    "interleaving_length_0.4_None divergence",
    "interleaving_length_0.3_None divergence",
    "interleaving_length_0.2_None divergence",
    "interleaving_length_0.1_None divergence",
    "cophenetic_length",
    "cophenetic_reverse_length",
)

# Keep only the rows whose measure is not on the ignore list
is_ignored = merged_data["Measure"].isin(ignore_measures)
merged_data = merged_data.loc[~is_ignored]

# Map every spelling variant of a system name onto a single canonical label
system_name_fixes = {
    "Hénon": "Henon",
    "henon": "Henon",
    "ikeda": "Ikeda",
    "logistic": "Logistic",
    "tinkerbell": "Tinkerbell",
}
merged_data["System"] = merged_data["System"].replace(system_name_fixes)

# Human-readable labels for the metric columns.
# NOTE(review): the source keys say "neg_mean_absolute" while the labels say
# "MSE", and the "_poly" keys are labelled "kNN" — confirm which metric and
# which model these columns actually hold.
readable_column_names = {
    "classification_f1": "Random Forest F1 Score",
    "regression_neg_mean_absolute": "Random Forest MSE (All)",
    "pos_regression_neg_mean_absolute": "Random Forest MSE (Chaos)",
    "regression_neg_mean_absolute_poly": "kNN MSE (All)",
    "pos_regression_neg_mean_absolute_poly": "kNN MSE (Chaos)",
}
merged_data = merged_data.rename(columns=readable_column_names)


In [2]:
# Preview the first rows of merged_data after loading and renaming
merged_data.head()

Unnamed: 0,Measure,System,Length,Random Forest F1 Score,Random Forest MSE (All),Random Forest MSE (Chaos),kNN MSE (All),kNN MSE (Chaos),Spearman Rho (All),Spearman p (All),Spearman Rho (Chaos),Spearman p (Chaos)
0,cophenetic divergence,Henon,500,0.770146,-0.088087,-0.065053,-0.085484,-0.059962,0.736482,1.396494e-86,0.746799,3.699274e-43
1,cophenetic_reverse divergence,Henon,500,0.78474,-0.118038,-0.093484,-0.093558,-0.0679,0.798787,5.628399e-112,0.831906,1.592406e-61
2,interleaving_0.5_None divergence,Henon,500,0.77945,-0.094765,-0.092251,-0.085155,-0.076143,0.795286,2.5524839999999998e-110,0.606278,5.620223e-25
4,interleaving_edge_0.5_None divergence,Henon,500,0.782874,-0.102633,-0.085643,-0.090005,-0.078395,0.790853,2.875452e-108,0.699864,6.632495e-36
5,interleaving_0.4_None divergence,Henon,500,0.790967,-0.119352,-0.084975,-0.086889,-0.076964,0.769395,5.357184e-99,0.585371,5.2421130000000003e-23


In [3]:
# Single table describing every measure. Each row is
#   (raw measure key, long display name, short display name, measure type)
# and the three lookup dictionaries below are derived from it, so their keys
# and ordering always agree.
_MEASURE_TABLE = [
    ('cophenetic divergence',
     'DMT Cophenetic Divergence',
     'DMT Cophenetic',
     'DMT Cophenetic'),
    ('cophenetic_reverse divergence',
     'DMT Cophenetic (Reverse) Divergence',
     'DMT Cophenetic (Reverse)',
     'DMT Cophenetic'),
    ('interleaving_0.5_None divergence',
     'PLMT Interleaving (0.5) Divergence',
     'PLMT Interleaving (0.5)',
     'PLMT Interleaving'),
    ('interleaving_0.4_None divergence',
     'PLMT Interleaving (0.4) Divergence',
     'PLMT Interleaving (0.4)',
     'PLMT Interleaving'),
    ('interleaving_0.3_None divergence',
     'PLMT Interleaving (0.3) Divergence',
     'PLMT Interleaving (0.3)',
     'PLMT Interleaving'),
    ('interleaving_0.2_None divergence',
     'PLMT Interleaving (0.2) Divergence',
     'PLMT Interleaving (0.2)',
     'PLMT Interleaving'),
    ('interleaving_0.1_None divergence',
     'PLMT Interleaving (0.1) Divergence',
     'PLMT Interleaving (0.1)',
     'PLMT Interleaving'),
    ('interleaving_edge_0.5_None divergence',
     'PLMT Interleaving (0.5) Divergence (Edge-Normalised)',
     'PLMT Interleaving (0.5) / Edges',
     'PLMT Interleaving (Normalised)'),
    ('interleaving_edge_0.4_None divergence',
     'PLMT Interleaving (0.4) Divergence (Edge-Normalised)',
     'PLMT Interleaving (0.4) / Edges',
     'PLMT Interleaving (Normalised)'),
    ('interleaving_edge_0.3_None divergence',
     'PLMT Interleaving (0.3) Divergence (Edge-Normalised)',
     'PLMT Interleaving (0.3) / Edges',
     'PLMT Interleaving (Normalised)'),
    ('interleaving_edge_0.2_None divergence',
     'PLMT Interleaving (0.2) Divergence (Edge-Normalised)',
     'PLMT Interleaving (0.2) / Edges',
     'PLMT Interleaving (Normalised)'),
    ('interleaving_edge_0.1_None divergence',
     'PLMT Interleaving (0.1) Divergence (Edge-Normalised)',
     'PLMT Interleaving (0.1) / Edges',
     'PLMT Interleaving (Normalised)'),
    ('bottleneck divergence',
     'Persistence Bottleneck Divergence',
     'PD Bottleneck',
     'TDA Divergence'),
    ('wasserstein divergence',
     'Persistence Wasserstein Divergence',
     'PD Wasserstein',
     'TDA Divergence'),
    ('$k$-NN graph $M(D)$',
     'kNN Graph PD Homology Class Ratio - M(D)',
     'kNN Homology Class Ratio',
     'TDA Embedding'),
    ('$k$-NN graph $P(D)$',
     'kNN Graph PD Maximum Persistence - P(D)',
     'kNN Maximum Persistence',
     'TDA Embedding'),
    ('$k$-NN graph $E\'(D)$',
     'kNN Graph PD Normalised Persistent Entropy - E\'(D)',
     'kNN Persistent Entropy',
     'TDA Embedding'),
    ('Ordinal graph $M(D)$',
     'Ordinal Partition Network Graph PD Homology Class Ratio - M(D)',
     'OPN Homology Class Ratio',
     'TDA Embedding'),
    ('Ordinal graph $P(D)$',
     'Ordinal Partition Network Graph PD Maximum Persistence - P(D)',
     'OPN Maximum Persistence',
     'TDA Embedding'),
    ('Ordinal graph $E\'(D)$',
     'Ordinal Partition Network Graph PD Normalised Persistent Entropy - E\'(D)',
     'OPN Persistent Entropy',
     'TDA Embedding'),
    ('Rosenstein',
     'Rosenstein Scaling',
     'Rosenstein',
     'Classical Embedding'),
    ('Eckmann',
     'Eckmann Scaling',
     'Eckmann',
     'Classical Embedding'),
    ('Kantz',
     'Kantz Scaling',
     'Kantz',
     'Classical Embedding'),
    ('cophenetic',
     'PLMT Cophenetic Divergence',
     'PLMT Cophenetic',
     'PLMT Cophenetic'),
    ('cophenetic_reverse',
     'PLMT Cophenetic (Reverse) Divergence',
     'PLMT Cophenetic (Reverse)',
     'PLMT Cophenetic'),
    ('cophenetic_edge',
     'PLMT Cophenetic Divergence (Edge-Normalised)',
     'PLMT Cophenetic / Edges',
     'PLMT Cophenetic'),
    ('cophenetic_reverse_edge',
     'PLMT Cophenetic (Reverse) Divergence (Edge-Normalised)',
     'PLMT Cophenetic (Reverse) / Edges',
     'PLMT Cophenetic'),
    ('$\\Delta$VGA ($L_1$)',
     'HVG Top-Bottom Degree Distribution L1 Distance',
     'HVG L1',
     'HVG Graph Distance'),
    ('$\\Delta$VGA ($W_1$)',
     'HVG Top-Bottom Degree Distribution Wasserstein Distance',
     'HVG W1',
     'HVG Graph Distance'),
    ('Betti Vector L1 Norm',
     'Persistence Diagram Betti Vector L1 Norm',
     'Betti L1',
     'TDA Direct'),
]

# raw measure key -> long display name
measure_long_name_map = {
    raw_key: long_name for raw_key, long_name, _, _ in _MEASURE_TABLE
}

# raw measure key -> short display name
measure_short_name_map = {
    raw_key: short_name for raw_key, _, short_name, _ in _MEASURE_TABLE
}

# raw measure key -> measure type (family used to group measures)
measure_type_map = {
    raw_key: type_name for raw_key, _, _, type_name in _MEASURE_TABLE
}

In [4]:
# Add a "Measure Type" column that groups related measures into families.
#
# The lookup accepts both the raw measure keys and their short display names,
# so this cell still works when it is re-run after the 'Measure' column has
# been overwritten with the short names (previously that raised a KeyError).
_measure_type_lookup = dict(measure_type_map)
for _raw_name, _short_name in measure_short_name_map.items():
    if _raw_name in measure_type_map:
        # setdefault: never override an entry that is already a raw key
        _measure_type_lookup.setdefault(_short_name, measure_type_map[_raw_name])


def get_type(row):
    """Return the measure type for one row, looked up by its 'Measure' value.

    Raises KeyError if the measure is not known.
    """
    return _measure_type_lookup[row['Measure']]


# Vectorised lookup instead of a Python-level call per row
_measure_types = merged_data['Measure'].map(_measure_type_lookup)

# Series.map yields NaN for unmapped values; fail loudly, as the row-wise
# dictionary lookup used to, rather than carrying NaN types downstream
_unknown_measures = merged_data.loc[_measure_types.isna(), 'Measure'].unique()
if len(_unknown_measures):
    raise KeyError(f"No measure type defined for: {list(_unknown_measures)}")

merged_data['Measure Type'] = _measure_types

In [5]:
# Swap the raw measure identifiers for their short display names in one pass
merged_data["Measure"] = merged_data["Measure"].replace(measure_short_name_map)


In [6]:
# Preview merged_data again, now with display names and the "Measure Type" column
merged_data.head()

Unnamed: 0,Measure,System,Length,Random Forest F1 Score,Random Forest MSE (All),Random Forest MSE (Chaos),kNN MSE (All),kNN MSE (Chaos),Spearman Rho (All),Spearman p (All),Spearman Rho (Chaos),Spearman p (Chaos),Measure Type
0,DMT Cophenetic,Henon,500,0.770146,-0.088087,-0.065053,-0.085484,-0.059962,0.736482,1.396494e-86,0.746799,3.699274e-43,DMT Cophenetic
1,DMT Cophenetic (Reverse),Henon,500,0.78474,-0.118038,-0.093484,-0.093558,-0.0679,0.798787,5.628399e-112,0.831906,1.592406e-61,DMT Cophenetic
2,PLMT Interleaving (0.5),Henon,500,0.77945,-0.094765,-0.092251,-0.085155,-0.076143,0.795286,2.5524839999999998e-110,0.606278,5.620223e-25,PLMT Interleaving
4,PLMT Interleaving (0.5) / Edges,Henon,500,0.782874,-0.102633,-0.085643,-0.090005,-0.078395,0.790853,2.875452e-108,0.699864,6.632495e-36,PLMT Interleaving (Normalised)
5,PLMT Interleaving (0.4),Henon,500,0.790967,-0.119352,-0.084975,-0.086889,-0.076964,0.769395,5.357184e-99,0.585371,5.2421130000000003e-23,PLMT Interleaving


## Regression Performance: Predicting Largest Lyapunov Exponent $\lambda_{\max}$ in the Chaotic Regime

This table reports results of $k$-nearest neighbour regression using $k=10$ neighbours and uniform distance weighting of each neighbour. Values reported in the four system columns are means of (negative) mean squared error (MSE) values over 10 train-test splits of chaotic trajectories of length 500, with the corresponding largest Lyapunov exponent (LLE) as ground truth, from the given systems. Control parameter values generating each system trajectory are sampled uniformly from pre-chosen ranges to ensure inclusion of samples close to bifurcation points. Ground truth LLEs were calculated using Benettin's Jacobian-matrix-based algorithm, on which basis trajectories were then filtered, with those satisfying $\lambda_{\max}>0$ retained. This provided a wide range of representative positive $\lambda_{\max}$ values for each of the systems.

The _Overall Rank_ column gives the overall ranked performance of each measure across all the systems considered.

The best performing feature for prediction of $\lambda_{\max}$ for short-length time series is the _discrete merge tree cophenetic divergence_. It outperforms the well-established and widely used Kantz algorithm, which ranks second, and its rank is far more consistent across the four systems (standard deviation of rank 1.73, versus 7.85 for Kantz).

In [8]:
# Rank every measure by its kNN regression score on chaotic trajectories of length 500
df = merged_data
filtered_df = df.loc[df['Length'] == 500]

# One row per (measure type, measure) and one column per system
knn_mse_pivot_table = pd.pivot_table(
    filtered_df,
    values='kNN MSE (Chaos)',
    index=['Measure Type', 'Measure'],
    columns=['System'],
    aggfunc=lambda x: x,  # pass-through; presumably one value per cell — TODO confirm
)

# Per-system ranks. The scores are negated errors, so the largest value ranks first.
system_columns = list(knn_mse_pivot_table.columns)
rank_columns = [f"{system} Rank" for system in system_columns]
for system, rank_col_name in zip(system_columns, rank_columns):
    system_rank = knn_mse_pivot_table[system].rank(method='min', ascending=False)
    knn_mse_pivot_table[rank_col_name] = system_rank.astype(int)

# Summarise the per-system ranks, then order the measures by mean rank (1 = best)
per_system_ranks = knn_mse_pivot_table[rank_columns]
knn_mse_pivot_table['Mean Rank'] = per_system_ranks.mean(axis=1)
knn_mse_pivot_table['STD Rank'] = per_system_ranks.std(axis=1)
mean_rank = knn_mse_pivot_table['Mean Rank']
knn_mse_pivot_table['Overall Rank'] = mean_rank.rank(method='min', ascending=True).astype(int)

# Show floats with three decimals (this is a global pandas display setting)
pd.options.display.float_format = '{:,.3f}'.format
display(knn_mse_pivot_table)

Unnamed: 0_level_0,System,Henon,Ikeda,Logistic,Tinkerbell,Henon Rank,Ikeda Rank,Logistic Rank,Tinkerbell Rank,Mean Rank,STD Rank,Overall Rank
Measure Type,Measure,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Classical Embedding,Eckmann,-0.165,-0.072,-0.227,-0.033,30,7,30,22,22.25,10.844,26
Classical Embedding,Kantz,-0.023,-0.095,-0.06,-0.016,1,18,5,2,6.5,7.853,2
Classical Embedding,Rosenstein,-0.097,-0.12,-0.159,-0.016,24,27,28,1,20.0,12.78,24
DMT Cophenetic,DMT Cophenetic,-0.06,-0.069,-0.064,-0.02,7,3,6,6,5.5,1.732,1
DMT Cophenetic,DMT Cophenetic (Reverse),-0.068,-0.076,-0.044,-0.023,11,11,2,10,8.5,4.359,4
HVG Graph Distance,HVG L1,-0.115,-0.108,-0.091,-0.04,28,22,13,26,22.25,6.652,26
HVG Graph Distance,HVG W1,-0.088,-0.123,-0.09,-0.055,22,28,12,30,23.0,8.083,28
PLMT Cophenetic,PLMT Cophenetic,-0.047,-0.135,-0.152,-0.022,5,30,26,9,17.5,12.342,19
PLMT Cophenetic,PLMT Cophenetic (Reverse),-0.043,-0.129,-0.149,-0.021,4,29,25,8,16.5,12.342,17
PLMT Cophenetic,PLMT Cophenetic (Reverse) / Edges,-0.073,-0.108,-0.139,-0.019,13,23,21,5,15.5,8.226,15


In [215]:
# Emit the kNN ranking table as LaTeX source, with floats to four decimals.
# NOTE(review): the saved output of this cell has a 'Place' column and an
# out-of-sequence execution count (215), whereas the cell that builds
# knn_mse_pivot_table names that column 'Overall Rank' — the saved output
# looks stale; re-run the notebook top to bottom before using it.
print(knn_mse_pivot_table.to_latex(float_format="%.4f"))

\begin{tabular}{llrrrrrrrrrrr}
\toprule
              & System &   Henon &   Ikeda &  Logistic &  Tinkerbell &  Henon Rank &  Ikeda Rank &  Logistic Rank &  Tinkerbell Rank &  Mean Rank &  STD Rank &  Place \\
Measure Type & Measure &         &         &           &             &             &             &                &                  &            &           &        \\
\midrule
Classical Embedding & Eckmann & -0.1654 & -0.0721 &   -0.2270 &     -0.0331 &          29 &           6 &             29 &               22 &    21.5000 &   10.8474 &     25 \\
              & Kantz & -0.0231 & -0.0950 &   -0.0604 &     -0.0159 &           1 &          17 &              5 &                2 &     6.2500 &    7.3655 &      2 \\
              & Rosenstein & -0.0967 & -0.1198 &   -0.1589 &     -0.0157 &          23 &          26 &             27 &                1 &    19.2500 &   12.2848 &     22 \\
DMT Cophenetic & DMT Cophenetic & -0.0600 & -0.0691 &   -0.0636 &     -0.0197 &           7

  print(knn_mse_pivot_table.to_latex(float_format="%.4f"))


## Statistical Dependence Performance

### Spearman's $\rho$ in the Chaotic Regime

Correlation results for the studied measures appear in the table. Values reported in the system columns are Spearman's $\rho$ coefficient, measuring the rank correlation, or strength of monotonic (but possibly non-linear) dependence, between the studied measures and the actual system $\lambda_{\max}$ value. Corresponding $p$-values are generally of the order of $10^{-20}$ or less whenever $|\rho|>0.5$. Trajectory length is 500.

The most highly correlated measures with $\lambda_{\max}$, for the short-length time series considered, are the _discrete merge tree cophenetic divergence_ and its reversed variant (in which the order of leaves in the superlevel tree is reversed for the divergence calculation). These are both discrete merge tree topological divergences. The next most highly correlated group of measures is the piecewise linear merge tree interleaving divergences. Although these show less monotonic dependence on $\lambda_{\max}$ than the discrete merge tree measures do, they are more strongly correlated with $\lambda_{\max}$ than all other existing measures.

### Correlation across all dynamic regimes

It should be noted that the classical embedding-based estimators of Eckmann, Kantz, and Rosenstein are less highly correlated with $\lambda_{\max}$ in the chaotic regime, for the time series considered, than topological divergences are. However, the former estimators are more highly correlated with $\lambda_{\max}$ when non-chaotic sequences are included. Topological divergences are therefore not a general purpose measure for $\lambda_{\max}$ when the true value of the latter may be negative. A split pipeline of chaos detection (classification) followed by Lyapunov exponent estimation using either topological divergences (when chaos has been identified) or other estimators (when chaos is not present) is likely to be more effective than using one or other type of measure exclusively.


In [9]:
# Rank every measure by the strength of its Spearman correlation on chaotic
# trajectories of length 500
df = merged_data
filtered_df = df.loc[df['Length'] == 500]

# One row per (measure type, measure) and one column per system
spearman_rho_pivot_table = pd.pivot_table(
    filtered_df,
    values='Spearman Rho (Chaos)',
    index=['Measure Type', 'Measure'],
    columns=['System'],
    aggfunc=lambda x: x,  # pass-through; presumably one value per cell — TODO confirm
)

# Per-system ranks on |rho|: the sign is ignored and the largest magnitude ranks first
system_columns = list(spearman_rho_pivot_table.columns)
rank_columns = [f"{system} Rank" for system in system_columns]
for system, rank_col_name in zip(system_columns, rank_columns):
    rho_magnitude = spearman_rho_pivot_table[system].abs()
    system_rank = rho_magnitude.rank(method='min', ascending=False)
    spearman_rho_pivot_table[rank_col_name] = system_rank.astype(int)

# Summarise the per-system ranks, then order the measures by mean rank (1 = best)
per_system_ranks = spearman_rho_pivot_table[rank_columns]
spearman_rho_pivot_table['Mean Rank'] = per_system_ranks.mean(axis=1)
spearman_rho_pivot_table['STD Rank'] = per_system_ranks.std(axis=1)
mean_rank = spearman_rho_pivot_table['Mean Rank']
spearman_rho_pivot_table['Place'] = mean_rank.rank(method='min', ascending=True).astype(int)

display(spearman_rho_pivot_table)

Unnamed: 0_level_0,System,Henon,Ikeda,Logistic,Tinkerbell,Henon Rank,Ikeda Rank,Logistic Rank,Tinkerbell Rank,Mean Rank,STD Rank,Place
Measure Type,Measure,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Classical Embedding,Eckmann,-0.117,-0.825,-0.553,0.414,29,11,20,23,20.75,7.5,22
Classical Embedding,Kantz,0.753,0.329,0.624,0.622,6,22,19,19,16.5,7.141,17
Classical Embedding,Rosenstein,0.235,-0.56,-0.514,0.801,27,17,21,3,17.0,10.198,18
DMT Cophenetic,DMT Cophenetic,0.747,0.876,0.889,0.761,7,3,9,12,7.75,3.775,2
DMT Cophenetic,DMT Cophenetic (Reverse),0.832,0.887,0.934,0.775,1,1,5,10,4.25,4.272,1
HVG Graph Distance,HVG L1,0.439,0.178,0.693,0.15,23,24,17,25,22.25,3.594,26
HVG Graph Distance,HVG W1,0.366,0.197,0.852,0.065,25,23,12,27,21.75,6.702,24
PLMT Cophenetic,PLMT Cophenetic,0.733,0.043,0.197,0.814,9,29,28,2,17.0,13.589,18
PLMT Cophenetic,PLMT Cophenetic (Reverse),0.745,0.062,0.185,0.817,8,27,29,1,16.25,13.889,16
PLMT Cophenetic,PLMT Cophenetic (Reverse) / Edges,0.816,0.437,0.462,0.79,2,19,24,4,12.25,10.905,11
