# How to create topological features for time series analysis

Topological data analysis is the study of shapes of point clouds and can be used in time series analysis.

The goal of this notebook is to showcase how the topological data analysis library ``giotto-tda`` can be used together with ``giotto-time`` to create topological features for time series analysis.

The **fit_transform** paradigm used in both libraries allows to create scikit-learn-like pipelines.

## Imports

In [1]:
import pandas as pd
import numpy as np
import gtda.time_series as ts
import gtda.diagrams as diag

from sklearn.preprocessing import FunctionTransformer
from gtda.pipeline import Pipeline
from gtda.homology import VietorisRipsPersistence
from gtime.compose import FeatureCreation
from gtime.feature_extraction import CustomFeature

# Helper functions to make giotto-time and giotto-tda work together
from util_funcs import compute_n_points, align_indices

## Create the data

In [2]:
df = pd.DataFrame()
idx = pd.date_range(start='1986-01-02', end='2019-12-31')
df['value'] = np.random.rand(len(idx))
df.index = idx

## Parameters
Various parameters related to the TDA functions

In [3]:
embedding_dimension = 10
embedding_time_delay = 1
takens_dimension = 3
takens_stride = 1
takens_time_delay = 1
takens_parameters_type = 'fixed'
sliding_window_width = 10
sliding_stride = 1
diags_metric = "euclidean"
diags_coeff = 2
diags_max_edge_length = np.inf
diags_homology_dimensions = (0, 1, 2)
diags_infinity_values = None
amplitude_metric = 'wasserstein'
amplitude_order = 2

## TDA transformers
For more information regarding ``giotto-tda`` visit: https://github.com/giotto-ai/giotto-tda. 
Documentation: https://docs-tda.giotto.ai

In [4]:
takens_embedding = ts.TakensEmbedding(
    parameters_type=takens_parameters_type,
    dimension=embedding_dimension,
    stride=takens_stride,
    time_delay=embedding_time_delay,
    n_jobs=-1,
)

sliding_window = ts.SlidingWindow(
    width=sliding_window_width, stride=sliding_stride
)

vietoris_rips_persistence = VietorisRipsPersistence(
    metric=diags_metric,
    coeff=diags_coeff,
    max_edge_length=diags_max_edge_length,
    homology_dimensions=diags_homology_dimensions,
    infinity_values=diags_infinity_values,
    n_jobs=-1,
)

diagram_scaler = diag.Scaler()

amplitude = diag.Amplitude(
    metric=amplitude_metric,
    order=amplitude_order,
    n_jobs=-1,
)

## Feature definition
We present four different features:
- Average lifetime
- Number of relevant holes
- Amplitude
- Mean support feature and argmax feature of the Betti curve

In [5]:
def compute_average_lifetime(persistence_diagrams, h_dim):
    """From the persistence diagrams detect the average lifetime for a given homology dimension. 
       Then, assign a value to each initial data points.

       Parameters
       ----------
       persistence_diagrams : np.array, required
            The persistence diagrams on which to compute the feature_extraction.
    
       h_dim : int, required
           The dimension of the homology to consider for the feature creation.

       Returns
       -------
       avg_lifetime : np.array
           The average lifetime feature values.

    """
    avg_lifetime = []

    for i in range(persistence_diagrams.shape[0]):
        persistence_table = pd.DataFrame(
            persistence_diagrams[i], columns=["birth", "death", "homology"]
        )
        persistence_table["lifetime"] = (
            persistence_table["death"] - persistence_table["birth"]
        )
        avg_lifetime.append(
            persistence_table[persistence_table["homology"] == h_dim][
                "lifetime"
            ].mean()
        )

    return avg_lifetime

In [6]:
def compute_num_relevant_holes(persistence_diagrams, h_dim, theta):
    """From the persistence diagrams detect the average lifetime for a given homology dimension. 
       Then, assign a value to each initial data points.

       Parameters
       ----------
       persistence_diagrams : np.array, required
            The persistence diagrams on which to compute the feature_extraction.
    
       h_dim : int, required
           The dimension of the homology to consider for the feature creation.
    
       theta : float, required
           The threshold ratio. Value between 0 and 1.

       Returns
       -------
       n_rel_holes : np.array
           The 'number of relevant holes' feature values.
           
    """
    n_rel_holes = []
    for i in range(persistence_diagrams.shape[0]):
        pers_table = pd.DataFrame(
            persistence_diagrams[i], columns=["birth", "death", "homology"]
        )

        pers_table["lifetime"] = pers_table["death"] - pers_table["birth"]
        threshold = (
            pers_table[pers_table["homology"] == h_dim]["lifetime"].max()
            * theta
        )
        n_rel_holes.append(
            pers_table[
                (pers_table["lifetime"] > threshold)
                & (pers_table["homology"] == h_dim)
            ].shape[0]
        )
    return n_rel_holes

In [7]:
def find_mean_nonzero(g):
    """ Helper function for Betti features.
    """
    if g.to_numpy().nonzero()[1].any():
        return g.to_numpy().nonzero()[1].mean()
    else:
        return 0
    
def compute_betti_features(X_betti_curves, betti_mode, betti_homology_dimension=0, betti_rolling=3):
    """ Calculate the feature from the Betti curves.
    
        Parameters
        ----------
        X_betti_curves : np.array, required
            The Betti curve to create the feature from.

        betti_mode : string, required
            Choose the type of feature: Either 'mean' or 'arg_max'. 

        betti_homology_dimension : int, default=0
            Dimension of the homology to use.

        betti_rolling : int, default=3
            The rolling window size for the feature creation.

        Returns
        -------
        betti_features : np.array
            The 'Betti curves' feature values.

    """
    betti_curves = pd.DataFrame(X_betti_curves[:, betti_homology_dimension, :])

    if betti_mode == "mean":
        betti_features = compute_betti_mean(betti_curves, betti_rolling)

    elif betti_mode == "arg_max":
        betti_features = compute_arg_max_by_time(betti_curves)

    else:
        raise ValueError(
            f"The valid values for 'betti_mode' are 'mean' "
            f"or 'arg_max', instead has value "
            f"{betti_mode}."
        )

    return betti_features

def compute_betti_mean(betti_surface, betti_rolling):
    """Helper function for Betti features.
    """
    betti_means = (betti_surface.groupby(betti_surface.index)
                   .apply(lambda g: find_mean_nonzero(g))
                   .rolling(betti_rolling)
                   .mean()
                   .values)
    return betti_means

def compute_arg_max_by_time(betti_surfaces):
    """Helper function for Betti features.
    """
    betti_arg_maxes = []
    for betti_surface in betti_surfaces:
        arg_max = np.argmax(np.array(betti_surface), axis=1)
        betti_arg_maxes.append(arg_max)

    return betti_arg_maxes

In [8]:
func = lambda d: compute_average_lifetime(d, 0)
avg_liftime_ft = FunctionTransformer(func)

func = lambda d: compute_num_relevant_holes(d, 0, 0.7)
num_rel_holes_ft = FunctionTransformer(func)

func = lambda bs: compute_betti_features(bs, betti_mode='mean')
betti_features_ft = FunctionTransformer(func)

## Define the TDA pipelines
We can define pipelines that take the TDA methods defined above. Then, the output can be reshaped into a pandas dataframe with the appropriate size.

In [9]:
def get_avg_lifetime_feature(df):
    tda_pipeline_avg_lifetime = Pipeline(steps=[ 
                                                 ('takens', takens_embedding),
                                                 ('sliding_window', sliding_window),
                                                 ('vietoris_rips', vietoris_rips_persistence),
                                                 ('scaler', diagram_scaler),
                                                 ('avg_lifetime', avg_liftime_ft)
                                               ])

    lifetime_feature = tda_pipeline_avg_lifetime.fit_transform(df)
    n_points = compute_n_points(len(lifetime_feature), 
                                sliding_stride, 
                                sliding_window_width, 
                                takens_stride, 
                                takens_dimension, 
                                embedding_time_delay)

    res = align_indices(df, n_points, lifetime_feature)
    return res

In [10]:
def get_num_rel_holes_feature(df):
    tda_pipeline_num_rel_holes = Pipeline(steps=[ 
                                                  ('takens', takens_embedding),
                                                  ('sliding_window', sliding_window),
                                                  ('vietoris_rips', vietoris_rips_persistence),
                                                  ('scaler', diagram_scaler),
                                                  ('num_rel_holes', num_rel_holes_ft)
                                                ])

    holes_feature = tda_pipeline_num_rel_holes.fit_transform(df)
    n_points = compute_n_points(len(holes_feature), 
                                sliding_stride, 
                                sliding_window_width, 
                                takens_stride, 
                                takens_dimension, 
                                embedding_time_delay)

    res = align_indices(df, n_points, holes_feature)
    return res

In [11]:
def get_betti_feature(df):
    tda_pipeline_betti_feature = Pipeline(steps=[ 
                                                  ('takens', takens_embedding),
                                                  ('sliding_window', sliding_window),
                                                  ('vietoris_rips', vietoris_rips_persistence),
                                                  ('scaler', diagram_scaler),
                                                  ('betti_feature', betti_features_ft)
                                                ])

    betti_feature = tda_pipeline_betti_feature.fit_transform(df)
    n_points = compute_n_points(len(betti_feature), 
                                sliding_stride, 
                                sliding_window_width, 
                                takens_stride, 
                                takens_dimension, 
                                embedding_time_delay)

    res = align_indices(df, n_points, betti_feature)
    return res

In [12]:
def get_amplitude(df):
    tda_pipeline_amplitude_feature = Pipeline(steps=[ 
                                                      ('takens', takens_embedding),
                                                      ('sliding_window', sliding_window),
                                                      ('vietoris_rips', vietoris_rips_persistence),
                                                      ('scaler', diagram_scaler),
                                                      ('amplitude', amplitude)
                                                    ])

    amplitude_feature = tda_pipeline_amplitude_feature.fit_transform(df)
    n_points = compute_n_points(len(amplitude_feature), 
                                sliding_stride, 
                                sliding_window_width, 
                                takens_stride, 
                                takens_dimension, 
                                embedding_time_delay)

    res = align_indices(df, n_points, amplitude_feature)
    return res

## Define the time series feature creation pipeline
Now we can use the transformers defined above in the feature creation methods of giotto-time.

In [13]:
feature_creation = FeatureCreation([
                                    ('average_lifetime', CustomFeature(get_avg_lifetime_feature), ['value']),
                                    ('num_holes', CustomFeature(get_num_rel_holes_feature), ['value']),
                                    ('betti_feature', CustomFeature(get_betti_feature), ['value']),
                                    ('amplitude', CustomFeature(get_amplitude), ['value'])
                                   ])

res = feature_creation.fit_transform(df)

In [14]:
res.head(20)

Unnamed: 0,average_lifetime__value__CustomFeature,num_holes__value__CustomFeature,betti_feature__value__CustomFeature,amplitude__value__CustomFeature
1986-01-02,,,,
1986-01-03,,,,
1986-01-04,,,,
1986-01-05,,,,
1986-01-06,,,,
1986-01-07,,,,
1986-01-08,,,,
1986-01-09,,,,
1986-01-10,1.033121,10.0,,2.324197
1986-01-11,1.033121,10.0,,2.324197
