In [92]:
import perfplot
from typing import Union, Callable
import polars as pl
import pandas as pd
from functime.feature_extraction import tsfresh as f_ts
from tsfresh.feature_extraction import feature_calculators as tsfresh

## Global Variable

We are using the M4 dataset. We create a `pd.DataFrame`, a `pl.DataFrame` and a `pl.LazyFrame` from it. Then we define a list of tuples, each with the following structure:
<br>
(<br>
&emsp;  `<functime_function_name>`,<br>
&emsp;  `<tsfresh_function_name>`,<br>
&emsp;  `<parameters_for_functime_function>`,<br>
&emsp;   `<parameters_for_tsfresh_function>`<br>
)<br>

In [93]:
# Location of the M4 training set, relative to this notebook.
_M4_DATASET = "../data/m4_1d_train.parquet"

# Melt the parquet file into long format, keep only the "value" column
# (the "variable" column produced by the melt is dropped), remove missing
# values and renumber the rows.
DF_PANDAS = (
    pd.read_parquet(_M4_DATASET)
    .melt()
    .drop(columns=["variable"])
    .dropna()
    .reset_index(drop=True)
)

# The same data as an eager polars frame and as a lazy polars frame.
DF_PL_EAGER = pl.from_pandas(DF_PANDAS)
DF_PL_LAZY = DF_PL_EAGER.lazy()

In [94]:
# Benchmark catalogue. Each entry is a 4-tuple:
#   (functime feature, tsfresh feature, functime kwargs, tsfresh kwargs)
# The two kwargs dicts are unpacked into the respective function by `benchmark`.
# Commented-out entries are features that are currently not benchmarked.
_FUNC_PARAMS_BENCH = [
    (f_ts.absolute_energy, tsfresh.abs_energy, {}, {}),
    (f_ts.absolute_maximum, tsfresh.absolute_maximum, {}, {}),
    (f_ts.absolute_sum_of_changes, tsfresh.absolute_sum_of_changes, {}, {}),
    (f_ts.approximate_entropy, tsfresh.approximate_entropy, {"run_length": 2, "filtering_level": 0.5}, {"m": 2, "r": 0.5}),
    # (f_ts.augmented_dickey_fuller, tsfresh.augmented_dickey_fuller, "param")
    (f_ts.autocorrelation, tsfresh.autocorrelation, {"n_lags": 4}, {"lag": 4}),
    # The comprehension must live inside the list so that all five coefficients
    # are requested; with the `for` outside the brackets it was a dict
    # comprehension that kept only the last one ({"coeff": 4, "k": 4}).
    (f_ts.autoregressive_coefficients, tsfresh.ar_coefficient, {"n_lags": 4}, {"param": [{"coeff": i, "k": 4} for i in range(5)]}),
    (f_ts.benford_correlation2, tsfresh.benford_correlation, {}, {}),
    (f_ts.benford_correlation, tsfresh.benford_correlation, {}, {}),
    (f_ts.binned_entropy, tsfresh.binned_entropy, {"bin_count": 10}, {"max_bins": 10}),
    (f_ts.c3, tsfresh.c3, {"n_lags": 10}, {"lag": 10}),
    (f_ts.change_quantiles, tsfresh.change_quantiles, {"q_low": 0.1, "q_high": 0.9, "is_abs": True}, {"ql": 0.1, "qh": 0.9, "isabs": True, "f_agg": "mean"}),
    (f_ts.cid_ce, tsfresh.cid_ce, {"normalize": True}, {"normalize": True}),
    (f_ts.count_above, tsfresh.count_above, {"threshold": 0.0}, {"t": 0.0}),
    (f_ts.count_above_mean, tsfresh.count_above_mean, {}, {}),
    (f_ts.count_below, tsfresh.count_below, {"threshold": 0.0}, {"t": 0.0}),
    (f_ts.count_below_mean, tsfresh.count_below_mean, {}, {}),
    # (f_ts.cwt_coefficients, tsfresh.cwt_coefficients, {"widths": (1, 2, 3), "n_coefficients": 2},{"param": {"widths": (1, 2, 3), "coeff": 2, "w": 1}}),
    (f_ts.energy_ratios, tsfresh.energy_ratio_by_chunks, {"n_chunks": 6}, {"param": [{"num_segments": 6, "segment_focus": i} for i in range(6)]}),
    (f_ts.first_location_of_maximum, tsfresh.first_location_of_maximum, {}, {}),
    (f_ts.first_location_of_minimum, tsfresh.first_location_of_minimum, {}, {}),
    # (f_ts.fourier_entropy, tsfresh.fourier_entropy, {"n_bins": 10}, {"bins": 10}),
    # (f_ts.friedrich_coefficients, tsfresh.friedrich_coefficients, {"polynomial_order": 3, "n_quantiles": 30}, {"params": [{"m": 3, "r": 30}]}),
    (f_ts.has_duplicate, tsfresh.has_duplicate, {}, {}),
    (f_ts.has_duplicate_max, tsfresh.has_duplicate_max, {}, {}),
    (f_ts.has_duplicate_min, tsfresh.has_duplicate_min, {}, {}),
    (f_ts.index_mass_quantile, tsfresh.index_mass_quantile, {"q": 0.5}, {"param": [{"q": 0.5}]}),
    (f_ts.large_standard_deviation, tsfresh.large_standard_deviation, {"ratio": 0.25}, {"r": 0.25}),
    (f_ts.last_location_of_maximum, tsfresh.last_location_of_maximum, {}, {}),
    (f_ts.last_location_of_minimum, tsfresh.last_location_of_minimum, {}, {}),
    # (f_ts.lempel_ziv_complexity, tsfresh.lempel_ziv_complexity, {"n_bins": 5}, {"bins": 5}),
    # (f_ts.linear_trend, tsfresh.linear_trend, {}, {"params": [{"attr": "slope"}, {"attr": "intercept"}]}),
    (f_ts.longest_streak_above_mean, tsfresh.longest_strike_above_mean, {}, {}),
    (f_ts.longest_streak_below_mean, tsfresh.longest_strike_below_mean, {}, {}),
    (f_ts.mean_abs_change, tsfresh.mean_abs_change, {}, {}),
    (f_ts.mean_change, tsfresh.mean_change, {}, {}),
    (f_ts.mean_n_absolute_max, tsfresh.mean_n_absolute_max, {"n_maxima": 20}, {"number_of_maxima": 20}),
    (f_ts.mean_second_derivative_central, tsfresh.mean_second_derivative_central, {}, {}),
    (f_ts.number_crossings, tsfresh.number_crossing_m, {"crossing_value": 0.0}, {"m": 0.0}),
    (f_ts.number_cwt_peaks, tsfresh.number_cwt_peaks, {"max_width": 5}, {"n": 5}),
    (f_ts.number_peaks, tsfresh.number_peaks, {"support": 5}, {"n": 5}),
    # (f_ts.partial_autocorrelation, tsfresh.partial_autocorrelation, "param"),
    (f_ts.percent_reoccurring_values, tsfresh.percentage_of_reoccurring_values_to_all_values, {}, {}),
    (f_ts.percent_reoccurring_points, tsfresh.percentage_of_reoccurring_datapoints_to_all_datapoints, {}, {}),
    (f_ts.permutation_entropy, tsfresh.permutation_entropy, {"tau": 1,"n_dims": 3}, {"tau": 1,"dimension": 3}),
    (f_ts.range_count, tsfresh.range_count, {"lower": 0, "upper": 9, "closed": 'none'}, {"min": 0, "max": 9}),
    (f_ts.ratio_beyond_r_sigma, tsfresh.ratio_beyond_r_sigma, {"ratio": 2}, {"r": 2}),
    (f_ts.ratio_n_unique_to_length, tsfresh.ratio_value_number_to_time_series_length, {}, {}),
    (f_ts.root_mean_square, tsfresh.root_mean_square, {}, {}),
    (f_ts.sample_entropy, tsfresh.sample_entropy, {}, {}),
    (f_ts.spkt_welch_density, tsfresh.spkt_welch_density, {"n_coeffs": 10}, {"param": [{"coeff": i} for i in range(10)]}),
    (f_ts.sum_reoccurring_points, tsfresh.sum_of_reoccurring_data_points, {}, {}),
    (f_ts.sum_reoccurring_values, tsfresh.sum_of_reoccurring_values, {}, {}),
    (f_ts.symmetry_looking, tsfresh.symmetry_looking, {"ratio": 0.25}, {"param": [{"r": 0.25}]}),
    (f_ts.time_reversal_asymmetry_statistic, tsfresh.time_reversal_asymmetry_statistic, {"n_lags": 3}, {"lag": 3}),
    (f_ts.variation_coefficient, tsfresh.variation_coefficient, {}, {}),
    (f_ts.var_gt_std, tsfresh.variance_larger_than_standard_deviation, {}, {})
]

## Benchmark tsfresh vs functime

Benchmark function comparing the performance of a functime feature against its tsfresh counterpart. It takes the contents of one entry of `_FUNC_PARAMS_BENCH`. The time series lengths are 100_000, 1_000_000 and 9_000_000 (except for the entropy-related features, which only use 10_000).

In [95]:
def benchmark(f_feat: Callable, ts_feat: Callable, f_params: dict, ts_params: dict, expr: bool):
    """Time one functime feature against its tsfresh counterpart with perfplot.

    Both kernels receive the first ``n`` rows of the module-level frames
    (``DF_PANDAS`` for tsfresh, ``DF_PL_EAGER`` for functime) and operate on
    their ``"value"`` column.

    Args:
        f_feat: functime feature function.
        ts_feat: tsfresh feature function, called on the pandas column.
        f_params: keyword arguments unpacked into ``f_feat``.
        ts_params: keyword arguments unpacked into ``ts_feat``.
        expr: if True, ``f_feat`` is applied to ``pl.col("value")`` inside a
            ``select``; otherwise it is called directly on the polars column.

    Returns:
        The object returned by ``perfplot.bench``.
    """
    # These four features are only measured at a single, smaller size.
    small_input_features = (
        "binned_entropy",
        "approximate_entropy",
        "permutation_entropy",
        "sample_entropy",
    )
    if f_feat.__name__ in small_input_features:
        n_range = [10_000]
    else:
        n_range = [100_000, 1_000_000, 9_000_000]

    def run_tsfresh(pandas_df, _polars_df):
        return ts_feat(pandas_df["value"], **ts_params)

    def run_functime(_pandas_df, polars_df):
        if expr:
            return polars_df.select(f_feat(pl.col("value"), **f_params))
        return f_feat(polars_df["value"], **f_params)

    # One label per kernel, in kernel order. The previous list carried a
    # duplicated "tsfresh" entry, which left the functime kernel paired with
    # the wrong label.
    return perfplot.bench(
        setup=lambda n: (DF_PANDAS.head(n), DF_PL_EAGER.head(n)),
        kernels=[run_tsfresh, run_functime],
        n_range=n_range,
        equality_check=False,
        labels=["tsfresh", "functime"],
    )

## Benchmark tsfresh vs functime for all the functions

Loop over `_FUNC_PARAMS_BENCH` and call `benchmark()` for each iteration.

In [97]:
def all_benchmarks(params: list[tuple], expr: bool) -> pl.DataFrame:
    """Run `benchmark` for every entry of ``params`` and tabulate the timings.

    Args:
        params: entries shaped like those of ``_FUNC_PARAMS_BENCH``:
            ``(functime_fn, tsfresh_fn, functime_kwargs, tsfresh_kwargs)``.
        expr: forwarded to ``benchmark``.

    Returns:
        A ``pl.DataFrame`` with one row per (feature, n). Timings are
        converted from seconds to milliseconds; ``diff`` columns are functime
        minus tsfresh, ``diff %`` is relative to the tsfresh timing and
        ``speedup`` is tsfresh time divided by functime time. Results of later
        entries are placed before those of earlier entries. Features whose
        benchmark raises are reported on stdout and left out of the table.
    """
    # NOTE(review): "tfresh (ms)" looks like a typo for "tsfresh (ms)"; the
    # name is kept so that the returned schema does not change.
    bench_df = pl.DataFrame(
        schema={
            "id": pl.Utf8,
            "n": pl.Int64,
            "tfresh (ms)": pl.Float64,
            "functime (ms)": pl.Float64,
            "diff (ms)": pl.Float64,
            "diff %": pl.Float64,
            "speedup": pl.Float64
        }
    )
    for entry in params:
        feature_name = entry[0].__name__
        print("Feature: {}".format(feature_name))
        try:
            f_feat, ts_feat, f_params, ts_params = entry[:4]
            bench = benchmark(
                f_feat = f_feat,
                ts_feat = ts_feat,
                f_params = f_params,
                ts_params = ts_params,
                expr = expr
            )
            tsfresh_s = bench.timings_s[0]
            functime_s = bench.timings_s[1]
            feature_df = pl.DataFrame({
                "id": [feature_name]*len(bench.n_range),
                "n": bench.n_range,
                "tfresh (ms)": tsfresh_s*1_000,
                "functime (ms)": functime_s*1_000,
                "diff (ms)": (functime_s - tsfresh_s)*1_000,
                "diff %": 100*(functime_s - tsfresh_s) / tsfresh_s,
                "speedup": tsfresh_s / functime_s
            })
            bench_df = pl.concat([feature_df, bench_df])
        except Exception as exc:
            # Best effort: one failing feature must not abort the whole run.
            # Catching `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt/SystemExit stop a long benchmark, and the
            # cause is printed instead of being silently discarded.
            print("Failure for feature: {} ({}: {})".format(
                feature_name, type(exc).__name__, exc
            ))
    return bench_df

## Run and save the benchmark

In [98]:
def benchmark_table(df: pl.DataFrame, n: Union[int, None] = None):
    """Select benchmark rows for one input size and append a row-wise average.

    Args:
        df: benchmark results containing an ``"n"`` column.
        n: input size to keep. When omitted, no filtering is applied and all
            rows are kept.

    Returns:
        The selected rows with an extra ``"avg"`` column holding
        ``DataFrame.mean(axis=1)`` of the selected frame.
    """
    df_pivot = df if n is None else df.filter(pl.col("n") == n)
    # Previously the frame carrying "avg" was built and then discarded, so
    # the returned table had no "avg" column to sort by.
    # NOTE(review): the mean runs over the frame's columns as polars defines
    # it for `mean(axis=1)` - confirm which columns should contribute.
    return df_pivot.with_columns(
        avg = df_pivot.mean(axis=1)
    )

In [99]:
# Run the whole catalogue twice: once applying the functime features as polars
# expressions, then once calling them directly on the column.
bench_expr = all_benchmarks(_FUNC_PARAMS_BENCH, expr=True)
bench_series = all_benchmarks(_FUNC_PARAMS_BENCH, expr=False)

Feature: absolute_energy


Feature: absolute_maximum


Feature: absolute_sum_of_changes


Feature: approximate_entropy


INFO:functime.feature_extraction.tsfresh:Expression version of approximate_entropy is not yet implemented due to technical difficulty regarding Polars Expression Plugins.


Failure for feature: approximate_entropy
Feature: autocorrelation


Feature: autoregressive_coefficients


INFO:functime.feature_extraction.tsfresh:Expression version of autoregressive_coefficients is not yet implemented due to technical difficulty regarding Polars Expression Plugins.


Failure for feature: autoregressive_coefficients
Feature: benford_correlation2


Feature: benford_correlation


Feature: binned_entropy


Feature: c3


Feature: change_quantiles


Feature: cid_ce


Feature: count_above


Feature: count_above_mean


Feature: count_below


Feature: count_below_mean


Feature: energy_ratios


Feature: first_location_of_maximum


Feature: first_location_of_minimum


Feature: has_duplicate


Feature: has_duplicate_max


Feature: has_duplicate_min


Feature: index_mass_quantile


Feature: large_standard_deviation


Feature: last_location_of_maximum


Feature: last_location_of_minimum


Feature: longest_streak_above_mean


Feature: longest_streak_below_mean


Feature: mean_abs_change


Feature: mean_change


Feature: mean_n_absolute_max


Feature: mean_second_derivative_central


Feature: number_crossings


Feature: number_cwt_peaks


Failure for feature: number_cwt_peaks
Feature: number_peaks


Feature: percent_reoccurring_values


Feature: percent_reoccurring_points


Feature: permutation_entropy


Feature: range_count


Feature: ratio_beyond_r_sigma


Feature: ratio_n_unique_to_length


Feature: root_mean_square


Feature: sample_entropy


INFO:functime.feature_extraction.tsfresh:Expression version of sample_entropy is not yet implemented due to technical difficulty regarding Polars Expression Plugins.


Failure for feature: sample_entropy
Feature: spkt_welch_density


INFO:functime.feature_extraction.tsfresh:Expression version of spkt_welch_density is not yet implemented due to technical difficulty regarding Polars Expression Plugins.


Failure for feature: spkt_welch_density
Feature: sum_reoccurring_points


Feature: sum_reoccurring_values


Feature: symmetry_looking


Feature: time_reversal_asymmetry_statistic


Feature: variation_coefficient


Feature: var_gt_std


Feature: absolute_energy


Feature: absolute_maximum


Feature: absolute_sum_of_changes


Feature: approximate_entropy


Feature: autocorrelation


Feature: autoregressive_coefficients


Failure for feature: autoregressive_coefficients
Feature: benford_correlation2


Feature: benford_correlation


Feature: binned_entropy


Feature: c3


Feature: change_quantiles


Feature: cid_ce


Feature: count_above


Feature: count_above_mean


Feature: count_below


Feature: count_below_mean


Feature: energy_ratios


Feature: first_location_of_maximum


Feature: first_location_of_minimum


Feature: has_duplicate


Feature: has_duplicate_max


Feature: has_duplicate_min


Feature: index_mass_quantile


Feature: large_standard_deviation


Feature: last_location_of_maximum


Feature: last_location_of_minimum


Feature: longest_streak_above_mean


Feature: longest_streak_below_mean


Feature: mean_abs_change


Feature: mean_change


Feature: mean_n_absolute_max


Feature: mean_second_derivative_central


Feature: number_crossings


Feature: number_cwt_peaks


Feature: number_peaks


Feature: percent_reoccurring_values


In [89]:
# Build the expression-API table restricted to the rows measured at n = 1_000_000.
df_expr = benchmark_table(bench_expr, n = 1_000_000)

In [None]:
# Display the raw expression-API benchmark results.
bench_expr

In [90]:
# Display the current `df_expr` table.
df_expr

id,n,tsfresh,functime
str,i64,f64,f64
"""var_gt_std""",1000000,0.002881,0.002606
"""variation_coef…",1000000,0.004798,0.002714
"""time_reversal_…",1000000,0.015402,0.009836
"""symmetry_looki…",1000000,0.02267,0.008817
"""sum_reoccurrin…",1000000,0.104558,0.196502
"""sum_reoccurrin…",1000000,0.102836,0.067817
"""root_mean_squa…",1000000,0.004307,0.002179
"""ratio_n_unique…",1000000,0.097095,0.026159
"""ratio_beyond_r…",1000000,0.011673,0.007678
"""range_count""",1000000,0.002964,0.001395


In [77]:
# Rebuild the expression-API table and persist it as both Parquet and CSV.
# NOTE(review): `benchmark_table` is called here without `n` - confirm that
# the function accepts a call without it.
df_expr = benchmark_table(bench_expr)
# "benchmark_tsfresh_vs_functime_expr.parquet" doesn't work
df_expr.write_parquet("../benchmarks/benchmark_tsfresh_vs_functime_expr.parquet")
df_expr.write_csv("../benchmarks/benchmark_tsfresh_vs_functime_expr.csv")

In [78]:
# Build the Series-API table and persist it as both Parquet and CSV.
# NOTE(review): `benchmark_table` is called here without `n` - confirm that
# the function accepts a call without it.
df_series = benchmark_table(bench_series)
# "benchmark_tsfresh_vs_functime_series.parquet" doesn't work
df_series.write_parquet("../benchmarks/benchmark_tsfresh_vs_functime_series.parquet")
df_series.write_csv("../benchmarks/benchmark_tsfresh_vs_functime_series.csv")

In [88]:
# Show the Series-API table ordered by its "avg" column, smallest first.
df_series.sort("avg", descending=False)

id,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,avg
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""index_mass_qua…",0.174521,0.198189,0.201079,0.208554,0.229889,0.259474,0.304603,0.40529,0.503005,0.612446,0.677704,0.747499,0.83376,0.981431,0.981555,0.941583,0.909031,0.929857,0.561082
"""autocorrelatio…",0.296323,0.354122,0.310108,0.358624,0.362209,0.389928,0.396484,0.447311,0.501751,0.639353,0.630226,0.686942,0.99332,1.16388,1.232318,1.174866,0.07986,0.108567,0.562566
"""permutation_en…",0.336182,0.356549,0.440328,0.46933,0.542925,0.657977,0.778233,0.857372,0.92097,,,,,,,,,,0.595541
"""absolute_sum_o…",0.320174,0.326045,0.329683,0.347966,0.352572,0.359892,0.397486,0.483152,0.556492,0.65382,0.772187,0.867855,1.048993,1.313984,1.322091,1.32463,1.257814,1.407517,0.746797
"""mean_abs_chang…",0.390833,0.385852,0.401242,0.419968,0.420954,0.427715,0.461077,0.541752,0.617988,0.699121,0.782452,0.906158,1.08846,1.357252,1.311463,1.374376,1.266798,1.336957,0.788357
"""last_location_…",0.358994,0.369827,0.369568,0.376912,0.388438,0.410136,0.452418,0.572225,0.662741,0.787734,0.905737,0.977296,0.992894,1.384916,1.333577,1.335416,1.34799,1.300527,0.795964
"""last_location_…",0.361468,0.372508,0.363029,0.374872,0.391914,0.415788,0.452898,0.572696,0.670314,0.792815,0.910206,0.970728,0.991174,1.388127,1.328401,1.345984,1.345677,1.321067,0.798315
"""sum_reoccurrin…",0.272408,0.295867,0.309543,0.356601,0.483572,0.617228,0.796433,1.094123,1.230199,1.19711,1.199312,1.112347,1.064209,1.1651,1.041117,0.954386,0.865803,0.794312,0.824982
"""time_reversal_…",0.339864,0.359472,0.361553,0.391658,0.407593,0.439316,0.508921,0.622822,0.812009,0.877346,0.912115,1.023982,1.422121,1.463408,1.423925,1.408475,1.443441,1.42626,0.869127
"""percent_reoccu…",0.921396,0.931667,0.919712,0.95868,1.061106,1.067157,1.145341,1.363867,1.411684,1.395673,1.362543,1.265665,1.159791,1.263522,1.042474,1.014166,0.816558,0.629554,1.096142
