In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from hrvanalysis import get_time_domain_features, get_frequency_domain_features
from hrvanalysis.preprocessing import get_nn_intervals
import biosppy.signals.ecg as ecg
from tqdm import tqdm
import neurokit2 as nk
from sklearn.utils import class_weight
import json
from tsfresh import extract_features
import sys

import warnings
warnings.simplefilter("ignore")

sys.path.append('..')
from util import load_base_data, make_serializable, normalize, load_datasets_concat

plt.rcParams['figure.figsize'] = [20, 5]

In [3]:
from tsfresh.feature_extraction.feature_calculators import \
absolute_sum_of_changes, absolute_maximum, \
agg_autocorrelation, approximate_entropy, \
benford_correlation, cid_ce, \
count_above_mean, count_below_mean, \
kurtosis, longest_strike_above_mean, \
longest_strike_below_mean, mean_abs_change, \
mean_second_derivative_central, number_crossing_m, \
percentage_of_reoccurring_datapoints_to_all_datapoints, \
percentage_of_reoccurring_values_to_all_values, \
ratio_value_number_to_time_series_length, root_mean_square, \
sample_entropy, skewness, standard_deviation, \
sum_of_reoccurring_data_points, sum_values, \
value_count, variance_larger_than_standard_deviation, \
variation_coefficient

In [4]:
%%time
# Sampling rate of the signals (presumably in Hz — TODO confirm); not referenced by any
# other cell visible in this notebook.
SAMPLING_RATE = 300
# load_base_data is imported from ../util, which is not part of this notebook.
# NOTE(review): the meaning of just_train / standard / flip is defined there — confirm before changing.
# The recorded run of this cell took about 1min 48s wall time.
X_train, y_train, X_test = load_base_data(just_train=False, standard=True, flip=True)

CPU times: user 1min 31s, sys: 6.31 s, total: 1min 37s
Wall time: 1min 48s


In [10]:
# Sanity check: the feature vector of the first training signal must have exactly one
# entry per label in feature_names.
# NOTE(review): get_features and feature_names are defined in cells further down, so on a
# fresh "Restart & Run All" this cell raises NameError; it has to run after those cells.
x = X_train.iloc[0].dropna().to_numpy(dtype='float32')
len(get_features(x))==len(feature_names)

True

In [7]:
# Column labels for the feature matrix, listed in the same order as the vector
# assembled by get_features (they are used as DataFrame columns when saving).
feature_names = [
    "absolute_maximum",
    "absolute_sum_of_changes",
    "benford_correlation",
    "cid_ce",
    "count_above_mean",
    "count_below_mean",
    "kurtosis",
    "longest_strike_above_mean",
    "longest_strike_below_mean",
    "mean_abs_change",
    "mean_second_derivative_central",
    # one crossing-count column per threshold level
    *(f"number_crossing_{level}" for level in ("0.0", "0.5", "-0.5", "1.0", "-1.0")),
    "ratio_value_number_to_time_series_length",
    "root_mean_square",
    "skewness",
    "standard_deviation",
    "sum_of_reoccurring_data_points",
    "sum_values",
    "value_count",
    "variance_larger_than_standard_deviation",
    "variation_coefficient",
]

In [12]:
def get_features(x):
    """Compute the selected tsfresh statistics for a single signal.

    Count- and sum-style statistics are divided by the signal length
    (``x.shape[-1]``) so that signals of different length stay comparable.

    Parameters
    ----------
    x : array-like exposing ``.shape``
        One signal; the callers in this notebook pass a 1-D float32 numpy
        array with NaN entries already dropped.

    Returns
    -------
    numpy.ndarray
        25 values, in the same order as ``feature_names``. The boolean
        ``variance_larger_than_standard_deviation`` is stored alongside the
        numeric values in the same array.
    """
    n = x.shape[-1]

    abs_max = absolute_maximum(x)
    abs_sum_changes = absolute_sum_of_changes(x) / n
    benford = benford_correlation(x)
    complexity = cid_ce(x, normalize=True)
    above_mean = count_above_mean(x) / n
    below_mean = count_below_mean(x) / n
    kurt = kurtosis(x)  # already have?
    strike_above = longest_strike_above_mean(x)
    strike_below = longest_strike_below_mean(x)
    mean_abs_chg = mean_abs_change(x)
    second_deriv = mean_second_derivative_central(x)
    # Crossing counts for several thresholds; the threshold values can be tuned.
    cross_p00 = number_crossing_m(x, 0) / n
    cross_p05 = number_crossing_m(x, 0.5) / n
    cross_m05 = number_crossing_m(x, -0.5) / n
    cross_p10 = number_crossing_m(x, 1.0) / n
    cross_m10 = number_crossing_m(x, -1.0) / n
    unique_ratio = ratio_value_number_to_time_series_length(x)
    rms = root_mean_square(x)
    skew = skewness(x)  # already have?
    std = standard_deviation(x)  # already have?
    reoccurring_sum = sum_of_reoccurring_data_points(x) / n
    mean_value = sum_values(x) / n
    # value_count and variation_coefficient need distinct names: sharing one
    # variable made the later assignment overwrite the earlier one, so the
    # "value_count" slot ended up holding the variation coefficient.
    zero_count = value_count(x, 0.) / n  # can tune value
    var_gt_std = variance_larger_than_standard_deviation(x)  # bool
    var_coef = variation_coefficient(x)

    return np.array([
        abs_max, abs_sum_changes, benford, complexity, above_mean, below_mean,
        kurt, strike_above, strike_below, mean_abs_chg, second_deriv,
        cross_p00, cross_p05, cross_m05, cross_p10, cross_m10,
        unique_ratio, rms, skew, std, reoccurring_sum, mean_value,
        zero_count, var_gt_std, var_coef,
    ])

# def generate_features(data, norm=False):
def generate_features(data):
    """Build the feature matrix for a frame holding one signal per row.

    Rows are taken by position (``iloc``), so the frame does not need a
    default 0..n-1 index. NaN entries of each row are dropped before the
    row is converted to a float32 array and passed to ``get_features``.
    No normalisation is applied to the signal.

    Parameters
    ----------
    data : pandas.DataFrame
        One signal per row.

    Returns
    -------
    numpy.ndarray
        One row of features per input row, in input order.
    """
    feature_vecs = []
    for pos in tqdm(range(len(data))):
        # Positional access: a label lookup with .loc[pos] raises KeyError
        # whenever the index is not exactly 0..n-1.
        signal = data.iloc[pos].dropna().to_numpy(dtype='float32')
        feature_vecs.append(get_features(signal))
    return np.array(feature_vecs)

In [13]:
X_test_features = generate_features(X_test)

100%|██████████████████| 3411/3411 [01:13<00:00, 46.41it/s]


In [14]:
# Label the test feature matrix with feature_names and write it to X_test.csv in the
# current working directory; the row index is written out as the "id" column.
# NOTE(review): read_data() below loads from ../data/<dataname>/, so the file has to be
# moved there by hand — confirm the intended location.
X_test_df = pd.DataFrame(X_test_features, columns=feature_names)
X_test_df.to_csv("X_test.csv", index_label="id")

In [15]:
X_train_features = generate_features(X_train)

100%|██████████████████| 5117/5117 [02:01<00:00, 41.96it/s]


In [16]:
# Label the training feature matrix with feature_names and write it to X_train.csv in the
# current working directory; the row index is written out as the "id" column.
X_train_df = pd.DataFrame(X_train_features, columns=feature_names)
X_train_df.to_csv("X_train.csv", index_label="id")

In [13]:
def read_data(dataname):
    """Load the saved feature tables of one feature set.

    Reads ``X_test.csv`` and ``X_train.csv`` from ``../data/<dataname>/``,
    using the ``id`` column as the index.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` as pandas DataFrames.
    """
    folder = f'../data/{dataname}'
    # Same read order as before: test first, then train.
    frames = {
        split: pd.read_csv(f'{folder}/X_{split}.csv', index_col='id')
        for split in ('test', 'train')
    }
    return frames['train'], frames['test']
def concat_data(x1, x2):
    """Return x1 and x2 joined side by side (column-wise) via pd.concat(axis=1)."""
    return pd.concat([x1, x2], axis=1)

In [None]:
concat_data(X_test1, X_test2).columns

In [14]:
# Load the saved train/test feature tables of two feature sets from ../data/<name>/.
X_train1, X_test1 = read_data("rpqst")
X_train2, X_test2 = read_data("tsfresh/norm")
# Disabled: side-by-side join of the two feature sets.
# concat_data(X_train1, X_train2)
# concat_data(X_test1, X_test2)

In [17]:
# NOTE(review): in the recorded output every displayed row holds exactly the same values,
# and the value_count column equals variation_coefficient (11.893452). The saved
# "tsfresh/norm" table looks wrong — verify how it was generated before using it.
X_train2

Unnamed: 0_level_0,absolute_maximum,absolute_sum_of_changes,benford_correlation,cid_ce,count_above_mean,count_below_mean,kurtosis,longest_strike_above_mean,longest_strike_below_mean,mean_abs_change,...,number_crossing_m,ratio_value_number_to_time_series_length,root_mean_square,skewness,standard_deviation,sum_of_reoccurring_data_points,sum_values,value_count,variance_larger_than_standard_deviation,variation_coefficient
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,8.908099,0.983158,25.469281,0.318833,0.681167,5.868714,157.0,147.0,8.908645,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
1,1.0,8.908099,0.983158,25.469281,0.318833,0.681167,5.868714,157.0,147.0,8.908645,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
2,1.0,8.908099,0.983158,25.469281,0.318833,0.681167,5.868714,157.0,147.0,8.908645,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
3,1.0,8.908099,0.983158,25.469281,0.318833,0.681167,5.868714,157.0,147.0,8.908645,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
4,1.0,8.908099,0.983158,25.469281,0.318833,0.681167,5.868714,157.0,147.0,8.908645,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5112,1.0,8.908099,0.983158,25.469281,0.318833,0.681167,5.868714,157.0,147.0,8.908645,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
5113,1.0,8.908099,0.983158,25.469281,0.318833,0.681167,5.868714,157.0,147.0,8.908645,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
5114,1.0,8.908099,0.983158,25.469281,0.318833,0.681167,5.868714,157.0,147.0,8.908645,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
5115,1.0,8.908099,0.983158,25.469281,0.318833,0.681167,5.868714,157.0,147.0,8.908645,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452


In [10]:
# load_datasets_concat is imported from ../util (not shown in this notebook); it is given
# the same two feature-set names that read_data() is called with elsewhere.
# NOTE(review): presumably element 0 is the concatenated training frame — confirm in util.
concated = load_datasets_concat(["rpqst", "tsfresh/norm"])
concated[0]

Unnamed: 0_level_0,r_peaks_mean,r_peaks_std,p_peaks_mean,p_peaks_std,q_peaks_mean,q_peaks_std,s_peaks_mean,s_peaks_std,t_peaks_mean,t_peaks_std,...,number_crossing_m,ratio_value_number_to_time_series_length,root_mean_square,skewness,standard_deviation,sum_of_reoccurring_data_points,sum_values,value_count,variance_larger_than_standard_deviation,variation_coefficient
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,360.973552,60.549258,23.024318,53.339918,-75.462762,36.933234,-94.187858,38.943724,212.437735,54.624509,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
1,172.301849,99.888525,46.429803,93.417720,-56.266026,78.056169,-130.146795,92.378322,126.716768,88.432758,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
2,694.707115,112.790543,75.581344,46.206998,-20.526530,34.777344,-185.048683,138.027523,66.282233,163.354397,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
3,373.304298,206.867062,159.766363,110.401163,-89.654383,98.177982,-417.869831,119.835301,175.863735,91.360514,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
4,142.958246,98.393425,46.176553,83.475237,-92.118180,66.808841,-330.187234,73.435437,198.051724,90.103197,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5112,490.969645,447.664723,217.495209,318.722379,-210.245599,319.695741,-194.492001,417.841756,333.605075,264.987635,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
5113,338.683656,114.511938,77.365426,134.405367,-20.937974,123.244059,-87.732166,50.107700,-22.544351,27.265979,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
5114,501.862512,210.136661,63.117555,238.620584,-91.615532,153.481973,-274.956505,210.286097,275.287498,93.969861,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452
5115,254.761004,130.199229,24.264114,159.677432,-36.097183,145.970293,-50.241492,64.242307,98.452844,46.864996,...,0.027019,0.043867,109.379965,2.092335,108.995377,5.633807,9.164318,11.893452,1.0,11.893452


In [11]:
# NOTE(review): the recorded run of this cell raised NameError. X_train1 is only created by
# the read_data("rpqst") cell, which had not been executed in that kernel session.
# Remove this cell or make sure it runs after that one.
X_train1

NameError: name 'X_train1' is not defined

In [None]:
%%time
# Run every candidate calculator once on x so that %%time reports their combined cost.
# All return values are discarded; x must already hold a single signal array.
absolute_maximum(x)
absolute_sum_of_changes(x)/x.shape[-1]
benford_correlation(x)
cid_ce(x, normalize=True)
count_above_mean(x)/x.shape[-1]
count_below_mean(x)/x.shape[-1]
kurtosis(x) # already have?
longest_strike_above_mean(x)
longest_strike_below_mean(x)
mean_abs_change(x)
mean_second_derivative_central(x)
number_crossing_m(x, 0)/x.shape[-1] # can tune value
ratio_value_number_to_time_series_length(x) 
root_mean_square(x)
# sample_entropy(x) # Overlap?
skewness(x) # already have?
standard_deviation(x) # already have?
sum_of_reoccurring_data_points(x)/x.shape[-1]
sum_values(x)/x.shape[-1]
value_count(x, 0.)/x.shape[-1] # can tune value
variance_larger_than_standard_deviation(x) # bool
variation_coefficient(x)

In [None]:
X_test.head()

In [None]:
# Map the string column labels "x0", "x1", ... to their integer positions,
# one entry per column of X_test.
renaming = {f"x{col}": col for col in range(X_test.shape[1])}
renaming

In [None]:
X_test_renamed = X_test.rename(columns=renaming)

In [None]:
X_test_renamed

In [None]:
# Keep the id column plus the columns labelled 0..99 of every row.
X_test_renamed_truncated = X_test_renamed[['id', *range(100)]]
X_test_renamed_truncated

In [None]:
# Reshape the first 10 rows from wide (one column per sample) to long format with
# one (id, time, value) record per non-missing sample.
stacked_X_test = (
    X_test_renamed_truncated[:10]
    .set_index(['id'])
    .stack(dropna=True)
    .reset_index()
    .rename(columns={'level_1': 'time', 0: 'value'})
)

In [None]:
stacked_X_test

In [None]:
# Run tsfresh's extract_features on the long-format frame: rows are grouped by "id" and
# ordered by "time"; n_jobs=4 requests four worker processes.
extracted_features = extract_features(stacked_X_test, column_id="id", column_sort="time", n_jobs=4)

In [None]:
extracted_features

In [None]:
from tsfresh.feature_extraction.feature_calculators import \
absolute_sum_of_changes, absolute_maximum, \
agg_autocorrelation, approximate_entropy, \
benford_correlation, cid_ce, \
count_above_mean, count_below_mean, \
kurtosis, longest_strike_above_mean, \
longest_strike_below_mean, mean_abs_change, \
mean_second_derivative_central, number_crossing_m, \
percentage_of_reoccurring_datapoints_to_all_datapoints, \
percentage_of_reoccurring_values_to_all_values, \
ratio_value_number_to_time_series_length, root_mean_square, \
sample_entropy, skewness, standard_deviation, \
sum_of_reoccurring_data_points, sum_values, \
value_count, variance_larger_than_standard_deviation, \
variation_coefficient


In [None]:
# Take the first test row, drop its NaN entries and use it as the sample signal for the
# timing cells below. No dtype is forced here, unlike the float32 arrays that
# generate_features passes to get_features.
x = X_test.iloc[0].dropna().to_numpy()
x

In [None]:
%%time
absolute_maximum(x)

In [None]:
%%time
absolute_sum_of_changes(x)/x.shape[-1]

In [None]:
%%time
benford_correlation(x)

In [None]:
%%time
cid_ce(x, normalize=True)

In [None]:
%%time
count_above_mean(x)/x.shape[-1]

In [None]:
%%time
count_below_mean(x)/x.shape[-1]

In [None]:
%%time
kurtosis(x) # already have?

In [None]:
%%time
longest_strike_above_mean(x)

In [None]:
%%time
longest_strike_below_mean(x)

In [None]:
%%time
mean_abs_change(x)

In [None]:
%%time
mean_second_derivative_central(x)

In [None]:
%%time
number_crossing_m(x, 0)/x.shape[-1] # can tune value

In [None]:
# number_peaks(x, n) could be useful, but n would need to be tuned

In [None]:
# %%time
# percentage_of_reoccurring_datapoints_to_all_datapoints(x)
# # this one is expensive

In [None]:
# %%time
# percentage_of_reoccurring_values_to_all_values(x)

In [None]:
# permutation_entropy(x, tau, dimension) is interesting, but its parameters (tau, dimension) would need to be chosen

In [None]:
# quantile(x, q) determines the quantiles

In [None]:
# ratio_beyond_r_sigma: would need to determine r

In [None]:
%%time
ratio_value_number_to_time_series_length(x) 
# unique values / # values

In [None]:
%%time
root_mean_square(x)

In [None]:
%%time
sample_entropy(x) # Overlap?

In [None]:
%%time
skewness(x) # already have?

In [None]:
%%time
standard_deviation(x) # already have?

In [None]:
%%time
sum_of_reoccurring_data_points(x)/x.shape[-1]

In [None]:
# %%time
# sum_of_reoccurring_values(x)

In [None]:
%%time
sum_values(x)/x.shape[-1]

In [None]:
%%time
value_count(x, 0.)/x.shape[-1] # can tune value

In [None]:
%%time
variance_larger_than_standard_deviation(x) # bool

In [None]:
%%time
variation_coefficient(x)

In [None]:
# Fetch tsfresh's bundled robot-execution-failures example data so extract_features can
# be tried on it below.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
download_robot_execution_failures()

In [None]:
# NOTE(review): this rebinds x, which earlier cells use for a single signal array, to the
# example data; the timing cells above no longer work if re-run after this cell.
x, y = load_robot_execution_failures()
x[:30]

In [None]:
features = extract_features(x, column_id="id", column_sort="time")

In [None]:
features