In [1]:
import numpy as np
import pandas as pd

In [2]:
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
# Feature-calculator parameter objects, built once and reused: each is passed as
# `default_fc_parameters` to `extract_features` by the extractor functions below.
com_setting = ComprehensiveFCParameters()
min_setting = MinimalFCParameters()

In [3]:
def feature_extractor_comprehensive(file, n_jobs=14):
    """Extract tsfresh features from a long-format CSV using ``com_setting``.

    The file is read with ``pd.read_csv``, the columns listed in
    ``non_series_columns`` are removed, and the remaining frame is handed to
    ``extract_features`` with ``id`` as the series id, ``relyear`` as the sort
    key, ``test`` as the kind column and ``value`` as the value column.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, forwarded unchanged to ``pd.read_csv``.
    n_jobs : int, default 14
        Forwarded to ``extract_features`` as ``n_jobs``. The default keeps the
        value that used to be hard-coded; lower it on machines with fewer cores.

    Returns
    -------
    The object returned by ``extract_features``.

    Raises
    ------
    KeyError
        If any column in ``non_series_columns`` is absent from the file
        (``DataFrame.drop`` raises on missing labels by default).
    """
    non_series_columns = [
        'htn', 'dkd', 'gn', 'gender', 'value_egfr_init', 'egfr.y', 'eskd.y',
        'age.init', 'death.y',
        'cat0.5', 'cat1', 'cat1.5', 'cat2', 'cat2.5', 'cat3', 'cat3.5', 'cat4',
        'cat4.5', 'cat5', 'cat5.5', 'cat6', 'cat6.5', 'cat7', 'cat7.5', 'cat8',
    ]
    raw = pd.read_csv(file)
    # drop() already returns a new frame, so no extra .copy() is needed.
    long_frame = raw.drop(columns=non_series_columns)
    return extract_features(
        long_frame,
        column_id='id',
        column_sort='relyear',
        column_kind='test',
        column_value='value',
        default_fc_parameters=com_setting,
        n_jobs=n_jobs,
    )

In [4]:
def feature_extractor_minimal(file, n_jobs=14):
    """Extract tsfresh features from a long-format CSV using ``min_setting``.

    The file is read with ``pd.read_csv``, the columns listed in
    ``non_series_columns`` are removed, and the remaining frame is handed to
    ``extract_features`` with ``id`` as the series id, ``relyear`` as the sort
    key, ``test`` as the kind column and ``value`` as the value column.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, forwarded unchanged to ``pd.read_csv``.
    n_jobs : int, default 14
        Forwarded to ``extract_features`` as ``n_jobs``. The default keeps the
        value that used to be hard-coded; lower it on machines with fewer cores.

    Returns
    -------
    The object returned by ``extract_features``.

    Raises
    ------
    KeyError
        If any column in ``non_series_columns`` is absent from the file
        (``DataFrame.drop`` raises on missing labels by default).
    """
    non_series_columns = [
        'htn', 'dkd', 'gn', 'gender', 'value_egfr_init', 'egfr.y', 'eskd.y',
        'age.init', 'death.y',
        'cat0.5', 'cat1', 'cat1.5', 'cat2', 'cat2.5', 'cat3', 'cat3.5', 'cat4',
        'cat4.5', 'cat5', 'cat5.5', 'cat6', 'cat6.5', 'cat7', 'cat7.5', 'cat8',
    ]
    raw = pd.read_csv(file)
    # drop() already returns a new frame, so no extra .copy() is needed.
    long_frame = raw.drop(columns=non_series_columns)
    return extract_features(
        long_frame,
        column_id='id',
        column_sort='relyear',
        column_kind='test',
        column_value='value',
        default_fc_parameters=min_setting,
        n_jobs=n_jobs,
    )

In [5]:
# Run the comprehensive extractor on the eGFR file; the resulting columns are
# prefixed `egfr__` (see the head() output further down).
extracted_features = feature_extractor_comprehensive(file='../data/egfr_meta_melted_1_4.gz')

Feature Extraction: 100%|██████████| 70/70 [00:30<00:00,  2.27it/s]


In [6]:
# (rows, feature columns) produced by the comprehensive extraction.
extracted_features.shape

(5258, 787)

In [9]:
# Preview the first rows; many comprehensive features are NaN in this sample.
extracted_features.head()

Unnamed: 0,egfr__variance_larger_than_standard_deviation,egfr__has_duplicate_max,egfr__has_duplicate_min,egfr__has_duplicate,egfr__sum_values,egfr__abs_energy,egfr__mean_abs_change,egfr__mean_change,egfr__mean_second_derivative_central,egfr__median,...,egfr__permutation_entropy__dimension_5__tau_1,egfr__permutation_entropy__dimension_6__tau_1,egfr__permutation_entropy__dimension_7__tau_1,egfr__query_similarity_count__query_None__threshold_0.0,"egfr__matrix_profile__feature_""min""__threshold_0.98","egfr__matrix_profile__feature_""max""__threshold_0.98","egfr__matrix_profile__feature_""mean""__threshold_0.98","egfr__matrix_profile__feature_""median""__threshold_0.98","egfr__matrix_profile__feature_""25""__threshold_0.98","egfr__matrix_profile__feature_""75""__threshold_0.98"
3916,1.0,0.0,0.0,0.0,113.73,6523.7549,10.63,-10.63,,56.865,...,,,,,,,,,,
3918,0.0,0.0,0.0,0.0,52.81,2788.8961,,,,52.81,...,,,,,,,,,,
3924,1.0,0.0,0.0,0.0,151.32,11451.7994,2.42,-2.42,,75.66,...,,,,,,,,,,
3930,1.0,0.0,0.0,0.0,256.0,11206.0,7.0,0.6,-2.125,43.0,...,0.693147,-0.0,,,,,,,,
3936,1.0,0.0,0.0,0.0,98.23,3231.8929,4.765,0.435,-4.765,31.59,...,,,,,,,,,,


In [7]:
# Run the minimal extractor on the second input file (`other_meta_melted_1_4.gz`).
extracted_features_other = feature_extractor_minimal(file='../data/other_meta_melted_1_4.gz')

Feature Extraction: 100%|██████████| 70/70 [02:25<00:00,  2.07s/it]


In [8]:
# (rows, feature columns) produced by the minimal extraction.
extracted_features_other.shape

(5479, 549)

In [10]:
# Place the two feature tables side by side, aligned on their index.
# The join is an outer join (pandas' default, spelled out here): an index value
# present in only one table still gets a row, with NaN in the other table's
# columns. In the recorded run this is why the row count (5603) exceeds both
# inputs (5258 and 5479).
ready_df = pd.concat(
    [extracted_features, extracted_features_other],
    axis="columns",
    join="outer",
)
ready_df.shape

(5603, 1336)

In [11]:
# Persist the combined table as gzip-compressed CSV. index=True writes the frame's
# index as the first column (presumably the series ids - confirm before reloading).
ready_df.to_csv('../data/egfrcom_elsemin_meta_melted_1_4.gz', index=True, compression='gzip')