In [2]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tsfresh.feature_extraction.extraction import _do_extraction_on_chunk
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

In [3]:
# Input table, read from a path relative to the notebook's working directory.
DATA_CSV = "All_prog_ident_tt.csv"
df = pd.read_csv(DATA_CSV)
# Feature-extraction settings, keyed by column-name stem (the part before the
# trailing "_<n>"). Each inner dict maps a tsfresh feature-calculator name to
# either None (the calculator is called with the series only) or a list of
# keyword dictionaries (one feature is produced per dictionary).
# Key order is significant: it fixes both the order and the names of the
# generated feature columns.
params = {
    'Target len mismatches ratio': {
        'maximum': None,
        'skewness': None,
        'agg_linear_trend': [
            {'f_agg': 'max', 'chunk_len': 5, 'attr': 'intercept'},
        ],
    },
    'Query Target lens ratio': {
        'agg_linear_trend': [
            {'f_agg': 'max', 'chunk_len': 5, 'attr': 'intercept'},
        ],
    },
    'Percent identity': {
        'energy_ratio_by_chunks': [
            {'num_segments': 10, 'segment_focus': 0},
        ],
        'cwt_coefficients': [
            {'widths': [2, 5, 10, 20], 'coeff': 0, 'w': 5},
        ],
        'change_quantiles': [
            {'f_agg': 'mean', 'isabs': True, 'qh': 1.0, 'ql': 0.8},
        ],
    },
    'Alignment len target len ratio': {
        'maximum': None,
        'agg_linear_trend': [
            {'f_agg': 'max', 'chunk_len': 5, 'attr': 'intercept'},
        ],
    },
    'Alignment len query len ratio': {
        'maximum': None,
        'agg_linear_trend': [
            {'f_agg': 'max', 'chunk_len': 5, 'attr': 'intercept'},
        ],
    },
    'End position in target': {
        'agg_linear_trend': [
            {'f_agg': 'max', 'chunk_len': 5, 'attr': 'intercept'},
        ],
        'quantile': [
            {'q': 0.9},
        ],
        'maximum': None,
    },
    'Query len mismatches ratio': {
        'fft_aggregated': [
            {'aggtype': 'kurtosis'},
        ],
    },
    'Bit score': {
        'energy_ratio_by_chunks': [
            {'num_segments': 10, 'segment_focus': 0},
        ],
        'fft_aggregated': [
            {'aggtype': 'centroid'},
            {'aggtype': 'variance'},
        ],
    },
}

# Column names of interest: a mix of derived features (named
# "<stem>__<calculator>[__<key>_<value>...]") and raw per-position columns
# (named "<stem>_<n>").
# NOTE(review): not referenced by the cells visible here; presumably the set of
# model input columns -- confirm against the rest of the notebook.
feats = [
    'Target len mismatches ratio__maximum',
    'Query Target lens ratio__agg_linear_trend__f_agg_"max"__chunk_len_5__attr_"intercept"',
    'Percent identity__energy_ratio_by_chunks__num_segments_10__segment_focus_0',
    'Target len mismatches ratio__skewness',
    'Percent identity_3',
    'Alignment len target len ratio__maximum',
    'Percent identity_10',
    'Alignment len query len ratio__maximum',
    'Alignment len target len ratio_1',
    'End position in target__agg_linear_trend__f_agg_"max"__chunk_len_5__attr_"intercept"',
    'Percent identity_4',
    'Query Target lens ratio_1',
    'Alignment len query len ratio__agg_linear_trend__f_agg_"max"__chunk_len_5__attr_"intercept"',
    'End position in target__quantile__q_0.9',
    'Alignment len target len ratio__agg_linear_trend__f_agg_"max"__chunk_len_5__attr_"intercept"',
    'Percent identity__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_0__w_5',
    'End position in target__maximum',
    'Percent identity__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.8',
    'Bit score_10',
    'Query len mismatches ratio__fft_aggregated__aggtype_"kurtosis"',
    'Target len mismatches ratio__agg_linear_trend__f_agg_"max"__chunk_len_5__attr_"intercept"',
    'Bit score__energy_ratio_by_chunks__num_segments_10__segment_focus_0',
    'Percent identity_8',
    'Bit score__fft_aggregated__aggtype_"centroid"',
    'Bit score__fft_aggregated__aggtype_"variance"',
    'Percent identity_7',
    'Target len mismatches ratio_1',
]

# For each of the 10 numbered positions, derive span lengths and ratio columns
# from the raw coordinate / alignment columns of the same position.
for pos in range(1, 11):
    query_len_col = f'Query len_{pos}'
    target_len_col = f'Target len_{pos}'

    # Span lengths are taken as end coordinate minus start coordinate.
    df[query_len_col] = df[f'End position in query_{pos}'] - df[f'Start position in query_{pos}']
    df[target_len_col] = df[f'End position in target_{pos}'] - df[f'Start position in target_{pos}']
    df[f'Query Target lens ratio_{pos}'] = df[query_len_col] / df[target_len_col]

    # Alignment length relative to each span length.
    alignment_len = df[f'Alignment length_{pos}']
    df[f'Alignment len target len ratio_{pos}'] = alignment_len / df[target_len_col]
    df[f'Alignment len query len ratio_{pos}'] = alignment_len / df[query_len_col]

    # Mismatch count relative to each span length.
    mismatches = df[f'Number of mismatches_{pos}']
    df[f'Target len mismatches ratio_{pos}'] = mismatches / df[target_len_col]
    df[f'Query len mismatches ratio_{pos}'] = mismatches / df[query_len_col]

# Turn infinite values (e.g. from a zero-length denominator) into NaN, then
# replace every NaN in the whole frame with the sentinel -999.
df = (
    df.replace([np.inf, -np.inf], np.nan)
      .fillna(-999)
)

In [4]:
%%time
# Fetch the tsfresh feature-calculator module from the interpreter's module
# cache; calculators are looked up on it by name with getattr() below.
# NOTE(review): this only works if an earlier import has already loaded the
# submodule (presumably the tsfresh.feature_extraction.extraction import does
# so) -- confirm, or import the module explicitly.
feature_calculators_module = sys.modules['tsfresh.feature_extraction.feature_calculators']
def extraction_ts_traits(col_name,df,params):
    """Compute the configured tsfresh features for one family of columns.

    Each row of ``df`` is treated as one short series made of the columns whose
    name matches ``<col_name>_<digits>``, in their existing column order.
    Calculators are resolved by name on the module-level
    ``feature_calculators_module``.

    Parameters
    ----------
    col_name : str
        Column-name stem; also the key looked up in ``params``.
    df : pandas.DataFrame
        Frame holding the ``<col_name>_<n>`` columns. It is not modified.
    params : dict
        ``{col_name: {calculator_name: None | [kwargs_dict, ...]}}``.
        ``None`` calls the calculator with the series only; otherwise one
        feature is produced per kwargs dictionary.

    Returns
    -------
    pandas.DataFrame
        One column per feature, named
        ``<col_name>__<calculator>[__<key>_<value>...]``; string values are
        wrapped in double quotes and lists are rendered as ``(a, b, ...)``.

    Raises
    ------
    KeyError
        If ``col_name`` is not a key of ``params``.
    AttributeError
        If a calculator name does not exist on the calculator module.
    """
    calculators = params[col_name]

    # Raw f-string: "\d" inside a normal string literal is an invalid escape
    # sequence. The selected columns do not depend on the calculator, so the
    # per-row series frame is built once instead of once per feature.
    series_frame = df[list(df.filter(regex=rf'{col_name}_\d+'))]

    res = pd.DataFrame()
    for func_name, func_params in calculators.items():
        trait_name = f'{col_name}__{func_name}'
        func = getattr(feature_calculators_module, func_name)

        if func_params is None:
            # Parameter-free calculator: apply it to every row as-is.
            res[trait_name] = series_frame.apply(func,raw=True,axis=1)
            continue

        for func_param_dict in func_params:
            # Build the "__key_value" suffix in the order the keys were given.
            trait_name_postfix = ''
            for key, val in func_param_dict.items():
                if isinstance(val, str):
                    val = f'"{val}"'
                elif isinstance(val, list):
                    val = "({})".format(', '.join(str(e) for e in val))
                trait_name_postfix += f'__{key}_{val}'

            if func.fctype=='simple':
                def row_value(x):
                    # "simple" calculators take the parameters as keywords.
                    return func(x,**func_param_dict)
            else:
                def row_value(x):
                    # Other calculators take a list of parameter dicts and
                    # yield (name, value) pairs; a single dict is passed, so
                    # exactly one pair is expected and its value is kept.
                    (_, value), = func(x,[func_param_dict])
                    return value

            res[trait_name+trait_name_postfix] = series_frame.apply(row_value,raw=True,axis=1)
    return res

# One extraction task per key of `params`, executed on 8 joblib workers.
# tqdm wraps the key iterator that feeds the task generator, so the bar
# advances as tasks are handed to joblib (the recorded output shows it
# completing almost immediately while the cell runs for much longer).
jobs = (
    delayed(extraction_ts_traits)(col_name, df, params)
    for col_name in tqdm(params.keys())
)
processed_list = Parallel(n_jobs=8)(jobs)

# Append the returned per-stem feature frames column-wise to the input frame.
df = pd.concat([df] + list(processed_list), axis=1)

100%|██████████| 8/8 [00:00<00:00, 93.15it/s]


CPU times: user 695 ms, sys: 217 ms, total: 912 ms
Wall time: 58.5 s


In [None]:
%%time
# Single-process variant of the feature extraction: for every column-name stem
# in `params`, apply each configured tsfresh calculator row by row and store
# the result directly as a new column of `df`.
feature_calculators_module = sys.modules['tsfresh.feature_extraction.feature_calculators']
for col_name in params:
    # Raw f-string: "\d" inside a normal string literal is an invalid escape
    # sequence. The matching columns are the same for every calculator of this
    # stem, so the per-row series frame is selected once per stem.
    series_frame = df[list(df.filter(regex=rf'{col_name}_\d+'))]

    for func_name, func_params in params[col_name].items():
        trait_name = f'{col_name}__{func_name}'
        func = getattr(feature_calculators_module, func_name)

        if func_params is None:
            # Parameter-free calculator: apply it to every row as-is.
            df[trait_name] = series_frame.apply(func,raw=True,axis=1)
            continue

        for func_param_dict in func_params:
            # Build the "__key_value" suffix in the order the keys were given:
            # strings are double-quoted, lists rendered as "(a, b, ...)".
            trait_name_postfix = ''
            for key, val in func_param_dict.items():
                if isinstance(val, str):
                    val = f'"{val}"'
                elif isinstance(val, list):
                    val = "({})".format(', '.join(str(e) for e in val))
                trait_name_postfix += f'__{key}_{val}'

            if func.fctype=='simple':
                # "simple" calculators take the parameters as keywords.
                df[trait_name+trait_name_postfix] = series_frame.apply(
                    lambda x: func(x,**func_param_dict),
                    raw=True,axis=1)
            else:
                # Other calculators take a list of parameter dicts and yield
                # (name, value) pairs; a single dict is passed, so the value
                # of the only pair is kept.
                df[trait_name+trait_name_postfix] = series_frame.apply(
                    lambda x: list(*func(x,[func_param_dict]))[1],
                    raw=True,axis=1)

In [None]:
%%time
def extraction_ts_traits(key,df,kind_to_fc_parameters):
    """Extract features for one column family via tsfresh's chunk extractor.

    For every row of ``df`` the values of the columns whose label matches
    ``<key>_<digits>`` are wrapped in a fresh ``pandas.Series`` and passed, as
    an ``(index, key, series)`` chunk, to ``_do_extraction_on_chunk`` together
    with ``kind_to_fc_parameters[key]``. Each returned item is stored in the
    result frame at ``[row index, item['variable']]``.

    NOTE(review): ``_do_extraction_on_chunk`` is a private tsfresh function;
    its signature and return format may differ between tsfresh versions --
    confirm against the installed version.

    Parameters
    ----------
    key : str
        Column-name stem; also the key looked up in ``kind_to_fc_parameters``.
    df : pandas.DataFrame
        Frame holding the ``<key>_<n>`` columns.
    kind_to_fc_parameters : dict
        Mapping from column-name stem to the calculator settings for it.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like the processed rows, one column per extracted
        feature name.
    """
    print(f'Start extraction for {key}')
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence. The pattern is the same for every row, so build it once.
    column_pattern = key + r'_\d+'
    res = pd.DataFrame()
    for index, row in df.iterrows():
        row_series = pd.Series(row.filter(regex=column_pattern).to_list())
        for d in _do_extraction_on_chunk(
            (index, key, row_series),
            kind_to_fc_parameters[key],None):
            # Cell-by-cell assignment is kept on purpose: it is the variant
            # whose timing this notebook records.
            res.at[index,d['variable']] = d['value']
    print(f'Done extraction for {key}')
    return res

# One extraction task per key of `params`, executed on 8 joblib workers; tqdm
# wraps the key iterator that feeds the task generator.
jobs = (
    delayed(extraction_ts_traits)(col_name, df, params)
    for col_name in tqdm(params.keys())
)
processed_list = Parallel(n_jobs=8)(jobs)

# Append the returned per-stem feature frames column-wise to the input frame.
df = pd.concat([df] + list(processed_list), axis=1)

### Timing results for the tsfresh feature calculators

#### 1) getattr + Parallel with n_jobs=8
```
CPU times: user 709 ms, sys: 193 ms, total: 903 ms
Wall time: 57.4 s
```

#### 2) getattr in a single sequential loop
```
CPU times: user 2min 4s, sys: 1 s, total: 2min 5s
Wall time: 2min 5s
```

#### 3) `_do_extraction_on_chunk` with `df.at` + Parallel with n_jobs=8
```
CPU times: user 697 ms, sys: 270 ms, total: 967 ms
Wall time: 3min 20s
```