In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tsfresh
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters
from tsfresh.feature_extraction.extraction import _do_extraction_on_chunk, generate_data_chunk_format
from tsfresh import extract_features, extract_relevant_features
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.utilities.dataframe_functions import impute
import json
from sklearn.utils import shuffle


In [2]:
# Load the alignment table and key it by gene identifier.
df = pd.read_csv("All_prog_ident_tt.csv").set_index(['Id_gene'])

# Cast the first hit's alignment statistics to float64.
first_hit_float_columns = [
    'Alignment length_1',
    'Number of mismatches_1',
    'Number of gap opens_1',
    'Start position in query_1',
    'End position in query_1',
    'Start position in target_1',
    'End position in target_1',
]
df = df.astype({column: 'float64' for column in first_hit_float_columns})

# Discard every column whose name contains 'Target_'.
df = df.drop(columns=df.filter(regex='Target_').columns)

# Binary label: 0 when Age <= 12, otherwise 1. A missing Age also yields 1,
# because the `<=` comparison is False for NaN.
df['target'] = (~df['Age'].le(12)).astype('int64')

In [3]:
# Derive length and ratio features for each of the ten hit slots (suffix _1 .. _10).
for hit in range(1, 11):
    # Spans are plain differences between end and start coordinates.
    query_span = df[f'End position in query_{hit}'] - df[f'Start position in query_{hit}']
    target_span = df[f'End position in target_{hit}'] - df[f'Start position in target_{hit}']

    mismatches = df[f'Number of mismatches_{hit}']
    gap_opens = df[f'Number of gap opens_{hit}']
    alignment_len = df[f'Alignment length_{hit}']

    # Insertion order is kept as-is: it determines the column order of df,
    # which later cells rely on when listing the '_10' columns.
    derived = {
        'Query len': query_span,
        'Target len': target_span,
        'Query Target lens ratio': query_span / target_span,
        'Alignment len target len ratio': alignment_len / target_span,
        'Alignment len query len ratio': alignment_len / query_span,
        'Mismatches gaps ratio': mismatches / gap_opens,
        # The next six are count-over-length ratios (numerator is the count).
        'Target len mismatches ratio': mismatches / target_span,
        'Query len mismatches ratio': mismatches / query_span,
        'Target len gaps ratio': gap_opens / target_span,
        'Query len gaps ratio': gap_opens / query_span,
        'Alignment len mismatches ratio': mismatches / alignment_len,
        'Alignment len gaps ratio': gap_opens / alignment_len,
    }
    for label, values in derived.items():
        df[f'{label}_{hit}'] = values


In [4]:
# Replace +/-inf (e.g. from ratio columns whose denominator is zero) and
# missing values with the sentinel -999 across the whole frame.
df = df.replace([np.inf, -np.inf], np.nan).fillna(-999)

# Sanity check: show any rows that still hold NaN/inf (expected to be empty).
# `axis` is passed by keyword because pandas >= 2.0 no longer accepts it positionally.
df[df.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

Unnamed: 0_level_0,Age,Prog,Percent identity_1,Percent identity_2,Percent identity_3,Percent identity_4,Percent identity_5,Percent identity_6,Percent identity_7,Percent identity_8,...,Query Target lens ratio_10,Alignment len target len ratio_10,Alignment len query len ratio_10,Mismatches gaps ratio_10,Target len mismatches ratio_10,Query len mismatches ratio_10,Target len gaps ratio_10,Query len gaps ratio_10,Alignment len mismatches ratio_10,Alignment len gaps ratio_10
Id_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [6]:
# Split on the rows produced by the 'Blast' program, stratified by the label.
# test_size=0.93 leaves only 7% of those rows for training.
df_blast = df.loc[df['Prog'] == 'Blast']
train_split, test_split = train_test_split(
    df_blast,
    test_size=0.93,
    random_state=42,
    stratify=df_blast['target'],
)

# Rows are re-selected from the full `df` by index label, then the index is
# flattened so each row gets a positional integer id.
# NOTE(review): if an Id_gene label occurs under several Prog values, this
# lookup pulls in all of those rows, not only the Blast ones — confirm intended.
train_df = df.loc[train_split.index].reset_index()
test_df = df.loc[test_split.index].reset_index()

# Labels for the training rows, aligned with train_df's positional index.
y = train_df['target']

In [7]:
# Trait names are the '_10' columns with that suffix stripped. The regex is
# anchored so that only names *ending* in '_10' are picked up; an unanchored
# match would also accept names that merely contain '_10' and then strip the
# wrong three characters.
traits = [column[:-len('_10')] for column in df.filter(regex=r'_10$').columns]

# Reshape train_df from wide (one column per trait and hit) to long format:
# one row per (training row, hit) with columns id, time, <traits...>.
# Working on whole column blocks per hit avoids a Python-level loop over rows.
hit_frames = []
for hit in range(1, 11):
    wide_to_trait = {f'{trait}_{hit}': trait for trait in traits}
    hit_frame = train_df[list(wide_to_trait)].rename(columns=wide_to_trait)
    hit_frame.insert(0, 'time', hit - 1)          # zero-based position of the hit
    hit_frame.insert(0, 'id', train_df.index)     # positional id, matches y's index
    hit_frames.append(hit_frame)

# Order rows by id, then time, i.e. all ten hits of a sample are contiguous.
timeseries = (
    pd.concat(hit_frames, ignore_index=True)
    .sort_values(['id', 'time'])
    .reset_index(drop=True)
)

In [8]:
# Sanity check: the long-format table holds 10 rows (hits 1..10) per training
# row, so its row count should be 10 * len(y).
timeseries.shape, y.shape

((62740, 24), (6274,))

In [9]:
# Build the feature-calculator settings from the class on the tsfresh module
# rather than from the notebook-level name, so this cell can be re-run safely
# (the name below is rebound to an instance, which is not callable).
efficient_fc_parameters = tsfresh.feature_extraction.settings.EfficientFCParameters()

# Exclude the 'binned_entropy' calculator; pop() tolerates it being absent.
efficient_fc_parameters.pop('binned_entropy', None)

# Later cells read the settings through this name, so keep it bound to the instance.
EfficientFCParameters = efficient_fc_parameters

In [13]:
# Extract features from the long-format series and keep those judged relevant
# for the binary target, using the 'mann' (Mann-Whitney U) test for real-valued features.
mann_options = dict(
    column_id='id',
    column_sort='time',
    default_fc_parameters=EfficientFCParameters,
    ml_task='classification',
    test_for_binary_target_real_feature='mann',
    fdr_level=0.001,
    chunksize=100,
    n_jobs=12,
)
X_mann = extract_relevant_features(timeseries, y, **mann_options)

Feature Extraction: 100%|██████████| 60/60 [04:57<00:00,  4.96s/it]


In [14]:
# Same extraction and selection as for the 'mann' run, but real-valued features
# are tested with 'smir' (Kolmogorov-Smirnov) instead.
smir_options = dict(
    column_id='id',
    column_sort='time',
    default_fc_parameters=EfficientFCParameters,
    ml_task='classification',
    test_for_binary_target_real_feature='smir',
    fdr_level=0.001,
    chunksize=100,
    n_jobs=12,
)
X_smir = extract_relevant_features(timeseries, y, **smir_options)


Feature Extraction: 100%|██████████| 60/60 [05:02<00:00,  5.04s/it]


In [15]:
# Features selected by BOTH relevance runs. The intersection is taken in
# X_mann's column order so the result is identical on every kernel run;
# converting a set to a list gives an order that depends on string hashing.
smir_columns = set(X_smir.columns)
feats = [column for column in X_mann.columns if column in smir_columns]
len(feats), X_mann.shape, X_smir.shape

(4200, (6274, 4319), (6274, 5275))

In [16]:
# Keep only the columns listed in `feats` (features present in both X_mann and
# X_smir), taking the values from X_mann.
X = X_mann[feats]

In [17]:
# Derive the tsfresh settings mapping from the selected feature column names.
kind_to_fc_parameters = tsfresh.feature_extraction.settings.from_columns(X)

# Serialize the settings to JSON; the context manager guarantees the file is
# flushed and closed even if serialization fails.
with open("kind_to_fc_parameters.json", 'w') as settings_file:
    json.dump(kind_to_fc_parameters, settings_file)