In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor, Pool, cv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from feature_selector import FeatureSelector
import seaborn as sns
from matplotlib import pyplot as plt
# Default size applied to every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = (15, 12)
# IPython magic: draw figures inside the notebook output cells.
%matplotlib inline

from tsfresh.feature_extraction.extraction import _do_extraction_on_chunk
import json

import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
import json


In [3]:
# Columns of the first hit that are explicitly cast to float64 after loading.
first_hit_float_columns = [
    'Alignment length_1',
    'Number of mismatches_1',
    'Number of gap opens_1',
    'Start position in query_1',
    'End position in query_1',
    'Start position in target_1',
    'End position in target_1',
]

# Read the hit table and apply the float64 cast in one go.
df = pd.read_csv("All_prog_ident_tt.csv").astype(
    dict.fromkeys(first_hit_float_columns, 'float64')
)

# Discard every column whose name contains 'Target_'.
df = df.drop(columns=df.filter(regex='Target_').columns)

In [4]:
# Binary label: rows with Age <= 12 are tagged 'other', all remaining rows 'young'.
df['target'] = df.Age.apply( lambda x: 'other' if (x<=12) else 'young' )

# Feature-calculator settings keyed by trait name (looked up per trait during the
# tsfresh extraction). The context manager guarantees the file handle is closed,
# which the bare `json.load(open(...))` form did not.
with open("kind_to_fc_parameters.json") as fc_parameters_file:
    kind_to_fc_parameters = json.load(fc_parameters_file)

In [5]:
# Feature engineering over the per-hit columns (suffixes _1 ... _10).
#
# New columns are collected in dicts and attached with a single concat per stage
# instead of ~1000 individual `df[name] = ...` insertions, which fragment the
# frame and trigger pandas' PerformanceWarning. Column names, column order and
# values are the same as with one-by-one insertion.
#
# This cell must run on the freshly loaded frame: the final fillna removes every
# NaN, so running it a second time would turn 'Hits number' into all zeros.

N_HITS = 10            # number of hits stored per row
MISSING_VALUE = -999   # sentinel written in place of NaN / +-inf


def _append_columns(frame, columns):
    """Return `frame` with `columns` (name -> Series) appended in one concat.

    A column that already exists in `frame` is replaced by the new one.
    """
    block = pd.DataFrame(columns)
    stale = frame.columns.intersection(block.columns)
    if len(stale):
        frame = frame.drop(columns=stale)
    return pd.concat([frame, block], axis=1)


# --- Stage 1: per-hit lengths and ratios ------------------------------------
derived = {
    # NOTE(review): despite the name, this counts the E-value columns that are
    # NaN, i.e. the number of *missing* hits. Left unchanged because downstream
    # artefacts may depend on it -- confirm the intent.
    'Hits number': df.filter(regex='E-value').isna().sum(axis=1).astype('float64'),
}

for i in range(1, N_HITS + 1):
    query_len = df[f'End position in query_{i}'] - df[f'Start position in query_{i}']
    target_len = df[f'End position in target_{i}'] - df[f'Start position in target_{i}']
    mismatches = df[f'Number of mismatches_{i}']
    gap_opens = df[f'Number of gap opens_{i}']
    alignment_len = df[f'Alignment length_{i}']

    # Insertion order below defines the final column order; keep it stable.
    derived[f'Query len_{i}'] = query_len
    derived[f'Target len_{i}'] = target_len
    derived[f'Query Target lens ratio_{i}'] = query_len / target_len
    derived[f'Alignment len target len ratio_{i}'] = alignment_len / target_len
    derived[f'Alignment len query len ratio_{i}'] = alignment_len / query_len
    derived[f'Mismatches gaps ratio_{i}'] = mismatches / gap_opens
    derived[f'Target len mismatches ratio_{i}'] = mismatches / target_len
    derived[f'Query len mismatches ratio_{i}'] = mismatches / query_len
    derived[f'Target len gaps ratio_{i}'] = gap_opens / target_len
    derived[f'Query len gaps ratio_{i}'] = gap_opens / query_len
    derived[f'Alignment len mismatches ratio_{i}'] = mismatches / alignment_len
    derived[f'Alignment len gaps ratio_{i}'] = gap_opens / alignment_len

df = _append_columns(df, derived)
del derived  # release the per-column Series

# Trait base names: every column that ends with the last hit's suffix, suffix
# stripped. Taken before the pairwise columns exist, so none of those is included.
last_hit_suffix = f'_{N_HITS}'
traits = [
    name[:-len(last_hit_suffix)]
    for name in df.columns
    if name.endswith(last_hit_suffix)
]

# --- Stage 2: pairwise differences and ratios between hits i < j -------------
# Source column prefixes; the generated names use the same text with the first
# letter lower-cased (e.g. 'Percent identity' -> 'Delta percent identity_1_2').
DELTA_SOURCES = (
    'Percent identity',
    'E-value',
    'Alignment len mismatches ratio',
    'Alignment len gaps ratio',
    'Alignment length',
    'Target len',
    'Query len',
    'Bit score',
    'Number of mismatches',
    'Number of gap opens',
)

deltas = {}
for i in range(1, N_HITS + 1):
    for j in range(i + 1, N_HITS + 1):
        # All differences for the pair first, then all ratios.
        for source in DELTA_SOURCES:
            label = source[0].lower() + source[1:]
            deltas[f'Delta {label}_{i}_{j}'] = df[f'{source}_{i}'] - df[f'{source}_{j}']
        for source in DELTA_SOURCES:
            label = source[0].lower() + source[1:]
            deltas[f'Delta ratio {label}_{i}_{j}'] = df[f'{source}_{i}'] / df[f'{source}_{j}']

df = _append_columns(df, deltas)
del deltas

# Divisions by zero produce +-inf; fold them into NaN and replace every NaN
# (including hits that were missing in the input) with the sentinel.
df = df.replace([np.inf, -np.inf], np.nan).fillna(MISSING_VALUE)

In [7]:

#for key in traits:
def extraction_ts_traits(key,df):
    """Run the tsfresh feature calculators for one trait on every row of `df`.

    For each row, the values of the columns named ``<key>_<number>`` (taken in
    column order) are treated as one short series and passed to
    ``_do_extraction_on_chunk`` together with ``kind_to_fc_parameters[key]``.

    Parameters
    ----------
    key : str
        Trait base name, e.g. the part of a column name before ``_1``.
    df : pandas.DataFrame
        Frame holding the per-hit columns; other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per row of `df` (same index), one column per extracted
        feature, named by the ``'variable'`` entry returned by tsfresh.

    Raises
    ------
    KeyError
        If `key` has no entry in the module-level ``kind_to_fc_parameters``.
    """
    # Local import keeps the function self-contained when it is shipped to
    # joblib worker processes.
    import re

    print(f'Start extraction for {key}')

    # Raw string for the digit class (a plain '\d' is an invalid escape) and an
    # escaped key so characters such as '-' or '.' are matched literally. The
    # full match restricts the series to exactly `<key>_<number>` columns.
    hit_column = re.compile(re.escape(key) + r'_\d+')
    series_columns = [name for name in df.columns if hit_column.fullmatch(str(name))]
    fc_parameters = kind_to_fc_parameters[key]

    # Select the relevant columns once, then collect one record per row and
    # build the result in a single step instead of growing it cell by cell.
    records = []
    for index, row in df[series_columns].iterrows():
        chunk = (index, key, pd.Series(row.to_list()))
        records.append({
            d['variable']: d['value']
            for d in _do_extraction_on_chunk(chunk, fc_parameters, None)
        })

    print(f'Done extraction for {key}')
    if not records:
        return pd.DataFrame(index=df.index)
    return pd.DataFrame(records, index=df.index)

# One extraction task per trait, run on 12 worker processes.
# Each task receives only the columns whose name contains '<trait>_' rather than
# the whole frame: that still includes every column the extraction selects for
# the trait, but keeps the data serialised for each task small.
# The tqdm bar advances as tasks are handed to joblib, not as they finish.
processed_list = Parallel(n_jobs=12)(
    delayed(extraction_ts_traits)(trait, df.filter(like=f'{trait}_'))
    for trait in tqdm(traits)
)

100%|██████████| 22/22 [00:00<00:00, 114.51it/s]


In [8]:
# Join the engineered frame with every per-trait feature frame column-wise and
# write the result straight to disk (the joined frame is not kept in a variable).
pd.concat(
    [df] + list(processed_list),
    axis=1,
).to_csv("All_prog_ident_tsfresh.csv")

In [15]:
# (rows, columns) of the engineered frame before the tsfresh features are appended to it.
df.shape

(89604, 1125)

In [16]:
# Append the per-trait feature frames to `df` column-wise.
# Guarded so that re-running this cell does not append the same columns a second
# time: the concat is skipped when every feature column is already in `df`.
features_already_merged = all(
    features.columns.isin(df.columns).all() for features in processed_list
)
if not features_already_merged:
    df = pd.concat([df,*processed_list],axis=1)

In [17]:
# (rows, columns) after the per-trait feature frames were appended.
df.shape

(89604, 5325)

In [18]:
# Display the merged frame; the notebook shows a truncated preview of rows and columns.
df

Unnamed: 0,Id_gene,Age,Prog,Percent identity_1,Percent identity_2,Percent identity_3,Percent identity_4,Percent identity_5,Percent identity_6,Percent identity_7,...,Alignment len gaps ratio__mean_abs_change,Alignment len gaps ratio__mean_change,Alignment len gaps ratio__number_cwt_peaks__n_1,Alignment len gaps ratio__ratio_value_number_to_time_series_length,Alignment len gaps ratio__longest_strike_above_mean,Alignment len gaps ratio__first_location_of_maximum,Alignment len gaps ratio__c3__lag_2,Alignment len gaps ratio__c3__lag_3,Alignment len gaps ratio__has_duplicate,Alignment len gaps ratio__abs_energy
0,AT2G21390,0,UsearchLocal,97.000,96.300,96.700,96.200,95.900,93.600,93.700,...,0.000816,0.000091,2.0,0.8,4.0,0.3,2.645827e-08,2.992615e-08,True,9.720589e-05
1,AT5G65500,0,UsearchLocal,91.900,88.100,88.300,87.500,86.800,86.300,83.500,...,0.002435,0.000826,2.0,0.9,2.0,0.9,7.048606e-07,7.390973e-07,True,8.965455e-04
2,AT1G57720,0,UsearchLocal,96.400,96.100,95.600,93.500,93.000,93.600,88.900,...,0.001876,0.000267,2.0,0.5,4.0,0.7,0.000000e+00,2.105447e-08,True,1.337147e-04
3,AT5G08420,0,UsearchLocal,89.500,87.500,86.200,85.900,84.900,83.600,80.400,...,0.000860,-0.000274,2.0,0.7,3.0,0.3,1.736394e-07,1.526324e-07,True,3.212908e-04
4,AT1G23360,0,UsearchLocal,92.700,92.400,91.600,90.500,87.500,85.400,85.000,...,0.003753,-0.000424,3.0,0.6,3.0,0.8,1.427795e-07,5.497881e-08,True,3.548734e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89599,AT3G28853,17,BlastFast,76.106,50.862,47.414,43.089,42.149,43.802,42.149,...,0.006058,0.000949,2.0,0.8,4.0,0.5,1.016986e-05,8.445636e-06,True,4.597454e-03
89600,AT3G62499,17,BlastFast,70.667,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,...,111.002963,-111.002963,1.0,0.2,1.0,0.0,-8.308314e+08,-7.477456e+08,True,8.982009e+06
89601,AT4G03740,17,BlastFast,68.932,33.657,33.657,33.333,31.290,32.588,33.333,...,0.006871,0.000938,2.0,0.6,3.0,0.5,4.370946e-05,4.109845e-05,True,1.221296e-02
89602,AT3G04181,17,BlastFast,57.692,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,...,111.005342,-111.005342,1.0,0.2,1.0,0.0,-8.308278e+08,-7.477403e+08,True,8.982009e+06
