In [15]:
import numpy as np
import pandas as pd
import pickle

from matplotlib import pyplot as plt

from sklearn.datasets import make_classification
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

SEED = 42

In [118]:
df = pd.read_csv('data/Parameters_90%stability.csv')
df = df.drop(['Unnamed: 0'], axis = 1)
X_train = pd.read_csv('data/x_train.csv')
y_train = pd.read_csv('data/y_train.csv')
X_test = pd.read_csv('data/x_test.csv')
y_test = pd.read_csv('data/y_test.csv')

## Normalize data

In [119]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def normalize(X_train, X_test):
    scaler = StandardScaler()
#     scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

X_train, X_test = normalize(X_train, X_test)

# SelectKBest

In [80]:
select_class = SelectKBest(f_classif, k=20)
select_class.fit(X_train, y_train)
X_train_new = select_class.transform(X_train)
new_df = df.drop('Stability',axis=1).iloc[:,select_class.get_support()]
new_df.columns.values

array(['Gamma_ACACT1r', 'Gamma_ALCD2irm', 'Gamma_ANS', 'Gamma_ASPTA',
       'Gamma_ACN_a_m', 'Gamma_G6PDH2r', 'Gamma_GLUSx', 'Gamma_GF6PTA',
       'Gamma_IG3PS', 'Gamma_AKGDbm', 'Gamma_PHETA1', 'Gamma_PSERT',
       'Gamma_PHETRA', 'Gamma_CITtbm', 'Gamma_ASNt2r', 'Gamma_ACALDtm',
       'Gamma_HCO3E', 'Gamma_ALAtmi', 'Gamma_ACONTm', 'Gamma_CBPt'],
      dtype=object)

# Tree-based approach

In [97]:
def feature_importance(model, graph):
    model.fit(X_train, y_train)

    feats = {} # a dict to hold feature_name: feature_importance
    
    for feature, importance in zip(df.drop(columns='Stability').columns, model.feature_importances_):
        feats[feature] = importance #add the name/value pair 

    importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Feature Importance'})
    print(importances.sort_values(by='Feature Importance').tail(15))
    if(graph):
        importances.sort_values(by='Feature Importance').tail(15).plot(kind='bar', rot=45, figsize=(25, 25))
        plt.show()
        
    return importances.sort_values(by='Feature Importance').tail(15)

In [98]:
rf_import = feature_importance(RandomForestClassifier(), 0)

                          Feature Importance
Gamma_COAtrm                        0.006192
Gamma_SUCCt                         0.006220
Gamma_ICDHyr                        0.006247
Gamma_GF6PTA                        0.006472
Gamma_PHETRA                        0.006501
Gamma_CITtbm                        0.006631
Gamma_ACACT1r                       0.006889
Gamma_GLXt                          0.007051
Gamma_ALAtmi                        0.007393
Gamma_ACALDtm                       0.007558
Gamma_ASPTA                         0.007633
sigma_km_product2_GF6PTA            0.007891
Gamma_ALATA_L                       0.007895
Gamma_D_LACt2                       0.007916
Gamma_TPI                           0.014875


In [101]:
dct_import = feature_importance(DecisionTreeClassifier(), 0)

                                          Feature Importance
sigma_km_product2_ICDHym                            0.023683
sigma_km_product2_ASPTA                             0.023963
sigma_km_substrate2_PHETRA                          0.024248
Gamma_ACACT1r                                       0.024538
sigma_km_product2_ACOTAim                           0.025612
sigma_km_substrate44_LMPD_s_0450_c_1_256            0.027442
sigma_km_substrate34_LMPD_s_0450_c_1_256            0.029271
sigma_km_substrate1_CaCatA                          0.032523
sigma_km_product2_GLUDy                             0.036403
sigma_km_product2_ASPTAm                            0.046752
sigma_km_product2_SUCFUMtm                          0.063306
sigma_km_substrate1_AKGMALtm                        0.082716
sigma_km_substrate1_GUAPRT                          0.083145
sigma_km_product2_AKGMAL                            0.134397
Gamma_PYNP2r                                        0.239348


# XGBoost

In [102]:
from xgboost import XGBClassifier

xgb_import = feature_importance(XGBClassifier(eval_metric='logloss'), 0)

                                          Feature Importance
sigma_km_substrate1_HOMSYN2                         0.017848
Gamma_ACACT1r                                       0.018900
sigma_km_substrate10_LMPD_s_0450_c_1_256            0.018934
sigma_km_product2_KpAroY                            0.020015
sigma_km_product2_GLUDy                             0.021587
sigma_km_product2_PRFGS                             0.025397
sigma_km_product2_SUCFUMtm                          0.026566
sigma_km_product2_GF6PTA                            0.027705
sigma_km_product2_ASPTAm                            0.028773
sigma_km_substrate2_VALTA                           0.034284
sigma_km_product1_MGSA                              0.035212
sigma_km_substrate1_GUAPRT                          0.036452
Gamma_GLUDC                                         0.045234
sigma_km_product2_AKGMAL                            0.052332
sigma_km_substrate1_AKGMALtm                        0.055141


# Boruta

In [112]:
!pip install boruta

Collecting boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 178 kB/s eta 0:00:01
Installing collected packages: boruta
Successfully installed boruta-0.3


In [128]:
from boruta import BorutaPy

forest = RandomForestClassifier()
feat_selector = BorutaPy(forest, n_estimators='auto', 
                         verbose=2, random_state=SEED)
feat_selector.fit(X_train, y_train)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	1410
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	1410
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	1410
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	1410
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	1410
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	1410
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	1410
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	113
Rejected: 	1297
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	113
Rejected: 	1297
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	113
Rejected: 	1297
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	113
Rejected: 	1297
Iteration: 	12 / 100
Confirmed: 	1
Tentative: 	92
Rejected: 	1317
Iteration: 	13 / 100
Confirmed: 	1
Tentative: 	92
Rejected: 	1317
Iteration: 	14 / 100
Confirmed: 	1
Tentative: 	92
Rejected: 	1317
Iteration: 	15 / 100
Confirmed: 	1
Tentative: 	92
Rejected: 	1317
Iteration: 	16 / 100
C

BorutaPy(estimator=RandomForestClassifier(n_estimators=96,
                                          random_state=RandomState(MT19937) at 0x7FCB2FC4E840),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7FCB2FC4E840, verbose=2)

In [129]:
# check selected features
print(feat_selector.support_)

# check ranking of features
print(feat_selector.ranking_)

[False False False ... False False False]
[ 197 1049   42 ...  369 1239  738]


In [131]:
X_filtered = feat_selector.transform(X_train)

In [133]:
X_filtered.shape

(260, 15)

In [125]:
pd.DataFrame(X_train, columns=df.drop(columns='Stability').columns.values)

Unnamed: 0,Gamma_AATA,Gamma_IPPS,Gamma_IPPSm,Gamma_DHQTi,Gamma_ADCS,Gamma_ABTA,Gamma_PGL,Gamma_ACACT1r,Gamma_ACOAHim,Gamma_ACOTAim,...,sigma_km_substrate_ccm2tp,sigma_km_product_ccm2tp,sigma_km_substrate_pca2tp,sigma_km_product_pca2tp,sigma_km_substrate_r2073_1,sigma_km_product_r2073_1,sigma_km_substrate1_r_4235,sigma_km_product1_r_4235,sigma_km_substrate2_r_4235,sigma_km_product2_r_4235
0,-1.133820,-1.079643,-0.088203,-1.121501,1.303972,-0.074145,-0.552777,-2.134972,-0.088045,-0.997357,...,1.223981,1.390741,-1.263445,0.292577,-0.174575,0.924028,0.123265,1.035213,0.175300,0.189256
1,-0.223812,0.007211,-0.088202,0.090623,-0.102687,0.443082,0.706233,0.559350,-0.088045,-0.117979,...,-0.967788,1.074848,-1.012657,0.021411,1.409727,0.110508,-0.913076,-0.433772,1.216402,-1.008746
2,-0.223812,0.007211,-0.088202,0.090623,-0.102687,0.443082,0.706233,0.559350,-0.088045,-0.117979,...,0.320471,1.009759,-1.431400,0.263970,-1.023687,-1.392981,-1.450260,-0.878471,-1.482437,0.977004
3,-0.223812,0.007211,-0.088202,0.090623,-0.102687,0.443082,0.706233,0.559350,-0.088045,-0.117979,...,-0.489659,-1.568078,-1.522954,-1.487435,-0.261801,-0.548825,-0.346411,-0.068753,-0.374895,-0.047561
4,0.300158,1.229549,-0.088203,0.961261,0.162007,0.685943,0.210878,0.486490,-0.088045,0.151249,...,0.371038,1.504930,0.578729,1.850364,-1.708977,1.506691,0.431705,-0.527311,-1.473402,-1.139813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,0.300158,1.229549,-0.088203,0.961261,0.162007,0.685943,0.210878,0.486490,-0.088045,0.151249,...,-1.479453,1.549318,1.213857,-0.622300,-0.696322,-0.115852,0.037469,-0.123425,0.028872,-1.765856
256,-1.166453,-1.235684,-0.088160,-1.539918,-1.537759,-1.458458,-1.427044,0.288264,-0.088045,-1.316939,...,0.852746,1.477343,-0.846106,-0.793318,-0.544841,-1.196878,0.392946,-1.281849,0.320315,1.276811
257,0.300158,1.229549,-0.088203,0.961261,0.162007,0.685943,0.210878,0.486490,-0.088045,0.151249,...,0.139604,-0.146032,-0.103776,-1.444356,1.172392,0.708250,-0.977062,0.225511,1.734286,-1.165297
258,-1.047198,-1.013804,-0.088203,-0.925724,2.415687,-0.076230,-0.420712,-1.867333,-0.088045,-0.887228,...,-1.039935,-0.301828,1.574021,-0.172568,0.145409,-1.519413,0.499501,-0.799805,0.690481,-0.818955


In [105]:
rf_import.T.columns.values

array(['Gamma_COAtrm', 'Gamma_SUCCt', 'Gamma_ICDHyr', 'Gamma_GF6PTA',
       'Gamma_PHETRA', 'Gamma_CITtbm', 'Gamma_ACACT1r', 'Gamma_GLXt',
       'Gamma_ALAtmi', 'Gamma_ACALDtm', 'Gamma_ASPTA',
       'sigma_km_product2_GF6PTA', 'Gamma_ALATA_L', 'Gamma_D_LACt2',
       'Gamma_TPI'], dtype=object)

In [106]:
xgb_import.T.columns.values

array(['sigma_km_substrate1_HOMSYN2', 'Gamma_ACACT1r',
       'sigma_km_substrate10_LMPD_s_0450_c_1_256',
       'sigma_km_product2_KpAroY', 'sigma_km_product2_GLUDy',
       'sigma_km_product2_PRFGS', 'sigma_km_product2_SUCFUMtm',
       'sigma_km_product2_GF6PTA', 'sigma_km_product2_ASPTAm',
       'sigma_km_substrate2_VALTA', 'sigma_km_product1_MGSA',
       'sigma_km_substrate1_GUAPRT', 'Gamma_GLUDC',
       'sigma_km_product2_AKGMAL', 'sigma_km_substrate1_AKGMALtm'],
      dtype=object)

In [108]:
dct_import.T.columns.values

array(['sigma_km_product2_ICDHym', 'sigma_km_product2_ASPTA',
       'sigma_km_substrate2_PHETRA', 'Gamma_ACACT1r',
       'sigma_km_product2_ACOTAim',
       'sigma_km_substrate44_LMPD_s_0450_c_1_256',
       'sigma_km_substrate34_LMPD_s_0450_c_1_256',
       'sigma_km_substrate1_CaCatA', 'sigma_km_product2_GLUDy',
       'sigma_km_product2_ASPTAm', 'sigma_km_product2_SUCFUMtm',
       'sigma_km_substrate1_AKGMALtm', 'sigma_km_substrate1_GUAPRT',
       'sigma_km_product2_AKGMAL', 'Gamma_PYNP2r'], dtype=object)

In [110]:
x=rf_import.T.columns.values
y=xgb_import.T.columns.values
print(np.intersect1d(x, y)) 

['Gamma_ACACT1r' 'sigma_km_product2_AKGMAL' 'sigma_km_product2_ASPTAm'
 'sigma_km_product2_GLUDy' 'sigma_km_product2_SUCFUMtm'
 'sigma_km_substrate1_AKGMALtm' 'sigma_km_substrate1_GUAPRT']


In [109]:
x=rf_import.T.columns.values
y=
print(np.intersect1d(x, y)) 

['Gamma_ACACT1r']


In [111]:
x=dct_import.T.columns.values
y=xgb_import.T.columns.values
print(np.intersect1d(x, y)) 

['Gamma_ACACT1r' 'sigma_km_product2_AKGMAL' 'sigma_km_product2_ASPTAm'
 'sigma_km_product2_GLUDy' 'sigma_km_product2_SUCFUMtm'
 'sigma_km_substrate1_AKGMALtm' 'sigma_km_substrate1_GUAPRT']
