In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fixed seed, passed as random_state to train_test_split so the split is reproducible.
SEED = 42

In [2]:
# ---------------------------------------------------------------------------
# LOAD FILES
# ---------------------------------------------------------------------------

# Read the parameter table and discard its 'Unnamed: 0' column
# (presumably a leftover index column from an earlier export; verify).
df = pd.read_csv('../data/Parameters_90%stability.csv')
df = df.drop(columns=['Unnamed: 0'])

# Target is the 'Stability' column; every other column is a feature.
y = df['Stability']
X = df.drop(columns=['Stability'])

# 65/35 split, stratified on the target so both parts keep its class balance.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.35,
    stratify=y,
    random_state=SEED,
)

# Keep the targets as single-column DataFrames; later cells index them
# by the 'Stability' column name.
y_train = y_train.to_frame()
y_test = y_test.to_frame()

feature_names = x_train.columns.values
class_names = y_train['Stability'].unique().astype(str)

# Fit the scaler on the training part only, then apply the same transform to
# the test part. Lower-case x_* hold the raw values, upper-case X_* the scaled
# ones; both keep the original row labels and column names.
scaler = StandardScaler()

X_train = pd.DataFrame(
    scaler.fit_transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)

In [3]:
def main(index_GP):
    """Print the stability index of a rule-selected subset and of the whole test set.

    The stability index (SI) is the percentage of rows whose 'Stability'
    label equals 1.

    Parameters
    ----------
    index_GP : index-like
        Row labels of ``y_test`` selected by one of the rule functions.

    Returns
    -------
    None
        The results are only printed.

    Notes
    -----
    Reads the module-level ``y_test`` DataFrame. A subset with no row
    labelled 1 reports SI = 0.0, and an empty selection is reported as
    undefined instead of raising.
    """
    n_selected = len(index_GP)
    print("Number of rows following these rules:", n_selected)

    if n_selected == 0:
        # Nothing matched the rules: a percentage over zero rows has no value.
        print("The Stability Index on VALIDATION SET (sampled from TEST SET) is undefined: no rows follow these rules")
    else:
        y_val = y_test.loc[index_GP]
        # Count matches with a boolean mask: value_counts()[1] raises KeyError
        # when the subset contains no row labelled 1.
        n_stable = int((y_val['Stability'] == 1).sum())
        SI_val = n_stable / len(y_val) * 100
        print("The Stability Index on VALIDATION SET (sampled from TEST SET) is: SI =", round(SI_val, 4), "%")

    SI_test = int((y_test['Stability'] == 1).sum()) / len(y_test) * 100
    print("The Stability Index on TEST SET is: SI =", round(SI_test, 4), "%")



def catboost():
    """Return the index labels of the X_test rows that satisfy all five threshold rules.

    Reads the module-level ``X_test`` DataFrame; the thresholds are compared
    directly against the values stored in it.
    """
    rule_mask = (
        X_test['Gamma_GLUDC'].le(-1.277)
        & X_test['sigma_km_product1_ICDHxm'].gt(0.056)
        & X_test['sigma_km_product2_GS'].gt(-0.802)
        & X_test['sigma_km_substrate2_ILETAm'].le(1.365)
        & X_test['sigma_km_substrate1_ADK1'].le(1.551)
    )
    return X_test.loc[rule_mask].index



def logreg():
    """Return the index labels of the X_test rows that satisfy both threshold rules.

    Reads the module-level ``X_test`` DataFrame; the thresholds are compared
    directly against the values stored in it.
    """
    rule_mask = (
        X_test['Gamma_GLUDC'].le(-1.277)
        & X_test['sigma_km_product1_CHORS'].le(-1.075)
    )
    return X_test.loc[rule_mask].index


def svc():
    """Return the index labels of the X_test rows that satisfy all four threshold rules.

    Reads the module-level ``X_test`` DataFrame; the thresholds are compared
    directly against the values stored in it.
    """
    rule_mask = (
        X_test['Gamma_GLUDC'].le(-1.277)
        & X_test['sigma_km_substrate1_2OXOADPTm'].le(0.494)
        & X_test['sigma_km_substrate26_LMPD_s_0450_c_1_256'].le(0.398)
        & X_test['sigma_km_substrate2_GK1'].le(1.581)
    )
    return X_test.loc[rule_mask].index


def xgbclass():
    """Return the index labels of the X_test rows that satisfy all five threshold rules.

    Reads the module-level ``X_test`` DataFrame; the thresholds are compared
    directly against the values stored in it.
    """
    conditions = [
        X_test['Gamma_GLUDC'] <= -1.277,
        X_test['sigma_km_product1_ICDHxm'] > 0.056,
        X_test['sigma_km_product2_GS'] > -0.802,
        X_test['sigma_km_substrate2_ILETAm'] <= 1.365,
        X_test['sigma_km_substrate1_ADK1'] <= 1.551,
    ]

    # AND the individual rules together, left to right.
    rule_mask = conditions[0]
    for condition in conditions[1:]:
        rule_mask = rule_mask & condition

    return X_test[rule_mask].index


def dectree():
    """Return the index labels of the X_test rows that satisfy both threshold rules.

    Reads the module-level ``X_test`` DataFrame; the thresholds are compared
    directly against the values stored in it.
    """
    rule_mask = (
        X_test['Gamma_HCO3E'].gt(1.843)
        & X_test['sigma_km_substrate2_GAPD'].le(0.08)
    )
    return X_test.loc[rule_mask].index


def frst():
    """Return the index labels of the X_test rows that satisfy all five threshold rules.

    Reads the module-level ``X_test`` DataFrame; the thresholds are compared
    directly against the values stored in it.
    """
    gludc_ok = X_test['Gamma_GLUDC'] <= -1.277
    icdhxm_ok = X_test['sigma_km_product1_ICDHxm'] > 0.056
    gs_ok = X_test['sigma_km_product2_GS'] > -0.802
    iletam_ok = X_test['sigma_km_substrate2_ILETAm'] <= 1.365
    adk1_ok = X_test['sigma_km_substrate1_ADK1'] <= 1.551

    selected = X_test[gludc_ok & icdhxm_ok & gs_ok & iletam_ok & adk1_ok]
    return selected.index

In [4]:
# Select the test-set rows that satisfy the rules coded in logreg(), then print
# the stability index of that subset next to the one of the full test set.
index_GP = logreg()
main(index_GP)

Number of rows following these rules: 14
The Stability Index on VALIDATION SET (sampled from TEST SET) is: SI = 71.4286 %
The Stability Index on TEST SET is: SI = 19.8473 %


In [5]:
# Row labels of the test-set rows selected by the rules.
index_GP

Int64Index([5, 53, 356, 50, 46, 8, 64, 367, 29, 33, 355, 38, 364, 2], dtype='int64')

In [137]:
# Per-feature mean and standard deviation learned by the fitted scaler:
# one column per feature, rows labelled 'mean' and 'std'.
# Built in a single constructor call so the columns are float64; creating an
# empty frame and filling it row by row leaves every column with dtype object.
# NOTE(review): StandardScaler divides by scaler.scale_, which is documented to
# differ from sqrt(var_) for zero-variance features; confirm which one is
# wanted for the inverse transform.
scaler_df = pd.DataFrame(
    [scaler.mean_, np.sqrt(scaler.var_)],
    index=['mean', 'std'],
    columns=scaler.feature_names_in_,
)
scaler_df

Unnamed: 0,Gamma_AATA,Gamma_IPPS,Gamma_IPPSm,Gamma_DHQTi,Gamma_ADCS,Gamma_ABTA,Gamma_PGL,Gamma_ACACT1r,Gamma_ACOAHim,Gamma_ACOTAim,...,sigma_km_substrate_ccm2tp,sigma_km_product_ccm2tp,sigma_km_substrate_pca2tp,sigma_km_product_pca2tp,sigma_km_substrate_r2073_1,sigma_km_product_r2073_1,sigma_km_substrate1_r_4235,sigma_km_product1_r_4235,sigma_km_substrate2_r_4235,sigma_km_product2_r_4235
mean,0.20038,0.040256,0.0,0.009001,0.0,0.988116,0.146629,0.971178,0.000309,0.319922,...,0.507277,0.575783,0.476098,0.457047,0.47991,0.490833,0.537938,0.478768,0.473518,0.530375
std,0.108185,0.032165,0.0,0.002899,0.0,0.008292,0.074569,0.034174,0.003374,0.115822,...,0.271474,0.279218,0.29459,0.280432,0.272831,0.296384,0.288627,0.292923,0.276526,0.294821


In [139]:
def inverse_scaling(col, element):
    """Map a scaled value of feature ``col`` back to its original scale.

    Looks up the 'mean' and 'std' rows of the module-level ``scaler_df`` for
    ``col``, prints both, and returns ``element * std + mean``.

    Parameters
    ----------
    col : column label of ``scaler_df``
        Feature whose statistics are used.
    element : number
        Scaled value to convert back.
    """
    center = scaler_df.loc['mean', col]
    spread = scaler_df.loc['std', col]

    # Show the statistics used for the conversion.
    print(center, spread)

    return element * spread + center

In [140]:
# First rows of the scaled training features (output of scaler.fit_transform).
X_train.head()

Unnamed: 0,Gamma_AATA,Gamma_IPPS,Gamma_IPPSm,Gamma_DHQTi,Gamma_ADCS,Gamma_ABTA,Gamma_PGL,Gamma_ACACT1r,Gamma_ACOAHim,Gamma_ACOTAim,...,sigma_km_substrate_ccm2tp,sigma_km_product_ccm2tp,sigma_km_substrate_pca2tp,sigma_km_product_pca2tp,sigma_km_substrate_r2073_1,sigma_km_product_r2073_1,sigma_km_substrate1_r_4235,sigma_km_product1_r_4235,sigma_km_substrate2_r_4235,sigma_km_product2_r_4235
111,-0.227847,-0.000326,-0.091626,0.084171,-0.115193,0.439687,0.683179,0.55727,-0.091478,-0.124686,...,-0.979786,0.197194,-1.548021,-1.563677,1.867024,-0.280662,-1.480116,0.959978,-1.054226,0.657742
286,0.296027,1.224024,-0.091627,0.961258,0.147707,0.684057,0.195591,0.484136,-0.091478,0.143858,...,-1.107727,-0.972774,-1.180548,-0.774376,0.354609,0.184946,0.836937,0.154701,-0.949473,-1.171273
336,0.663807,-1.245924,-0.091398,-1.358971,-1.537311,-0.817258,-1.188042,0.448885,-0.091478,1.113605,...,-0.713173,-1.410036,0.448344,-0.661847,0.592793,1.130568,1.310157,1.500219,-0.717171,-0.030873
34,-1.051083,-1.023022,-0.091627,-0.939705,2.386114,-0.082855,-0.426095,-1.878516,-0.091478,-0.891981,...,0.049391,1.374172,0.843699,0.675973,-0.545657,-1.141886,0.024023,-1.079511,-0.147215,1.40136
332,0.663807,-1.245924,-0.091398,-1.358971,-1.537311,-0.817258,-1.188042,0.448885,-0.091478,1.113605,...,0.544955,1.416262,-0.358409,-0.791408,-0.228025,-1.044432,-1.502059,-1.366493,-1.498882,-0.435908


In [141]:
# Sanity check of inverse_scaling: 1.224024 is the scaled 'Gamma_IPPS' value of
# row 286 in X_train; the result should match the raw value of the same row
# and column in x_train.
element = inverse_scaling('Gamma_IPPS', 1.224024)
element

0.04025621940248963 0.0321645291439099


0.0796263750233348

In [142]:
# Scaled training features, shown for comparison with the raw values in x_train.
X_train.head()

Unnamed: 0,Gamma_AATA,Gamma_IPPS,Gamma_IPPSm,Gamma_DHQTi,Gamma_ADCS,Gamma_ABTA,Gamma_PGL,Gamma_ACACT1r,Gamma_ACOAHim,Gamma_ACOTAim,...,sigma_km_substrate_ccm2tp,sigma_km_product_ccm2tp,sigma_km_substrate_pca2tp,sigma_km_product_pca2tp,sigma_km_substrate_r2073_1,sigma_km_product_r2073_1,sigma_km_substrate1_r_4235,sigma_km_product1_r_4235,sigma_km_substrate2_r_4235,sigma_km_product2_r_4235
111,-0.227847,-0.000326,-0.091626,0.084171,-0.115193,0.439687,0.683179,0.55727,-0.091478,-0.124686,...,-0.979786,0.197194,-1.548021,-1.563677,1.867024,-0.280662,-1.480116,0.959978,-1.054226,0.657742
286,0.296027,1.224024,-0.091627,0.961258,0.147707,0.684057,0.195591,0.484136,-0.091478,0.143858,...,-1.107727,-0.972774,-1.180548,-0.774376,0.354609,0.184946,0.836937,0.154701,-0.949473,-1.171273
336,0.663807,-1.245924,-0.091398,-1.358971,-1.537311,-0.817258,-1.188042,0.448885,-0.091478,1.113605,...,-0.713173,-1.410036,0.448344,-0.661847,0.592793,1.130568,1.310157,1.500219,-0.717171,-0.030873
34,-1.051083,-1.023022,-0.091627,-0.939705,2.386114,-0.082855,-0.426095,-1.878516,-0.091478,-0.891981,...,0.049391,1.374172,0.843699,0.675973,-0.545657,-1.141886,0.024023,-1.079511,-0.147215,1.40136
332,0.663807,-1.245924,-0.091398,-1.358971,-1.537311,-0.817258,-1.188042,0.448885,-0.091478,1.113605,...,0.544955,1.416262,-0.358409,-0.791408,-0.228025,-1.044432,-1.502059,-1.366493,-1.498882,-0.435908


In [143]:
# Raw (unscaled) training features; rows carry the same labels as X_train.
x_train.head()

Unnamed: 0,Gamma_AATA,Gamma_IPPS,Gamma_IPPSm,Gamma_DHQTi,Gamma_ADCS,Gamma_ABTA,Gamma_PGL,Gamma_ACACT1r,Gamma_ACOAHim,Gamma_ACOTAim,...,sigma_km_substrate_ccm2tp,sigma_km_product_ccm2tp,sigma_km_substrate_pca2tp,sigma_km_product_pca2tp,sigma_km_substrate_r2073_1,sigma_km_product_r2073_1,sigma_km_substrate1_r_4235,sigma_km_product1_r_4235,sigma_km_substrate2_r_4235,sigma_km_product2_r_4235
111,0.17573,0.040246,2.8e-13,0.009246,1.9e-08,0.991762,0.197574,0.990222,7.61e-12,0.30548,...,0.241291,0.630843,0.020067,0.018543,0.989293,0.407649,0.110737,0.759968,0.181997,0.724291
286,0.232406,0.079626,4.73e-14,0.011788,2.25e-08,0.993788,0.161215,0.987722,2.18e-12,0.336584,...,0.206558,0.304168,0.12832,0.239888,0.576659,0.545648,0.7795,0.524084,0.210964,0.18506
336,0.272194,0.000182,9.11e-11,0.005062,6.73e-11,0.981339,0.058038,0.986518,6.93e-10,0.448902,...,0.313669,0.182076,0.608176,0.271445,0.641643,0.825916,0.916084,0.918217,0.275201,0.521273
34,0.086668,0.007351,3.28e-15,0.006277,5.23e-08,0.987429,0.114856,0.906982,3.54e-12,0.21661,...,0.520685,0.959476,0.724643,0.646612,0.331038,0.152397,0.544872,0.162554,0.432809,0.943525
332,0.272194,0.000182,9.11e-11,0.005062,6.73e-11,0.981339,0.058038,0.986518,6.93e-10,0.448902,...,0.655218,0.971228,0.370514,0.235112,0.417698,0.18128,0.104404,0.07849,0.059038,0.401861
