In [6]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

import pandas as pd


In [7]:
df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
df.drop(['s_score','receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
df.fillna(-99999)

predictors = list(df.columns)
predictors = predictors[:-1]

print('Predictors:', predictors,'\n')

np.random.seed(42)

#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]

#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=len(q_ph4s))

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

#x is features, y is classes
x = df.drop('quality', 1)
y = df.quality

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

#make instance of model
# all parameters not specified are set to their defaults
logisticRegr = LogisticRegression(max_iter = 5000)
logisticRegr.fit(x_train, y_train)

y_pred = (logisticRegr.predict(x_test))
confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'))


Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

Predicted    0    1
Actual             
0          684  217
1           83  827 

PPV: 0.79


In [8]:
#CLassify external data (GH score-based pharmacophore models)

ext_df = pd.read_csv('..\..\data\score_based_alldata_binary.csv')
ext_df.drop(['Receptor', 'Score Type','Enrichment',  'hyd', 'don', 'acc', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc', 'don_prop', 'acc_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
ext_df.fillna(-99999)

#use this line to only classify a sample
#ext_df = ext_df.sample(n=50)

predictors = list(df.columns)
predictors = predictors[:-1]

print('Predictors:', predictors,'\n')

x = ext_df.drop('quality', 1)
y = ext_df.quality

ext_pred = (logisticRegr.predict(x))

confmat = confusion_matrix(y, ext_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y, ext_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

Predicted  0   1
Actual          
0          7  65
1          5  79 

PPV: 0.55


In [4]:
#Model training/testing 7 times, each time leaving 1 receptor's data out and then classifying afterwards
for i in range(8):
    df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
    df.drop(['s_score','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
    df.fillna(-99999)

    receptor_names = ['5HT2B','A2A','Beta 2','H1','M1','OPRD','OPRK','OPRM']
    holdout_receptor = receptor_names[i]
    del receptor_names[i]

    holdout_set = df.loc[df['receptor'] == holdout_receptor]
    df = df[df['receptor'] != holdout_receptor]

    #drop receptor column
    df.drop(['receptor'],1, inplace=True)
    holdout_set.drop(['receptor'],1, inplace=True)

    #split data into quality/not quality sets
    q_ph4s = df[df['quality'] == 1]
    nq_ph4s = df[df['quality'] != 1]

    #ensure that there is an equal number of nq ph4s
    nq_ph4s = nq_ph4s.sample(n=len(q_ph4s))

    #merge arrays prior to TTS
    frames = [q_ph4s, nq_ph4s]
    df = pd.concat(frames)

    #x is features, y is classes
    x = df.drop('quality', 1)
    y = df.quality

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

    #make instance of model
    # all parameters not specified are set to their defaults
    logisticRegr = LogisticRegression(max_iter = 5000)
    logisticRegr.fit(x_train, y_train)

    y_pred = (logisticRegr.predict(x_test))
    confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
    #print(confmat)

    confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])
    
    PPV = (TP / (TP + FP))

    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
    if i == 0:
        print('Testing Data')
        print('------------')
        print(cm,'\n')
        print('PPV:', format(PPV, '.2f'), '\n')

    holdout_set = holdout_set.sample(n=100)
    holdout_x = holdout_set.drop('quality', 1)
    holdout_y = holdout_set.quality

    holdout_pred = (logisticRegr.predict(holdout_x))

    confmat = confusion_matrix(holdout_y, holdout_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])

    PPV = (TP / (TP + FP))
    
    cm = pd.crosstab(holdout_y, holdout_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

    print('Holdout Data')
    print('------------')
    print(cm,'\n')

    print('PPV:', format(PPV, '.2f'),'\n')

Testing Data
------------
Predicted    0    1
Actual             
0          597  192
1           61  724 

PPV: 0.79 

Holdout Data
------------
Predicted   0   1
Actual           
0          63  35
1           0   2 

PPV: 0.05 

Holdout Data
------------
Predicted   0   1
Actual           
0          86  13
1           0   1 

PPV: 0.07 

Holdout Data
------------
Predicted   0   1
Actual           
0          90  10 

PPV: 0.00 

Holdout Data
------------
Predicted   0   1
Actual           
0          70  29
1           0   1 

PPV: 0.03 

Holdout Data
------------
Predicted   0   1
Actual           
0          79  15
1           3   3 

PPV: 0.17 

Holdout Data
------------
Predicted   0   1
Actual           
0          83  15
1           0   2 

PPV: 0.12 

Holdout Data
------------
Predicted   0   1
Actual           
0          76  23
1           0   1 

PPV: 0.04 

Holdout Data
------------
Predicted   0   1
Actual           
0          59  38
1           0   3 

PPV: 0.07 



In [25]:
#Next step: find best p_cutoff threshold for first script

In [43]:
PPV_values = []
p_cutoffs = []

for p_cutoff in np.arange(0.0,1.0,0.01):
    df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
    df.drop(['s_score', 'receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop', 'min_feat'], 1, inplace=True)
    df.fillna(-99999)

    #split data into quality/not quality sets
    q_ph4s = df[df['quality'] == 1]
    nq_ph4s = df[df['quality'] != 1]

    #ensure that there is an equal number of nq ph4s
    nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

    #merge arrays prior to TTS
    frames = [q_ph4s, nq_ph4s]
    df = pd.concat(frames)

    #x is features, y is classes
    x = df.drop('quality', 1)
    y = df.quality

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

    #make instance of model
    # all parameters not specified are set to their defaults
    logisticRegr = LogisticRegression(max_iter = 5000)
    logisticRegr.fit(x_train, y_train)

    y_pred = (logisticRegr.predict_proba(x_test)[:,1] >= p_cutoff).astype(bool) # set threshold as p_cutoff
    confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
    #print(confmat)

    confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])

    PPV = (TP / (TP + FP))

    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
    if p_cutoff > 0.0:
        print('\n')
        
    print(p_cutoff)
    print(cm,'\n')
    print('PPV:', format(PPV, '.2f'))
    
    p_cutoffs.append(p_cutoff)
    PPV_values.append(PPV)
    
res = {p_cutoffs[i]: PPV_values[i] for i in range(len(p_cutoffs))}

#print(res)

#Find item with Max Value in Dictionary
itemMaxValue = max(res.items(), key=lambda x: x[1])
print('Maximum Value in Dictionary : ', itemMaxValue[1])
listOfKeys = list()
# Iterate over all the items in dictionary to find keys with max value
for key, value in res.items():
    if value == itemMaxValue[1]:
        listOfKeys.append(key)
print('Keys with maximum Value in Dictionary : ', listOfKeys)

0.0
Predicted  True
Actual         
0           901
1           910 

PPV: 0.50


0.01
Predicted  False  True 
Actual                 
0            308    593
1              3    907 

PPV: 0.60


0.02
Predicted  False  True 
Actual                 
0            333    568
1              4    906 

PPV: 0.61


0.03
Predicted  False  True 
Actual                 
0            365    536
1              6    904 

PPV: 0.63


0.04
Predicted  False  True 
Actual                 
0            377    524
1              7    903 

PPV: 0.63


0.05
Predicted  False  True 
Actual                 
0            411    490
1              7    903 

PPV: 0.65


0.06
Predicted  False  True 
Actual                 
0            415    486
1              7    903 

PPV: 0.65


0.07
Predicted  False  True 
Actual                 
0            441    460
1              7    903 

PPV: 0.66


0.08
Predicted  False  True 
Actual                 
0            431    470
1              7    903 

PPV: 0.66




0.71
Predicted  False  True 
Actual                 
0            794    107
1            309    601 

PPV: 0.85


0.72
Predicted  False  True 
Actual                 
0            789    112
1            330    580 

PPV: 0.84


0.73
Predicted  False  True 
Actual                 
0            789    112
1            343    567 

PPV: 0.84


0.74
Predicted  False  True 
Actual                 
0            800    101
1            368    542 

PPV: 0.84


0.75
Predicted  False  True 
Actual                 
0            791    110
1            393    517 

PPV: 0.82


0.76
Predicted  False  True 
Actual                 
0            805     96
1            409    501 

PPV: 0.84


0.77
Predicted  False  True 
Actual                 
0            826     75
1            435    475 

PPV: 0.86


0.78
Predicted  False  True 
Actual                 
0            815     86
1            436    474 

PPV: 0.85


0.79
Predicted  False  True 
Actual                 
0            815     86
1

  PPV = (TP / (TP + FP))


In [9]:
#CLassify external data w/ custom p_cutoff (D2 6LUQ pharmacophore models)
#Steps for creating "refined" external dataset:
ext_df = pd.read_csv('..\..\data\score_based_alldata_binary.csv')
ext_df.drop(['Receptor', 'Score Type','Enrichment',  'hyd', 'don', 'acc', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc', 'don_prop', 'acc_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
ext_df.fillna(-99999)


#ext_df = ext_df.sample(n=50)

x = ext_df.drop('quality', 1)
y = ext_df.quality

ext_pred = (logisticRegr.predict_proba(x)[:,1] >= 0.98).astype(bool) # set threshold as p_cutoff

confmat = confusion_matrix(y, ext_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y, ext_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predicted  False  True 
Actual                 
0             12     60
1             10     74 

PPV: 0.55
