In [1]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

import pandas as pd


In [4]:
df = pd.read_csv('data\_All_Receptors_runs_1_2_3_binary.csv')
df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
df.fillna(-99999)

predictors = list(df.columns)
predictors = predictors[:-1]

print('Predictors:', predictors,'\n')

np.random.seed(42)

#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]

#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=len(q_ph4s))

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

#x is features, y is classes
x = df.drop('quality', 1)
y = df.quality

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

#make instance of model
# all parameters not specified are set to their defaults
logisticRegr = LogisticRegression(max_iter = 5000)
logisticRegr.fit(x_train, y_train)

y_pred = (logisticRegr.predict(x_test))
confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'))


Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 's_score', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

Predicted    0    1
Actual             
0          563  172
1           69  645 

PPV: 0.79


In [5]:
#CLassify external data (D2 6LUQ pharmacophore models)
#Steps for creating "refined" external dataset:
#1. delete max_feat >15
#2. delete min_feat >5
#3.
ext_df = pd.read_csv('data\D2_6LUQ_pharmacophores_binary.csv')
ext_df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
ext_df.fillna(-99999)

#use this line to only classify a sample
#ext_df = ext_df.sample(n=50)

x = ext_df.drop('quality', 1)
y = ext_df.quality

ext_pred = (logisticRegr.predict(x))

confmat = confusion_matrix(y, ext_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y, ext_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predicted     0     1
Actual               
0          1932  2921
1             1    77 

PPV: 0.03


In [32]:
#Model training/testing 7 times, each time leaving 1 receptor's data out and then classifying afterwards
for i in range(8):
    df = pd.read_csv('data\_All_Receptors_runs_1_2_3_binary.csv')
    df.drop(['Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop', 'min_feat'], 1, inplace=True)
    df.fillna(-99999)

    receptor_names = ['5HT2B','A2A','Beta 2','H1','M1','OPRD','OPRK','OPRM']
    holdout_receptor = receptor_names[i]
    del receptor_names[i]

    holdout_set = df.loc[df['receptor'] == holdout_receptor]
    df = df[df['receptor'] != holdout_receptor]

    #drop receptor column
    df.drop(['receptor'],1, inplace=True)
    holdout_set.drop(['receptor'],1, inplace=True)

    #split data into quality/not quality sets
    q_ph4s = df[df['quality'] == 1]
    nq_ph4s = df[df['quality'] != 1]

    #ensure that there is an equal number of nq ph4s
    nq_ph4s = nq_ph4s.sample(n=len(q_ph4s))

    #merge arrays prior to TTS
    frames = [q_ph4s, nq_ph4s]
    df = pd.concat(frames)

    #x is features, y is classes
    x = df.drop('quality', 1)
    y = df.quality

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

    #make instance of model
    # all parameters not specified are set to their defaults
    logisticRegr = LogisticRegression(max_iter = 5000)
    logisticRegr.fit(x_train, y_train)

    y_pred = (logisticRegr.predict(x_test))
    confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
    #print(confmat)

    confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])
    
    PPV = (TP / (TP + FP))

    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
    if i == 0:
        print('Testing Data')
        print('------------')
        print(cm,'\n')
        print('PPV:', format(PPV, '.2f'), '\n')

    holdout_set = holdout_set.sample(n=100)
    holdout_x = holdout_set.drop('quality', 1)
    holdout_y = holdout_set.quality

    holdout_pred = (logisticRegr.predict(holdout_x))

    confmat = confusion_matrix(holdout_y, holdout_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])

    PPV = (TP / (TP + FP))
    
    cm = pd.crosstab(holdout_y, holdout_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

    print('Holdout Data')
    print('------------')
    print(cm,'\n')

    print('PPV:', format(PPV, '.2f'),'\n')

Testing Data
------------
Predicted    0    1
Actual             
0          622  167
1           75  710 

PPV: 0.81 

Holdout Data
------------
Predicted   0   1
Actual           
0          62  36
1           0   2 

PPV: 0.05 

Holdout Data
------------
Predicted   0   1
Actual           
0          78  22 

PPV: 0.00 

Holdout Data
------------
Predicted   0   1
Actual           
0          77  22
1           0   1 

PPV: 0.04 

Holdout Data
------------
Predicted   0   1
Actual           
0          78  20
1           1   1 

PPV: 0.05 

Holdout Data
------------
Predicted   0   1
Actual           
0          75  16
1           3   6 

PPV: 0.27 

Holdout Data
------------
Predicted   0   1
Actual           
0          78  19
1           0   3 

PPV: 0.14 

Holdout Data
------------
Predicted   0   1
Actual           
0          80  18
1           1   1 

PPV: 0.05 

Holdout Data
------------
Predicted   0   1
Actual           
0          46  45
1           0   9 

PPV: 0.17 



In [None]:
#Next step: find best p_cutoff threshold for first script

In [24]:
PPV_values = []
p_cutoffs = []

for p_cutoff in np.arange(0.0,1.0,0.01):
    df = pd.read_csv('data\_All_Receptors_runs_1_2_3_binary.csv')
    df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop', 'min_feat'], 1, inplace=True)
    df.fillna(-99999)

    #split data into quality/not quality sets
    q_ph4s = df[df['quality'] == 1]
    nq_ph4s = df[df['quality'] != 1]

    #ensure that there is an equal number of nq ph4s
    nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

    #merge arrays prior to TTS
    frames = [q_ph4s, nq_ph4s]
    df = pd.concat(frames)

    #x is features, y is classes
    x = df.drop('quality', 1)
    y = df.quality

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

    #make instance of model
    # all parameters not specified are set to their defaults
    logisticRegr = LogisticRegression(max_iter = 5000)
    logisticRegr.fit(x_train, y_train)

    y_pred = (logisticRegr.predict_proba(x_test)[:,1] >= p_cutoff).astype(bool) # set threshold as p_cutoff
    confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
    #print(confmat)

    confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])

    PPV = (TP / (TP + FP))

    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
    if p_cutoff > 0.0:
        print('\n')
        
    print(p_cutoff)
    print(cm,'\n')
    print('PPV:', format(PPV, '.2f'))
    
    p_cutoffs.append(p_cutoff)
    PPV_values.append(PPV)
    
res = {p_cutoffs[i]: PPV_values[i] for i in range(len(p_cutoffs))}

#print(res)

#Find item with Max Value in Dictionary
itemMaxValue = max(res.items(), key=lambda x: x[1])
print('Maximum Value in Dictionary : ', itemMaxValue[1])
listOfKeys = list()
# Iterate over all the items in dictionary to find keys with max value
for key, value in res.items():
    if value == itemMaxValue[1]:
        listOfKeys.append(key)
print('Keys with maximum Value in Dictionary : ', listOfKeys)

0.0
Predicted  True
Actual         
0          1844
1           872 

PPV: 0.32


0.01
Predicted  False  True 
Actual                 
0            664   1180
1              3    869 

PPV: 0.42


0.02
Predicted  False  True 
Actual                 
0            739   1105
1              4    868 

PPV: 0.44


0.03
Predicted  False  True 
Actual                 
0            836   1008
1              4    868 

PPV: 0.46


0.04
Predicted  False  True 
Actual                 
0            867    977
1              4    868 

PPV: 0.47


0.05
Predicted  False  True 
Actual                 
0            850    994
1              4    868 

PPV: 0.47


0.06
Predicted  False  True 
Actual                 
0            996    848
1              4    868 

PPV: 0.51


0.07
Predicted  False  True 
Actual                 
0            995    849
1              4    868 

PPV: 0.51


0.08
Predicted  False  True 
Actual                 
0           1004    840
1              4    868 

PPV: 0.51




0.71
Predicted  False  True 
Actual                 
0           1732    112
1            527    345 

PPV: 0.75


0.72
Predicted  False  True 
Actual                 
0           1754     90
1            570    302 

PPV: 0.77


0.73
Predicted  False  True 
Actual                 
0           1764     80
1            584    288 

PPV: 0.78


0.74
Predicted  False  True 
Actual                 
0           1778     66
1            599    273 

PPV: 0.81


0.75
Predicted  False  True 
Actual                 
0           1780     64
1            618    254 

PPV: 0.80


0.76
Predicted  False  True 
Actual                 
0           1786     58
1            629    243 

PPV: 0.81


0.77
Predicted  False  True 
Actual                 
0           1777     67
1            646    226 

PPV: 0.77


0.78
Predicted  False  True 
Actual                 
0           1786     58
1            651    221 

PPV: 0.79


0.79
Predicted  False  True 
Actual                 
0           1800     44
1

  PPV = (TP / (TP + FP))




0.96
Predicted  False
Actual          
0           1844
1            872 

PPV: nan


  PPV = (TP / (TP + FP))




0.97
Predicted  False
Actual          
0           1844
1            872 

PPV: nan


  PPV = (TP / (TP + FP))




0.98
Predicted  False
Actual          
0           1844
1            872 

PPV: nan


0.99
Predicted  False
Actual          
0           1844
1            872 

PPV: nan
Maximum Value in Dictionary :  1.0
Keys with maximum Value in Dictionary :  [0.9500000000000001]


  PPV = (TP / (TP + FP))


In [30]:
#CLassify external data w/ custom p_cutoff (D2 6LUQ pharmacophore models)
#Steps for creating "refined" external dataset:
ext_df = pd.read_csv('data\D2_6LUQ_pharmacophores_binary.csv')
ext_df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop', 'min_feat'], 1, inplace=True)
ext_df.fillna(-99999)

#ext_df = ext_df.sample(n=50)

x = ext_df.drop('quality', 1)
y = ext_df.quality

ext_pred = (logisticRegr.predict_proba(x)[:,1] >= 0.95).astype(bool) # set threshold as p_cutoff

confmat = confusion_matrix(y, ext_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y, ext_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predicted  False  True 
Actual                 
0           4764     89
1             78      0 

PPV: 0.00
