In [2]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

import pandas as pd
import imblearn
from imblearn.over_sampling import RandomOverSampler


In [5]:
df = pd.read_csv('..\data\_All_Receptors_runs_1_2_3_binary.csv')
df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
df.fillna(-99999)

predictors = list(df.columns)
predictors = predictors[:-1]

print('Predictors:', predictors,'\n')

np.random.seed(42)

#x is features, y is classes
x = df.drop('quality', 1)
y = df.quality

# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')

#create oversampled dataset
x_over, y_over = oversample.fit_resample(x, y)

x_train, x_test, y_train, y_test = train_test_split(x_over, y_over, test_size=0.25, random_state=0)

#make instance of model
# all parameters not specified are set to their defaults
logisticRegr = LogisticRegression(max_iter = 5000)
logisticRegr.fit(x_train, y_train)

y_pred = (logisticRegr.predict(x_test))
confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'))


Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 's_score', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

Predicted      0      1
Actual                 
0          21393   6720
1           2338  25848 

PPV: 0.79


In [5]:
#CLassify external data (D2 6LUQ pharmacophore models)
#Steps for creating "refined" external dataset:
#1. delete max_feat >15
#2. delete min_feat >5
#3.
ext_df = pd.read_csv('data\D2_6LUQ_pharmacophores_binary.csv')
ext_df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
ext_df.fillna(-99999)

#use this line to only classify a sample
#ext_df = ext_df.sample(n=50)

x = ext_df.drop('quality', 1)
y = ext_df.quality

ext_pred = (logisticRegr.predict(x))

confmat = confusion_matrix(y, ext_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y, ext_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predicted     0     1
Actual               
0          1932  2921
1             1    77 

PPV: 0.03


In [7]:
#Model training/testing 7 times, each time leaving 1 receptor's data out and then classifying afterwards
for i in range(8):
    df = pd.read_csv('..\data\_All_Receptors_runs_1_2_3_binary.csv')
    df.drop(['Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop', 'min_feat'], 1, inplace=True)
    df.fillna(-99999)

    receptor_names = ['5HT2B','A2A','Beta 2','H1','M1','OPRD','OPRK','OPRM']
    holdout_receptor = receptor_names[i]
    del receptor_names[i]

    holdout_set = df.loc[df['receptor'] == holdout_receptor]
    df = df[df['receptor'] != holdout_receptor]

    #drop receptor column
    df.drop(['receptor'],1, inplace=True)
    holdout_set.drop(['receptor'],1, inplace=True)

    #split data into quality/not quality sets
    q_ph4s = df[df['quality'] == 1]
    nq_ph4s = df[df['quality'] != 1]

    #ensure that there is an equal number of nq ph4s
    nq_ph4s = nq_ph4s.sample(n=len(q_ph4s))

    #merge arrays prior to TTS
    frames = [q_ph4s, nq_ph4s]
    df = pd.concat(frames)

    #x is features, y is classes
    x = df.drop('quality', 1)
    y = df.quality

   # define oversampling strategy
    oversample = RandomOverSampler(sampling_strategy='minority')

    #create oversampled dataset
    x_over, y_over = oversample.fit_resample(x, y)
    x_train, x_test, y_train, y_test = train_test_split(x_over, y_over, test_size=0.25, random_state=0)

    #make instance of model
    # all parameters not specified are set to their defaults
    logisticRegr = LogisticRegression(max_iter = 5000)
    logisticRegr.fit(x_train, y_train)

    y_pred = (logisticRegr.predict(x_test))
    confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
    #print(confmat)

    confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])
    
    PPV = (TP / (TP + FP))

    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
    if i == 0:
        print('Testing Data')
        print('------------')
        print(cm,'\n')
        print('PPV:', format(PPV, '.2f'), '\n')

    holdout_set = holdout_set.sample(n=100)
    holdout_x = holdout_set.drop('quality', 1)
    holdout_y = holdout_set.quality

    holdout_pred = (logisticRegr.predict(holdout_x))

    confmat = confusion_matrix(holdout_y, holdout_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])

    PPV = (TP / (TP + FP))
    
    cm = pd.crosstab(holdout_y, holdout_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

    print('Holdout Data')
    print('------------')
    print(cm,'\n')

    print('PPV:', format(PPV, '.2f'),'\n')

Testing Data
------------
Predicted    0    1
Actual             
0          619  170
1           67  718 

PPV: 0.81 

Holdout Data
------------
Predicted   0   1
Actual           
0          73  24
1           0   3 

PPV: 0.11 

Holdout Data
------------
Predicted   0   1
Actual           
0          83  17 

PPV: 0.00 

Holdout Data
------------
Predicted   0  1
Actual          
0          90  9
1           0  1 

PPV: 0.10 

Holdout Data
------------
Predicted   0   1
Actual           
0          74  26 

PPV: 0.00 

Holdout Data
------------
Predicted   0   1
Actual           
0          82  12
1           5   1 

PPV: 0.08 

Holdout Data
------------
Predicted   0   1
Actual           
0          78  20
1           0   2 

PPV: 0.09 

Holdout Data
------------
Predicted   0   1
Actual           
0          76  21
1           1   2 

PPV: 0.09 

Holdout Data
------------
Predicted   0   1
Actual           
0          60  33
1           0   7 

PPV: 0.17 



In [None]:
#Next step: find best p_cutoff threshold for first script

In [9]:
PPV_values = []
p_cutoffs = []

for p_cutoff in np.arange(0.0,1.0,0.01):
    df = pd.read_csv('..\data\_All_Receptors_runs_1_2_3_binary.csv')
    df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop', 'min_feat'], 1, inplace=True)
    df.fillna(-99999)

    #split data into quality/not quality sets
    q_ph4s = df[df['quality'] == 1]
    nq_ph4s = df[df['quality'] != 1]

    #ensure that there is an equal number of nq ph4s
    nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

    #merge arrays prior to TTS
    frames = [q_ph4s, nq_ph4s]
    df = pd.concat(frames)

    #x is features, y is classes
    x = df.drop('quality', 1)
    y = df.quality

    # define oversampling strategy
    oversample = RandomOverSampler(sampling_strategy='minority')

    #create oversampled dataset
    x_over, y_over = oversample.fit_resample(x, y)

    x_train, x_test, y_train, y_test = train_test_split(x_over, y_over, test_size=0.25, random_state=0)

    #make instance of model
    # all parameters not specified are set to their defaults
    logisticRegr = LogisticRegression(max_iter = 5000)
    logisticRegr.fit(x_train, y_train)

    y_pred = (logisticRegr.predict_proba(x_test)[:,1] >= p_cutoff).astype(bool) # set threshold as p_cutoff
    confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
    #print(confmat)

    confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])

    PPV = (TP / (TP + FP))

    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
    if p_cutoff > 0.0:
        print('\n')
        
    print(p_cutoff)
    print(cm,'\n')
    print('PPV:', format(PPV, '.2f'))
    
    p_cutoffs.append(p_cutoff)
    PPV_values.append(PPV)
    
res = {p_cutoffs[i]: PPV_values[i] for i in range(len(p_cutoffs))}

#print(res)

#Find item with Max Value in Dictionary
itemMaxValue = max(res.items(), key=lambda x: x[1])
print('Maximum Value in Dictionary : ', itemMaxValue[1])
listOfKeys = list()
# Iterate over all the items in dictionary to find keys with max value
for key, value in res.items():
    if value == itemMaxValue[1]:
        listOfKeys.append(key)
print('Keys with maximum Value in Dictionary : ', listOfKeys)

0.0
Predicted  True
Actual         
0           901
1           910 

PPV: 0.50


0.01
Predicted  False  True 
Actual                 
0            296    605
1              3    907 

PPV: 0.60


0.02
Predicted  False  True 
Actual                 
0            321    580
1              4    906 

PPV: 0.61


0.03
Predicted  False  True 
Actual                 
0            373    528
1              5    905 

PPV: 0.63


0.04
Predicted  False  True 
Actual                 
0            380    521
1              6    904 

PPV: 0.63


0.05
Predicted  False  True 
Actual                 
0            401    500
1              7    903 

PPV: 0.64


0.06
Predicted  False  True 
Actual                 
0            426    475
1              7    903 

PPV: 0.66


0.07
Predicted  False  True 
Actual                 
0            426    475
1              7    903 

PPV: 0.66


0.08
Predicted  False  True 
Actual                 
0            445    456
1              7    903 

PPV: 0.66




0.71
Predicted  False  True 
Actual                 
0            796    105
1            311    599 

PPV: 0.85


0.72
Predicted  False  True 
Actual                 
0            797    104
1            328    582 

PPV: 0.85


0.73
Predicted  False  True 
Actual                 
0            783    118
1            352    558 

PPV: 0.83


0.74
Predicted  False  True 
Actual                 
0            798    103
1            364    546 

PPV: 0.84


0.75
Predicted  False  True 
Actual                 
0            832     69
1            413    497 

PPV: 0.88


0.76
Predicted  False  True 
Actual                 
0            795    106
1            397    513 

PPV: 0.83


0.77
Predicted  False  True 
Actual                 
0            811     90
1            426    484 

PPV: 0.84


0.78
Predicted  False  True 
Actual                 
0            817     84
1            468    442 

PPV: 0.84


0.79
Predicted  False  True 
Actual                 
0            842     59
1

  PPV = (TP / (TP + FP))


In [13]:
#CLassify external data w/ custom p_cutoff (D2 6LUQ pharmacophore models)
#Steps for creating "refined" external dataset:
ext_df = pd.read_csv('..\data\D2_6LUQ_pharmacophores_binary.csv')
ext_df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop', 'min_feat'], 1, inplace=True)
ext_df.fillna(-99999)

#ext_df = ext_df.sample(n=50)

x = ext_df.drop('quality', 1)
y = ext_df.quality

ext_pred = (logisticRegr.predict_proba(x)[:,1] >= 0.97).astype(bool) # set threshold as p_cutoff

confmat = confusion_matrix(y, ext_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y, ext_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predicted  False  True 
Actual                 
0           4703    150
1             76      2 

PPV: 0.01


Feature names must be in the same order as they were in fit.

