In [1]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from typing import Tuple
from sklearn import model_selection
from sklearn.metrics import classification_report

import pandas as pd


In [2]:
#cluster test/train data
def get_clusters(X_train: pd.DataFrame, X_test: pd.DataFrame, n_clusters: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies k-means clustering to training data to find clusters and predicts them for the test set
    """
    clustering = KMeans(n_clusters=n_clusters, random_state=8675309)
    clustering.fit(X_train)
    # apply the labels
    train_labels = clustering.labels_
    X_train_clstrs = X_train.copy()
    X_train_clstrs['clusters'] = train_labels
    
    # predict labels on the test set
    test_labels = clustering.predict(X_test)
    X_test_clstrs = X_test.copy()
    X_test_clstrs['clusters'] = test_labels
    return X_train_clstrs, X_test_clstrs

#scale each feature
def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X_train.columns.values]
    scaler.fit(X_train[to_scale])
    X_train[to_scale] = scaler.transform(X_train[to_scale])
    
    # predict z-scores on the test set
    X_test[to_scale] = scaler.transform(X_test[to_scale])
    
    return X_train, X_test



In [3]:
df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
df.drop(['s_score','receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
df.fillna(-99999)

predictors = list(df.columns)
predictors = predictors[:-1]

print('Predictors:', predictors,'\n')

np.random.seed(42)

#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]

#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

#x is features, y is classes
x = df.drop('quality', 1)
y = df.quality

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)


print("x_train Q ph4s:", y_train.sum(),'\n')
print("x_test Q ph4s:",y_test.sum(),'\n')

X_train_clstrs, X_test_clstrs = get_clusters(x_train, x_test, 2)

#print(x_train_clstrs)

X_train_scaled, X_test_scaled = scale_features(X_train_clstrs, X_test_clstrs)

#print(X_train_scaled)

# to divide the df by cluster, we need to ensure we use the correct class labels, we'll use pandas to do that
train_clusters = X_train_scaled.copy()
test_clusters = X_test_scaled.copy()
train_clusters['y'] = y_train
test_clusters['y'] = y_test

#print(y_train)
#print(train_clusters)

# locate the "0" cluster
train_0 = train_clusters.loc[train_clusters.clusters < 0] # after scaling, 0 went negtive
test_0 = test_clusters.loc[test_clusters.clusters < 0]
y_train_0 = train_0.y.values
y_test_0 = test_0.y.values
# locate the "1" cluster
train_1 = train_clusters.loc[train_clusters.clusters > 0] # after scaling, 1 dropped slightly
test_1 = test_clusters.loc[test_clusters.clusters > 0]
y_train_1 = train_1.y.values
y_test_1 = test_1.y.values
# the base dataset has no "clusters" feature
X_train_base = X_train_scaled.drop(columns=['clusters'])
X_test_base = X_test_scaled.drop(columns=['clusters'])
# drop the targets from the training set
X_train_0 = train_0.drop(columns=['y'])
X_test_0 = test_0.drop(columns=['y'])
X_train_1 = train_1.drop(columns=['y'])
X_test_1 = test_1.drop(columns=['y'])

#print(X_train_0)
#print(len(X_test_1))

#0 cluster LR model
logisticRegr0 = LogisticRegression(max_iter = 5000)
logisticRegr0.fit(X_train_0, y_train_0)

y_pred = (logisticRegr0.predict(X_test_0))
confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('0 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'),'\n')

#1 cluster LR model
logisticRegr1 = LogisticRegression(max_iter = 5000)
logisticRegr1.fit(X_train_1, y_train_1)

y_pred = (logisticRegr1.predict(X_test_1))
confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('1 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

x_train Q ph4s: 2711 

x_test Q ph4s: 910 

0 cluster model

Predicted    0    1
Actual             
0          688  195
1           88  760 

PPV: 0.80 

1 cluster model

Predicted  0   1
Actual          
0          1  17
1          0  62 

PPV: 0.78


In [4]:
def scale_features_single(X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X.columns.values]
    scaler.fit(X[to_scale])
    X[to_scale] = scaler.transform(X[to_scale])
    
    return X

#CLassify external data (score-based pharmacophore models)
#Steps for creating "refined" external dataset:
#1. delete max_feat >15
#2. delete min_feat >5
#3.
ext_df = pd.read_csv('..\..\data\score_based_alldata_binary.csv')
ext_df.drop(['Receptor', 'Score Type','Enrichment',  'hyd', 'don', 'acc', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc', 'don_prop', 'acc_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
ext_df.fillna(-99999)

x = ext_df.drop('quality', 1)
y = ext_df.quality

print("score-based Q ph4s:", y.sum(),'\n')

clustering = KMeans(n_clusters=2, random_state=8675309)
clustering.fit(x)

train_labels = clustering.labels_

X_clstrs = x.copy()
X_clstrs['clusters'] = train_labels

X_scaled = scale_features_single(X_clstrs)
ext_clusters = X_scaled.copy()
ext_clusters['y'] = y

# locate the "0" cluster
ext_0 = ext_clusters.loc[ext_clusters.clusters < 0] # after scaling, 0 went negtive
y_ext_0 = ext_0.y.values

# locate the "1" cluster
ext_1 = ext_clusters.loc[ext_clusters.clusters > 0] # after scaling, 0 went negtive
y_ext_1 = ext_1.y.values

# drop the targets from the external set
X_ext_0 = ext_0.drop(columns=['y'])
X_ext_1 = ext_1.drop(columns=['y'])

#predict based on 0 cluster model
y_pred = (logisticRegr0.predict(X_ext_0))
confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_ext_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('0 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'),'\n')

#1 cluster LR model
y_pred = (logisticRegr1.predict(X_ext_1))
confmat = confusion_matrix(y_ext_1, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_ext_1, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_ext_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('1 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

score-based Q ph4s: 57 

0 cluster model

Predicted   0
Actual       
0          11 

PPV: nan 

1 cluster model

Predicted   0   1
Actual           
0          35  53
1          33  24 

PPV: 0.31


  PPV = (TP / (TP + FP))


In [5]:
#find best p_cutoff for 0 cluster model
PPV_values = []
p_cutoffs = []

for p_cutoff in np.arange(0.0,1.0,0.01):
    df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
    df.drop(['s_score', 'receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop', 'min_feat'], 1, inplace=True)
    df.fillna(-99999)

    #split data into quality/not quality sets
    q_ph4s = df[df['quality'] == 1]
    nq_ph4s = df[df['quality'] != 1]

    #ensure that there is an equal number of nq ph4s
    nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

    #merge arrays prior to TTS
    frames = [q_ph4s, nq_ph4s]
    df = pd.concat(frames)

    #x is features, y is classes
    x = df.drop('quality', 1)
    y = df.quality

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
    
    X_train_clstrs, X_test_clstrs = get_clusters(x_train, x_test, 2)

    #print(x_train_clstrs)

    X_train_scaled, X_test_scaled = scale_features(X_train_clstrs, X_test_clstrs)

    #print(X_train_scaled)

    # to divide the df by cluster, we need to ensure we use the correct class labels, we'll use pandas to do that
    train_clusters = X_train_scaled.copy()
    test_clusters = X_test_scaled.copy()
    train_clusters['y'] = y_train
    test_clusters['y'] = y_test

    #print(y_train)
    #print(train_clusters)

    # locate the "0" cluster
    train_0 = train_clusters.loc[train_clusters.clusters < 0] # after scaling, 0 went negtive
    test_0 = test_clusters.loc[test_clusters.clusters < 0]
    y_train_0 = train_0.y.values
    y_test_0 = test_0.y.values
    # locate the "1" cluster
    train_1 = train_clusters.loc[train_clusters.clusters > 0] # after scaling, 1 dropped slightly
    test_1 = test_clusters.loc[test_clusters.clusters > 0]
    y_train_1 = train_1.y.values
    y_test_1 = test_1.y.values
    # the base dataset has no "clusters" feature
    X_train_base = X_train_scaled.drop(columns=['clusters'])
    X_test_base = X_test_scaled.drop(columns=['clusters'])
    # drop the targets from the training set
    X_train_0 = train_0.drop(columns=['y'])
    X_test_0 = test_0.drop(columns=['y'])
    X_train_1 = train_1.drop(columns=['y'])
    X_test_1 = test_1.drop(columns=['y'])

    #print(X_train_0)
    #print(len(X_test_1))
    
    ##0 cluster LR model
    logisticRegr0 = LogisticRegression(max_iter = 5000)
    logisticRegr0.fit(X_train_0, y_train_0)

    y_pred = (logisticRegr0.predict_proba(X_test_0)[:,1] >= p_cutoff).astype(bool) # set threshold as p_cutoff
    confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1])
    #print(confmat)

    confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])

    PPV = (TP / (TP + FP))

    cm = pd.crosstab(y_test_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
    if p_cutoff > 0.0:
        print('\n')
        
    print(p_cutoff)
    print(cm,'\n')
    print('PPV:', format(PPV, '.2f'))
    
    p_cutoffs.append(p_cutoff)
    PPV_values.append(PPV)
    
res = {p_cutoffs[i]: PPV_values[i] for i in range(len(p_cutoffs))}

#print(res)

#Find item with Max Value in Dictionary
itemMaxValue = max(res.items(), key=lambda x: x[1])
print('Maximum Value in Dictionary : ', itemMaxValue[1])
listOfKeys = list()
# Iterate over all the items in dictionary to find keys with max value
for key, value in res.items():
    if value == itemMaxValue[1]:
        listOfKeys.append(key)
print('Keys with maximum Value in Dictionary : ', listOfKeys)

0.0
Predicted  True
Actual         
0           883
1           840 

PPV: 0.49


0.01
Predicted  False  True 
Actual                 
0            296    596
1              3    840 

PPV: 0.58


0.02
Predicted  True
Actual         
0            18
1            58 

PPV: 0.76


0.03
Predicted  True
Actual         
0            14
1            64 

PPV: 0.82


0.04
Predicted  False  True 
Actual                 
0            411    474
1              6    842 

PPV: 0.64


0.05
Predicted  False  True 
Actual                 
0            401    484
1              6    848 

PPV: 0.64


0.06
Predicted  False  True 
Actual                 
0            388    489
1              6    836 

PPV: 0.63


0.07
Predicted  True
Actual         
0            18
1            68 

PPV: 0.79


0.08
Predicted  False  True 
Actual                 
0            438    446
1              7    859 

PPV: 0.66


0.09
Predicted  False  True 
Actual                 
0            435    438
1              6 



0.73
Predicted  False  True 
Actual                 
0            805     76
1            321    527 

PPV: 0.87


0.74
Predicted  False  True 
Actual                 
0            779    105
1            336    512 

PPV: 0.83


0.75
Predicted  False  True 
Actual                 
0            812     73
1            362    501 

PPV: 0.87


0.76
Predicted  False  True 
Actual                 
0            795     95
1            376    470 

PPV: 0.83


0.77
Predicted  False  True 
Actual                 
0             14      8
1             25     47 

PPV: 0.85


0.78
Predicted  False  True 
Actual                 
0            819     61
1            429    417 

PPV: 0.87


0.79
Predicted  False  True 
Actual                 
0            799     80
1            417    425 

PPV: 0.84


0.8
Predicted  False  True 
Actual                 
0            828     59
1            474    368 

PPV: 0.86


0.81
Predicted  False  True 
Actual                 
0            817     63
1 

  PPV = (TP / (TP + FP))


In [6]:
#find best p_cutoff for 1 cluster model
PPV_values = []
p_cutoffs = []

for p_cutoff in np.arange(0.0,1.0,0.01):
    df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
    df.drop(['s_score', 'receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop', 'min_feat'], 1, inplace=True)
    df.fillna(-99999)

    #split data into quality/not quality sets
    q_ph4s = df[df['quality'] == 1]
    nq_ph4s = df[df['quality'] != 1]

    #ensure that there is an equal number of nq ph4s
    nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

    #merge arrays prior to TTS
    frames = [q_ph4s, nq_ph4s]
    df = pd.concat(frames)

    #x is features, y is classes
    x = df.drop('quality', 1)
    y = df.quality

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
    
    X_train_clstrs, X_test_clstrs = get_clusters(x_train, x_test, 2)

    #print(x_train_clstrs)

    X_train_scaled, X_test_scaled = scale_features(X_train_clstrs, X_test_clstrs)

    #print(X_train_scaled)

    # to divide the df by cluster, we need to ensure we use the correct class labels, we'll use pandas to do that
    train_clusters = X_train_scaled.copy()
    test_clusters = X_test_scaled.copy()
    train_clusters['y'] = y_train
    test_clusters['y'] = y_test

    #print(y_train)
    #print(train_clusters)

    # locate the "0" cluster
    train_0 = train_clusters.loc[train_clusters.clusters < 0] # after scaling, 0 went negtive
    test_0 = test_clusters.loc[test_clusters.clusters < 0]
    y_train_0 = train_0.y.values
    y_test_0 = test_0.y.values
    # locate the "1" cluster
    train_1 = train_clusters.loc[train_clusters.clusters > 0] # after scaling, 1 dropped slightly
    test_1 = test_clusters.loc[test_clusters.clusters > 0]
    y_train_1 = train_1.y.values
    y_test_1 = test_1.y.values
    # the base dataset has no "clusters" feature
    X_train_base = X_train_scaled.drop(columns=['clusters'])
    X_test_base = X_test_scaled.drop(columns=['clusters'])
    # drop the targets from the training set
    X_train_0 = train_0.drop(columns=['y'])
    X_test_0 = test_0.drop(columns=['y'])
    X_train_1 = train_1.drop(columns=['y'])
    X_test_1 = test_1.drop(columns=['y'])

    #print(X_train_0)
    #print(len(X_test_1))
    
    ##0 cluster LR model
    logisticRegr1 = LogisticRegression(max_iter = 5000)
    logisticRegr1.fit(X_train_1, y_train_1)

    y_pred = (logisticRegr1.predict_proba(X_test_1)[:,1] >= p_cutoff).astype(bool) # set threshold as p_cutoff
    confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1])
    #print(confmat)

    confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])

    PPV = (TP / (TP + FP))

    cm = pd.crosstab(y_test_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
    if p_cutoff > 0.0:
        print('\n')
        
    print(p_cutoff)
    print(cm,'\n')
    print('PPV:', format(PPV, '.2f'))
    
    p_cutoffs.append(p_cutoff)
    PPV_values.append(PPV)
    
res = {p_cutoffs[i]: PPV_values[i] for i in range(len(p_cutoffs))}

#print(res)

#Find item with Max Value in Dictionary
itemMaxValue = max(res.items(), key=lambda x: x[1])
print('Maximum Value in Dictionary : ', itemMaxValue[1])
listOfKeys = list()
# Iterate over all the items in dictionary to find keys with max value
for key, value in res.items():
    if value == itemMaxValue[1]:
        listOfKeys.append(key)
print('Keys with maximum Value in Dictionary : ', listOfKeys)

0.0
Predicted  True
Actual         
0            24
1            63 

PPV: 0.72


0.01
Predicted  False  True 
Actual                 
0            287    591
1              3    849 

PPV: 0.59


0.02
Predicted  True
Actual         
0            25
1            50 

PPV: 0.67


0.03
Predicted  True
Actual         
0            20
1            60 

PPV: 0.75


0.04
Predicted  True
Actual         
0            12
1            68 

PPV: 0.85


0.05
Predicted  True
Actual         
0            25
1            70 

PPV: 0.74


0.06
Predicted  True
Actual         
0            25
1            72 

PPV: 0.74


0.07
Predicted  True
Actual         
0            18
1            50 

PPV: 0.74


0.08
Predicted  False  True 
Actual                 
0              1     19
1              0     64 

PPV: 0.77


0.09
Predicted  True
Actual         
0            13
1            65 

PPV: 0.83


0.1
Predicted  True
Actual         
0            28
1            72 

PPV: 0.72


0.11
Predicted  False  Tr



0.79
Predicted  False  True 
Actual                 
0            836     53
1            443    411 

PPV: 0.89


0.8
Predicted  False  True 
Actual                 
0             10      3
1             25     33 

PPV: 0.92


0.81
Predicted  False  True 
Actual                 
0            818     69
1            484    356 

PPV: 0.84


0.8200000000000001
Predicted  False  True 
Actual                 
0             16     10
1             24     45 

PPV: 0.82


0.8300000000000001
Predicted  False  True 
Actual                 
0             10      3
1             39     28 

PPV: 0.90


0.84
Predicted  False  True 
Actual                 
0            835     50
1            493    350 

PPV: 0.88


0.85
Predicted  False  True 
Actual                 
0              8      2
1             41     27 

PPV: 0.93


0.86
Predicted  False  True 
Actual                 
0             25      6
1             40     30 

PPV: 0.83


0.87
Predicted  False  True 
Actual                

  PPV = (TP / (TP + FP))




0.89
Predicted  False
Actual          
0             20
1             68 

PPV: nan


0.9
Predicted  False  True 
Actual                 
0             20      0
1             54      9 

PPV: 1.00


0.91
Predicted  False  True 
Actual                 
0            847     30
1            643    203 

PPV: 0.87


  PPV = (TP / (TP + FP))




0.92
Predicted  False
Actual          
0             17
1             62 

PPV: nan


0.93
Predicted  False  True 
Actual                 
0             17      0
1             45      2 

PPV: 1.00


0.9400000000000001
Predicted  False  True 
Actual                 
0             15      0
1             64      1 

PPV: 1.00


0.9500000000000001
Predicted  False  True 
Actual                 
0             21      0
1             67      1 

PPV: 1.00


  PPV = (TP / (TP + FP))




0.96
Predicted  False
Actual          
0             28
1             70 

PPV: nan


  PPV = (TP / (TP + FP))




0.97
Predicted  False
Actual          
0             12
1             66 

PPV: nan


  PPV = (TP / (TP + FP))




0.98
Predicted  False
Actual          
0             17
1             70 

PPV: nan


0.99
Predicted  False
Actual          
0             19
1             73 

PPV: nan
Maximum Value in Dictionary :  1.0
Keys with maximum Value in Dictionary :  [0.78, 0.88, 0.9, 0.93, 0.9400000000000001, 0.9500000000000001]


  PPV = (TP / (TP + FP))


In [21]:
df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
df.drop(['s_score','receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
df.fillna(-99999)

predictors = list(df.columns)
predictors = predictors[:-1]

print('Predictors:', predictors,'\n')

np.random.seed(42)

#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]

#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

#x is features, y is classes
x = df.drop('quality', 1)
y = df.quality

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)


print("x_train Q ph4s:", y_train.sum(),'\n')
print("x_test Q ph4s:",y_test.sum(),'\n')

X_train_clstrs, X_test_clstrs = get_clusters(x_train, x_test, 2)

#print(x_train_clstrs)

X_train_scaled, X_test_scaled = scale_features(X_train_clstrs, X_test_clstrs)

#print(X_train_scaled)

# to divide the df by cluster, we need to ensure we use the correct class labels, we'll use pandas to do that
train_clusters = X_train_scaled.copy()
test_clusters = X_test_scaled.copy()
train_clusters['y'] = y_train
test_clusters['y'] = y_test

#print(y_train)
#print(train_clusters)

# locate the "0" cluster
train_0 = train_clusters.loc[train_clusters.clusters < 0] # after scaling, 0 went negtive
test_0 = test_clusters.loc[test_clusters.clusters < 0]
y_train_0 = train_0.y.values
y_test_0 = test_0.y.values
# locate the "1" cluster
train_1 = train_clusters.loc[train_clusters.clusters > 0] # after scaling, 1 dropped slightly
test_1 = test_clusters.loc[test_clusters.clusters > 0]
y_train_1 = train_1.y.values
y_test_1 = test_1.y.values
# the base dataset has no "clusters" feature
X_train_base = X_train_scaled.drop(columns=['clusters'])
X_test_base = X_test_scaled.drop(columns=['clusters'])
# drop the targets from the training set
X_train_0 = train_0.drop(columns=['y'])
X_test_0 = test_0.drop(columns=['y'])
X_train_1 = train_1.drop(columns=['y'])
X_test_1 = test_1.drop(columns=['y'])

#print(X_train_0)
#print(len(X_test_1))

#0 cluster LR model
logisticRegr0 = LogisticRegression(max_iter = 5000)
logisticRegr0.fit(X_train_0, y_train_0)

y_pred = (logisticRegr0.predict_proba(X_test_0)[:,1] >= 0.94).astype(bool)
confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('0 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'),'\n')

#1 cluster LR model
logisticRegr1 = LogisticRegression(max_iter = 5000)
logisticRegr1.fit(X_train_1, y_train_1)

y_pred = (logisticRegr1.predict_proba(X_test_1)[:,1] >= 0.88).astype(bool)
confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('1 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

x_train Q ph4s: 2711 

x_test Q ph4s: 910 

0 cluster model

Predicted  False  True 
Actual                 
0            869     14
1            738    110 

PPV: 0.89 

1 cluster model

Predicted  False  True 
Actual                 
0             17      1
1             42     20 

PPV: 0.95


In [22]:
#CLassify external data (score-based pharmacophore models)
#Steps for creating "refined" external dataset:
#1. delete max_feat >15
#2. delete min_feat >5
#3.
ext_df = pd.read_csv('..\..\data\score_based_alldata_binary.csv')
ext_df.drop(['Receptor', 'Score Type','Enrichment',  'hyd', 'don', 'acc', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc', 'don_prop', 'acc_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
ext_df.fillna(-99999)

x = ext_df.drop('quality', 1)
y = ext_df.quality

print("score-based Q ph4s:", y.sum(),'\n')

clustering = KMeans(n_clusters=2, random_state=8675309)
clustering.fit(x)

train_labels = clustering.labels_

X_clstrs = x.copy()
X_clstrs['clusters'] = train_labels

X_scaled = scale_features_single(X_clstrs)
ext_clusters = X_scaled.copy()
ext_clusters['y'] = y

# locate the "0" cluster
ext_0 = ext_clusters.loc[ext_clusters.clusters < 0] # after scaling, 0 went negtive
y_ext_0 = ext_0.y.values

# locate the "1" cluster
ext_1 = ext_clusters.loc[ext_clusters.clusters > 0] # after scaling, 0 went negtive
y_ext_1 = ext_1.y.values

# drop the targets from the external set
X_ext_0 = ext_0.drop(columns=['y'])
X_ext_1 = ext_1.drop(columns=['y'])

#predict based on 0 cluster model
y_pred = (logisticRegr0.predict_proba(X_ext_0)[:,1] >= 0.95).astype(bool)
confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_ext_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('0 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'),'\n')

#1 cluster LR model
y_pred = (logisticRegr1.predict_proba(X_ext_1)[:,1] >= 0.94).astype(bool)
confmat = confusion_matrix(y_ext_1, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_ext_1, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_ext_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('1 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

print(len(ext_1))

score-based Q ph4s: 57 

0 cluster model

Predicted  False
Actual          
0             11 

PPV: nan 

1 cluster model

Predicted  False  True 
Actual                 
0             74     14
1             48      9 

PPV: 0.39
145


  PPV = (TP / (TP + FP))
