In [2]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from typing import Tuple
from sklearn import model_selection
from sklearn.metrics import classification_report

import pandas as pd

In [3]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale

In [4]:
# Standardise each feature (z-scores) using statistics from the training set only.
def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Standardise the columns of a train/test pair with a scaler fitted on the training data.

    A StandardScaler is fitted on every column of ``X_train`` and then applied to
    the same columns of both frames, so the test set is transformed with the
    training mean and standard deviation.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features; all of its columns are scaled.
    X_test : pd.DataFrame
        Test features; must contain every column present in ``X_train``.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        Scaled copies of ``X_train`` and ``X_test``. The inputs are left
        unmodified, so re-running the calling cell does not scale already-scaled
        data and no SettingWithCopyWarning is raised for sliced frames.
    """
    scaler = StandardScaler()
    to_scale = list(X_train.columns)
    scaler.fit(X_train[to_scale])

    # Work on copies so the caller's frames keep their raw values.
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[to_scale] = scaler.transform(X_train[to_scale])

    # Apply the training-set statistics to the test set.
    X_test_scaled[to_scale] = scaler.transform(X_test[to_scale])

    return X_train_scaled, X_test_scaled



In [7]:
# --- Load data -------------------------------------------------------------
# Raw string: same Windows-style relative path as before, but without relying on
# invalid escape sequences (\., \d, \_) that newer Python versions warn about.
df = pd.read_csv(r'..\..\data\_All_Receptors_runs_1_2_3_binary.csv')

# Drop the columns that are not used as predictors. `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
df = df.drop(columns=['min_feat','s_score','receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'])
# fillna returns a new frame; the original call discarded it, so missing values
# were never actually replaced.
df = df.fillna(-99999)

# Every column except the last one (the last column is taken to be the label).
predictors = list(df.columns)[:-1]

print('Predictors:', predictors,'\n')

# Seed NumPy's global RNG; DataFrame.sample and SGDClassifier fall back on it
# because no explicit random_state is passed to them below.
np.random.seed(42)

# --- Balance classes -------------------------------------------------------
# split data into quality / not-quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]

# down-sample the not-quality rows so both classes have the same size
nq_ph4s = nq_ph4s.sample(n=len(q_ph4s))

# merge the two classes again prior to the train/test split
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

# --- Train / test split ----------------------------------------------------
# x is features, y is classes
x = df.drop(columns='quality')
y = df.quality

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)


print("x_train Q ph4s:", y_train.sum(),'\n')
print("x_test Q ph4s:",y_test.sum(),'\n')

# Pass copies so the raw splits stay unscaled even if scale_features writes
# into its arguments.
X_train_scaled, X_test_scaled = scale_features(x_train.copy(), x_test.copy())

# --- Logistic-regression model trained with SGD ------------------------------
# NOTE: scikit-learn >= 1.1 renames this loss to "log_loss" ("log" was removed
# in 1.3); switch the string when upgrading scikit-learn.
sgdc = SGDClassifier(loss="log", penalty="l2", max_iter=1000, tol=1e-3, class_weight='balanced')
sgdc.fit(X_train_scaled, y_train)

y_pred = sgdc.predict(X_test_scaled)

# --- Evaluation ------------------------------------------------------------
# With labels=[0, 1] the flattened matrix is ordered [TN, FP, FN, TP].
confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
FP = confmat[1]
TP = confmat[3]

# Positive predictive value (precision); NaN when nothing was predicted positive.
PPV = TP / (TP + FP) if (TP + FP) else float('nan')

cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'),'\n')

Predictors: ['Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

x_train Q ph4s: 2711 

x_test Q ph4s: 910 

Predicted    0    1
Actual             
0          703  198
1          142  768 

PPV: 0.80 



In [8]:
def scale_features_single(X: pd.DataFrame) -> pd.DataFrame:
    """
    Standardise every column of ``X`` to z-scores using statistics computed from ``X`` itself.

    A new StandardScaler is fitted on ``X`` and applied to the same data; no
    previously fitted scaler is involved.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame; all of its columns are scaled.

    Returns
    -------
    pd.DataFrame
        A scaled copy of ``X`` (a single frame, not a tuple). The input is left
        unmodified so repeated calls do not re-scale already-scaled data.
    """
    scaler = StandardScaler()
    to_scale = list(X.columns)

    # Write the z-scores into a copy so the caller's frame keeps its raw values.
    X_scaled = X.copy()
    X_scaled[to_scale] = scaler.fit_transform(X[to_scale])

    return X_scaled

def classify_ext_data(subset, model=None):
    """
    Classify an external (score-based pharmacophore) data set and print the results.

    Reads the CSV for the requested subset, drops the unused columns, scales the
    remaining features with ``scale_features_single``, predicts the ``quality``
    label and prints the predictor names, the number of positive rows, the
    confusion table and the positive predictive value (PPV).

    Parameters
    ----------
    subset : str
        Which external file to load: ``"moe"``, ``"EF"``, ``"GH"`` or ``"all"``.
    model : optional
        Fitted classifier exposing ``predict``. Defaults to the notebook-level
        ``sgdc`` trained in an earlier cell.

    Returns
    -------
    None
        Results are printed only.

    Raises
    ------
    ValueError
        If ``subset`` is not one of the supported keys (previously this surfaced
        as an UnboundLocalError when the data frame was first used).
    """
    # Raw strings keep the original Windows-style paths byte-identical while
    # avoiding invalid escape sequences such as \. , \d and \s.
    data_files = {
        "moe": r'..\..\data\score_based_moefrags_data_binary.csv',
        "EF": r'..\..\data\score_based_efdata_binary.csv',
        "GH": r'..\..\data\score_based_ghdata_binary.csv',
        "all": r'..\..\data\score_based_alldata_binary.csv',
    }
    if subset not in data_files:
        raise ValueError(
            "unknown subset {!r}; expected one of {}".format(subset, sorted(data_files))
        )

    # Fall back to the globally trained model when none is supplied.
    classifier = sgdc if model is None else model

    ext_df = pd.read_csv(data_files[subset])

    # `columns=` replaces the positional axis argument removed in pandas 2.0.
    ext_df = ext_df.drop(columns=['min_feat','Receptor', 'Score Type','Enrichment',  'hyd', 'don', 'acc', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc', 'don_prop', 'acc_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'])
    # fillna returns a new frame; the original call discarded the result.
    ext_df = ext_df.fillna(-99999)

    x = ext_df.drop(columns='quality')
    y = ext_df.quality

    # Every column except the last one (the last column is taken to be the label).
    predictors = list(ext_df.columns)[:-1]
    print('Predictors:', predictors,'\n')

    print("score based Q ph4s:", y.sum(),'\n')

    # NOTE(review): the external data is standardised with its own mean/std
    # rather than with the scaler fitted on the training data, so the features
    # may not be on the scale the model was trained on - confirm this is intended.
    X_scaled = scale_features_single(x)

    # predict based on 0 cluster model
    y_pred = classifier.predict(X_scaled)

    # With labels=[0, 1] the flattened matrix is ordered [TN, FP, FN, TP].
    confmat = confusion_matrix(y, y_pred, labels=[0,1]).ravel()
    FP = confmat[1]
    TP = confmat[3]

    # Positive predictive value (precision); NaN when nothing was predicted positive.
    PPV = TP / (TP + FP) if (TP + FP) else float('nan')

    cm = pd.crosstab(y, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

    print('0 cluster model\n')
    print(cm,'\n')

    print('PPV:', format(PPV, '.2f'),'\n')

In [13]:
# Run the external-data evaluation for the "all" subset (score_based_alldata_binary.csv).
classify_ext_data("all")

Predictors: ['Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

score based Q ph4s: 111 

0 cluster model

Predicted   0   1
Actual           
0          28  17
1          90  21 

PPV: 0.55 



In [14]:
# Run the external-data evaluation for the "moe" subset (score_based_moefrags_data_binary.csv).
classify_ext_data("moe")

Predictors: ['Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

score based Q ph4s: 42 

0 cluster model

Predicted   0  1
Actual          
0           9  1
1          33  9 

PPV: 0.90 



In [15]:
# Run the external-data evaluation for the "EF" subset (score_based_efdata_binary.csv).
classify_ext_data("EF")

Predictors: ['Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

score based Q ph4s: 48 

0 cluster model

Predicted   0   1
Actual           
0           2   2
1          37  11 

PPV: 0.85 



In [16]:
# Run the external-data evaluation for the "GH" subset (score_based_ghdata_binary.csv).
classify_ext_data("GH")

Predictors: ['Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

score based Q ph4s: 21 

0 cluster model

Predicted   0  1
Actual          
0          22  9
1          19  2 

PPV: 0.18 

