In [1]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from typing import Tuple
from sklearn import model_selection
from sklearn.metrics import classification_report

import pandas as pd

import csv

randomstate = 1

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale

In [3]:
#cluster test/train data
def get_clusters(X_train: pd.DataFrame, X_test: pd.DataFrame, n_clusters: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies k-means clustering to training data to find clusters and predicts them for the test set
    """
    clustering = KMeans(n_clusters=n_clusters, random_state=randomstate)
    clustering.fit(X_train)
    # apply the labels to the training set
    train_labels = clustering.labels_
    X_train_clstrs = X_train.copy()
    X_train_clstrs['clusters'] = train_labels
    
    #write ext_clusters to csv
    X_train_clstrs.to_csv('OLD/X_train_clstrs.csv')
    
    # predict labels on the test set
    test_labels = clustering.predict(X_test)
    X_test_clstrs = X_test.copy()
    X_test_clstrs['clusters'] = test_labels
    return X_train_clstrs, X_test_clstrs

#scale each feature
def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X_train.columns.values]
    scaler.fit(X_train[to_scale])
    X_train[to_scale] = scaler.transform(X_train[to_scale])
    
    # predict z-scores on the test set
    X_test[to_scale] = scaler.transform(X_test[to_scale])
    
    return X_train, X_test



In [4]:
df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
df.drop(['min_feat','receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'donacc_prop'], 1, inplace=True)
df.fillna(-99999)
    
predictors = list(df.columns)
predictors = predictors[:-1]
    
print('Predictors:', predictors,'\n')
    
np.random.seed(randomstate)

#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]
    
#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=2*len(q_ph4s))

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

#x is features, y is classes
x = df.drop('quality', 1)
y = df.quality
    
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=randomstate)
    
    
print("x_train Q ph4s:", y_train.sum(),'\n')
print("x_test Q ph4s:",y_test.sum(),'\n')
    
X_train_clstrs, X_test_clstrs = get_clusters(x_train, x_test, 4)

pd.set_option("display.max_rows", None, "display.max_columns", None)
#print(X_train_clstrs['clusters'].unique())
clusters = X_train_clstrs['clusters']

X_train_scaled, X_test_scaled = scale_features(X_train_clstrs, X_test_clstrs)
    
#print(X_train_scaled)
    
# to divide the df by cluster, we need to ensure we use the correct class labels, we'll use pandas to do that
train_clusters = X_train_scaled.copy()
test_clusters = X_test_scaled.copy()
train_clusters['y'] = y_train
test_clusters['y'] = y_test

uniq_clusters = train_clusters['clusters'].unique()
uniqs = uniq_clusters.tolist()
uniqs.sort()
print(uniqs)

#print(y_train)
#print(train_clusters['clusters'])

#print(type(clusters))
#print(type(train_clusters['clusters']))
#frames = [clusters, train_clusters['clusters']]
#df = pd.concat(frames, axis=1)
#print(type(df))
#print(df)
#df.to_csv('4clusters.csv')

# locate the "0" cluster
train_0 = train_clusters.loc[train_clusters.clusters <= uniqs[0]] # after scaling, 0 went to -2.187
test_0 = test_clusters.loc[test_clusters.clusters <= uniqs[0]]
y_train_0 = train_0.y.values
y_test_0 = test_0.y.values
# locate the "1" cluster
train_1 = train_clusters.loc[(train_clusters.clusters <= uniqs[1]) & (train_clusters.clusters > uniqs[0])] # after scaling, 1 went to -0.62
test_1 = test_clusters.loc[(test_clusters.clusters <= uniqs[1]) & (test_clusters.clusters > uniqs[0])]
y_train_1 = train_1.y.values
y_test_1 = test_1.y.values

# locate the "2" cluster
train_2 = train_clusters.loc[(train_clusters.clusters <= uniqs[2]) & (train_clusters.clusters > uniqs[1])] # after scaling, 2 went to 0.945
test_2 = test_clusters.loc[(test_clusters.clusters <= uniqs[2]) & (test_clusters.clusters > uniqs[1])]
y_train_2 = train_2.y.values
y_test_2 = test_2.y.values

# locate the "3" cluster
train_3 = train_clusters.loc[train_clusters.clusters >= uniqs[3]] # after scaling, 3 went to 2.51
test_3 = test_clusters.loc[test_clusters.clusters >= uniqs[3]]
y_train_3 = train_3.y.values
y_test_3 = test_3.y.values

# drop the targets from the training set
X_train_0 = train_0.drop(columns=['y'])
X_test_0 = test_0.drop(columns=['y'])
X_train_1 = train_1.drop(columns=['y'])
X_test_1 = test_1.drop(columns=['y'])
X_train_2 = train_2.drop(columns=['y'])
X_test_2 = test_2.drop(columns=['y'])
X_train_3 = train_3.drop(columns=['y'])
X_test_3 = test_3.drop(columns=['y'])

print('X_train 1/2/3/4 cluster values\n')
print('-------------------------------\n')
print(X_train_0['clusters'].unique())
print(X_train_1['clusters'].unique())
print(X_train_2['clusters'].unique())
print(X_train_3['clusters'].unique(),'\n')

#print(X_train_0)
#print(len(X_test_1))

    
#0 cluster LR model
sgdc0 = SGDClassifier(loss="log", penalty="l1", max_iter=1000, tol=1e-2, class_weight='balanced')
sgdc0.fit(X_train_0, y_train_0)
    
y_pred = (sgdc0.predict(X_test_0))
confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1])
#print(confmat)
    
confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])
    
PPV = (TP / (TP + FP))
    
cm = pd.crosstab(y_test_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
print('0 cluster model\n')
print(cm,'\n')
    
print('PPV:', format(PPV, '.2f'),'\n')
    
#1 cluster LR model
sgdc1 = SGDClassifier(loss="log", penalty="l1", max_iter=1000, tol=1e-2, class_weight='balanced')
sgdc1.fit(X_train_1, y_train_1)
    
y_pred = (sgdc1.predict(X_test_1))
confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1])
    
confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])
    
PPV = (TP / (TP + FP))
    
cm = pd.crosstab(y_test_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
print('1 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

#print(X_train_0)
#print(len(X_train_2))

#2 cluster LR model
sgdc2 = SGDClassifier(loss="log", penalty="l1", max_iter=1000, tol=1e-2, class_weight='balanced')
sgdc2.fit(X_train_2, y_train_2)
    
y_pred = (sgdc2.predict(X_test_2))
confmat = confusion_matrix(y_test_2, y_pred, labels=[0,1])
#print(confmat)
    
confmat = confusion_matrix(y_test_2, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])
    
PPV = (TP / (TP + FP))
    
cm = pd.crosstab(y_test_2, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
print('2 cluster model\n')
print(cm,'\n')
    
print('PPV:', format(PPV, '.2f'),'\n')
    
#3 cluster LR model
sgdc3 = SGDClassifier(loss="log", penalty="l1", max_iter=1000, tol=1e-2, class_weight='balanced')
sgdc3.fit(X_train_3, y_train_3)
    
y_pred = (sgdc3.predict(X_test_3))
confmat = confusion_matrix(y_test_3, y_pred, labels=[0,1])
#print(confmat)
    
confmat = confusion_matrix(y_test_3, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])
    
PPV = (TP / (TP + FP))
    
cm = pd.crosstab(y_test_3, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
print('3 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

FileNotFoundError: [Errno 2] No such file or directory: '..\\..\\data\\_All_Receptors_runs_1_2_3_binary.csv'

In [5]:
def scale_features_single(X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X.columns.values]
    scaler.fit(X[to_scale])
    X[to_scale] = scaler.transform(X[to_scale])
    
    return X

def classify_ext_data(subset):
    
    #CLassify external data (score based pharmacophore models)
    if subset == "moe":
        ext_df = pd.read_csv('..\..\data\score_based_moefrags_data_binary.csv')
    elif subset == "EF":
        ext_df = pd.read_csv('..\..\data\score_based_efdata_binary.csv')
    elif subset == "GH":
        ext_df = pd.read_csv('..\..\data\score_based_ghdata_binary.csv')
    elif subset == "rec_ef":
        ext_df = pd.read_csv('..\..\data\score_based_recefdata_binary.csv')
    elif subset == "rec_gh":
        ext_df = pd.read_csv('..\..\data\score_based_recghdata_binary.csv')
    elif subset == "moe_ef_gh":
        ext_df = pd.read_csv('..\..\data\score_based_moeefgh_data_binary.csv')
    elif subset == "all":
        ext_df = pd.read_csv('..\..\data\score_based_alldata_binary.csv')
        
    receptors = ext_df.Receptor
    hits_actual = ext_df.Hits
    ext_df.drop(['min_feat','Receptor', 'Score Type','Enrichment',  'hyd', 'don', 'acc', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc', 'acc_prop', 'donhyd_prop', 'donacc_prop'], 1, inplace=True)
    
    #drop extra column from data with searches
    if subset == "moe_searches":
        ext_df.drop(['search_features'], 1, inplace=True)
        
    ext_df.fillna(-99999)
    x = ext_df.drop('quality', 1)
    y = ext_df.quality
    
    predictors = list(ext_df.columns)
    predictors = predictors[:-1]
    print('Predictors:', predictors,'\n')
    
    print("score based Q ph4s:", y.sum(),'\n')
    
    #cluster training data
    clustering = KMeans(n_clusters=4, random_state=randomstate)
    clustering.fit(x_train)
    # apply the labels to the training set
    train_labels = clustering.labels_
    X_train_clstrs = x_train.copy()
    X_train_clstrs['clusters'] = train_labels
    
    # predict labels on the external set
    ext_labels = clustering.predict(x)
    X_clstrs = x.copy()
    X_clstrs['clusters'] = ext_labels
    
    X_scaled = scale_features_single(X_clstrs)
    ext_clusters = X_scaled.copy()
    ext_clusters['y'] = y
    
    #add receptors and hits_actual columns back prior to 0/1/2/3 split
    ext_clusters['Receptor'] = receptors
    ext_clusters['hits_actual'] = hits_actual
    
    #write ext_clusters to csv
    X_train_clstrs.to_csv('X_train_clstrs2.csv')
    
    # locate the "0" cluster
    ext_0 = ext_clusters.loc[ext_clusters.clusters <= uniqs[0]] # after scaling, 0 went negtive
    y_ext_0 = ext_0.y.values
    ext_0_receptors = ext_0.Receptor
    ext_0_hits_actual = ext_0.hits_actual
    
    # locate the "1" cluster
    ext_1 = ext_clusters.loc[(ext_clusters.clusters <= uniqs[1]) & (ext_clusters.clusters > uniqs[0])] # after scaling, 0 went negtive
    y_ext_1 = ext_1.y.values
    ext_1_receptors = ext_1.Receptor
    ext_1_hits_actual = ext_1.hits_actual
    
    # locate the "2" cluster
    ext_2 = ext_clusters.loc[(ext_clusters.clusters <= uniqs[2]) & (ext_clusters.clusters > uniqs[1])] # after scaling, 0 went negtive
    y_ext_2 = ext_2.y.values
    ext_2_receptors = ext_2.Receptor
    ext_2_hits_actual = ext_2.hits_actual
    
    # locate the "3" cluster
    ext_3 = ext_clusters.loc[ext_clusters.clusters > uniqs[3] ] # after scaling, 0 went negtive
    y_ext_3 = ext_3.y.values
    ext_3_receptors = ext_3.Receptor
    ext_3_hits_actual = ext_3.hits_actual

    # drop the targets from each external set
    X_ext_0 = ext_0.drop(columns=['y', 'Receptor', 'hits_actual'])
    X_ext_1 = ext_1.drop(columns=['y', 'Receptor', 'hits_actual'])
    X_ext_2 = ext_2.drop(columns=['y', 'Receptor', 'hits_actual'])
    X_ext_3 = ext_3.drop(columns=['y', 'Receptor', 'hits_actual'])
    
    # drop receptor column from each external set
    
    #print(len(X_ext_0))
    #print(len(X_ext_1))
    #print(len(X_ext_2))
    #print(len(X_ext_3))
    
    #predict based on 0 cluster model
    print('0 cluster model\n')
    if len(X_ext_0) == 0:
        print('No cluster 0 data.\n')
    else:
        y_pred = (sgdc0.predict(X_ext_0))
        confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1])
        #print(confmat)

        confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1]).ravel()
        FP = (confmat[1])
        TP = (confmat[3])

        PPV = (TP / (TP + FP))

        cm = pd.crosstab(y_ext_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

    
        print(cm,'\n')

        print('PPV:', format(PPV, '.2f'),'\n')
        
        X_ext_0['Receptor'] = ext_0_receptors
        X_ext_0['hits_actual'] = ext_0_hits_actual
        X_ext_0['quality'] = y_ext_0
        X_ext_0['quality_pred'] = y_pred
        X_ext_0.to_csv('results/'+subset+'/0cluster_results.csv')

    print('1 cluster model\n')
    if len(X_ext_1) == 0:
        print('No cluster 1 data.\n')
    else:
        y_pred = (sgdc1.predict(X_ext_1))
        confmat = confusion_matrix(y_ext_1, y_pred, labels=[0,1])
        #print(confmat)

        confmat = confusion_matrix(y_ext_1, y_pred, labels=[0,1]).ravel()
        FP = (confmat[1])
        TP = (confmat[3])

        PPV = (TP / (TP + FP))

        cm = pd.crosstab(y_ext_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

    
        print(cm,'\n')

        print('PPV:', format(PPV, '.2f'),'\n')
        
        X_ext_1['Receptor'] = ext_1_receptors
        X_ext_1['hits_actual'] = ext_1_hits_actual
        X_ext_1['quality'] = y_ext_1
        X_ext_1['quality_pred'] = y_pred
        X_ext_1.to_csv('results/'+subset+'/1cluster_results.csv')
        
    print('2 cluster model\n')
    if len(X_ext_2) == 0:
        print('No cluster 2 data.\n')
    else:
        y_pred = (sgdc2.predict(X_ext_2))
        confmat = confusion_matrix(y_ext_2, y_pred, labels=[0,1])
        #print(confmat)

        confmat = confusion_matrix(y_ext_2, y_pred, labels=[0,1]).ravel()
        FP = (confmat[1])
        TP = (confmat[3])

        PPV = (TP / (TP + FP))

        cm = pd.crosstab(y_ext_2, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

    
        print(cm,'\n')

        print('PPV:', format(PPV, '.2f'),'\n')
        
        X_ext_2['Receptor'] = ext_2_receptors
        X_ext_2['hits_actual'] = ext_2_hits_actual
        X_ext_2['quality'] = y_ext_2
        X_ext_2['quality_pred'] = y_pred
        X_ext_2.to_csv('results/'+subset+'/2cluster_results.csv')
        
    print('3 cluster model\n')
    if len(X_ext_3) == 0:
        print('No cluster 3 data.\n')
    else:
        y_pred = (sgdc3.predict(X_ext_3))
        confmat = confusion_matrix(y_ext_3, y_pred, labels=[0,1])
        #print(confmat)

        confmat = confusion_matrix(y_ext_3, y_pred, labels=[0,1]).ravel()
        FP = (confmat[1])
        TP = (confmat[3])

        PPV = (TP / (TP + FP))

        cm = pd.crosstab(y_ext_3, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

    
        print(cm,'\n')

        print('PPV:', format(PPV, '.2f'),'\n')
        
        X_ext_3['Receptor'] = ext_3_receptors
        X_ext_3['hits_actual'] = ext_3_hits_actual
        X_ext_3['quality'] = y_ext_3
        X_ext_3['quality_pred'] = y_pred
        X_ext_3.to_csv('results/'+subset+'/3cluster_results.csv')

In [6]:
classify_ext_data("all")

FileNotFoundError: [Errno 2] No such file or directory: '..\\..\\data\\score_based_alldata_binary.csv'

In [97]:
classify_ext_data("moe")

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 16 

0 cluster model

No cluster 0 data.

1 cluster model

Predicted  0  1
Actual         
0          3  1
1          6  6 

PPV: 0.86 

2 cluster model

Predicted   0   1
Actual           
0          10  17
1           2   2 

PPV: 0.11 

3 cluster model

No cluster 3 data.



Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.



In [98]:
classify_ext_data("EF")

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 20 

0 cluster model

No cluster 0 data.

1 cluster model

Predicted  0  1
Actual         
0          2  0
1          5  4 

PPV: 1.00 

2 cluster model

Predicted   0   1
Actual           
0          16  12
1           4   7 

PPV: 0.37 

3 cluster model

No cluster 3 data.



Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.



In [99]:
classify_ext_data("GH")

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 21 

0 cluster model

No cluster 0 data.

1 cluster model

Predicted  0  1
Actual         
0          3  1
1          9  6 

PPV: 0.86 

2 cluster model

Predicted  0   1
Actual          
0          8  15
1          3   3 

PPV: 0.17 

3 cluster model

No cluster 3 data.



Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.



In [100]:
classify_ext_data("rec_ef")

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 13 

0 cluster model

No cluster 0 data.

1 cluster model

Predicted  0  1
Actual         
0          1  0
1          4  4 

PPV: 1.00 

2 cluster model

Predicted   0   1
Actual           
0          18  18
1           1   4 

PPV: 0.18 

3 cluster model

No cluster 3 data.



Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.



In [101]:
classify_ext_data("rec_gh")

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 15 

0 cluster model

No cluster 0 data.

1 cluster model

Predicted  0  1
Actual         
0          1  2
1          6  2 

PPV: 0.50 

2 cluster model

Predicted  0   1
Actual          
0          9  22
1          6   1 

PPV: 0.04 

3 cluster model

No cluster 3 data.



Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.



In [65]:
#save models
#import joblib
#from joblib import dump, load

#joblib.dump(sgdc0 , 'model_sgdc0')
#joblib.dump(sgdc0 , 'model_sgdc1')
#joblib.dump(sgdc0 , 'model_sgdc2')
#joblib.dump(sgdc0 , 'model_sgdc3')

['model_sgdc3']