In [1]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from typing import Tuple
from sklearn import model_selection
from sklearn.metrics import classification_report

import pandas as pd

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale

from sklearn import datasets, svm, pipeline
from sklearn.kernel_approximation import (RBFSampler, Nystroem)

In [3]:
#cluster test/train data
def get_clusters(X_train: pd.DataFrame, X_test: pd.DataFrame, n_clusters: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies k-means clustering to training data to find clusters and predicts them for the test set
    """
    clustering = KMeans(n_clusters=n_clusters, random_state=8675309)
    clustering.fit(X_train)
    # apply the labels
    train_labels = clustering.labels_
    X_train_clstrs = X_train.copy()
    X_train_clstrs['clusters'] = train_labels
    
    # predict labels on the test set
    test_labels = clustering.predict(X_test)
    X_test_clstrs = X_test.copy()
    X_test_clstrs['clusters'] = test_labels
    return X_train_clstrs, X_test_clstrs

#scale each feature
def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X_train.columns.values]
    scaler.fit(X_train[to_scale])
    X_train[to_scale] = scaler.transform(X_train[to_scale])
    
    # predict z-scores on the test set
    X_test[to_scale] = scaler.transform(X_test[to_scale])
    
    return X_train, X_test



In [4]:
df = pd.read_csv('..\data\_All_Receptors_runs_1_2_3_binary.csv')
df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
df.fillna(-99999)

predictors = list(df.columns)
predictors = predictors[:-1]

print('Predictors:', predictors,'\n')

np.random.seed(42)

#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]

#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

#x is features, y is classes
x = df.drop('quality', 1)
y = df.quality

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)


print("x_train Q ph4s:", y_train.sum(),'\n')
print("x_test Q ph4s:",y_test.sum(),'\n')

X_train_clstrs, X_test_clstrs = get_clusters(x_train, x_test, 2)

#print(x_train_clstrs)

X_train_scaled, X_test_scaled = scale_features(X_train_clstrs, X_test_clstrs)

#print(X_train_scaled)

# to divide the df by cluster, we need to ensure we use the correct class labels, we'll use pandas to do that
train_clusters = X_train_scaled.copy()
test_clusters = X_test_scaled.copy()
train_clusters['y'] = y_train
test_clusters['y'] = y_test

#print(y_train)
#print(train_clusters)

# locate the "0" cluster
train_0 = train_clusters.loc[train_clusters.clusters < 0] # after scaling, 0 went negtive
test_0 = test_clusters.loc[test_clusters.clusters < 0]
y_train_0 = train_0.y.values
y_test_0 = test_0.y.values
# locate the "1" cluster
train_1 = train_clusters.loc[train_clusters.clusters > 0] # after scaling, 1 dropped slightly
test_1 = test_clusters.loc[test_clusters.clusters > 0]
y_train_1 = train_1.y.values
y_test_1 = test_1.y.values
# the base dataset has no "clusters" feature
X_train_base = X_train_scaled.drop(columns=['clusters'])
X_test_base = X_test_scaled.drop(columns=['clusters'])
# drop the targets from the training set
X_train_0 = train_0.drop(columns=['y'])
X_test_0 = test_0.drop(columns=['y'])
X_train_1 = train_1.drop(columns=['y'])
X_test_1 = test_1.drop(columns=['y'])

#print(X_train_0)
#print(len(X_test_1))

#0 cluster LR model
kernel_svm0 = svm.SVC(kernel='rbf', gamma='auto', class_weight = 'balanced')
kernel_svm0.fit(X_train_0, y_train_0)

y_pred = (kernel_svm0.predict(X_test_0))
confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('0 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'),'\n')

#1 cluster LR model
kernel_svm1 = svm.SVC(kernel='rbf', gamma='auto', class_weight = 'balanced')
kernel_svm1.fit(X_train_1, y_train_1)

y_pred = (kernel_svm1.predict(X_test_1))
confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('1 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 's_score', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

x_train Q ph4s: 2711 

x_test Q ph4s: 910 

0 cluster model

Predicted    0    1
Actual             
0          672  211
1           45  803 

PPV: 0.79 

1 cluster model

Predicted   0   1
Actual           
0          14   4
1          19  43 

PPV: 0.91


In [5]:
def scale_features_single(X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X.columns.values]
    scaler.fit(X[to_scale])
    X[to_scale] = scaler.transform(X[to_scale])
    
    return X

#CLassify external data (D2 6LUQ pharmacophore models)
#Steps for creating "refined" external dataset:
#1. delete max_feat >15
#2. delete min_feat >5
#3.
ext_df = pd.read_csv('..\data\D2_6LUQ_pharmacophores_binary.csv')
ext_df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
ext_df.fillna(-99999)

x = ext_df.drop('quality', 1)
y = ext_df.quality

print("D2 Q ph4s:", y.sum(),'\n')

clustering = KMeans(n_clusters=2, random_state=8675309)
clustering.fit(x)

train_labels = clustering.labels_

X_clstrs = x.copy()
X_clstrs['clusters'] = train_labels

X_scaled = scale_features_single(X_clstrs)
ext_clusters = X_scaled.copy()
ext_clusters['y'] = y

# locate the "0" cluster
ext_0 = ext_clusters.loc[ext_clusters.clusters < 0] # after scaling, 0 went negtive
y_ext_0 = ext_0.y.values

# locate the "1" cluster
ext_1 = ext_clusters.loc[ext_clusters.clusters > 0] # after scaling, 0 went negtive
y_ext_1 = ext_1.y.values

# drop the targets from the external set
X_ext_0 = ext_0.drop(columns=['y'])
X_ext_1 = ext_1.drop(columns=['y'])

#predict based on 0 cluster model
y_pred = (kernel_svm0.predict(X_ext_0))
confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_ext_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('0 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'),'\n')

#1 cluster LR model
y_pred = (kernel_svm1.predict(X_ext_1))
confmat = confusion_matrix(y_ext_1, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_ext_1, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_ext_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('1 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

print(len(ext_1))

D2 Q ph4s: 78 

0 cluster model

Predicted     0     1
Actual               
0          2450  2134
1             4    67 

PPV: 0.03 

1 cluster model

Predicted    0   1
Actual            
0          202  67
1            4   3 

PPV: 0.04
276
