In [2]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn import datasets, svm, pipeline
from sklearn.kernel_approximation import (RBFSampler, Nystroem)

scaler = MinMaxScaler()

import pandas as pd


In [22]:
df = pd.read_csv('data\_All_Receptors_runs_1_2_3_binary.csv')
df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
df.fillna(-99999)

predictors = list(df.columns)
predictors = predictors[:-1]

print('Predictors:', predictors,'\n')

np.random.seed(42)

#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]

#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

#x is features, y is classes
x = df.drop('quality', 1)
y = df.quality

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

#scale train/test feature data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

#model goes here
# Create a classifier: a support vector classifier
kernel_svm = svm.SVC(gamma=.2)
linear_svm = svm.LinearSVC()

# create pipeline from kernel approximation
# and linear svm
feature_map_fourier = RBFSampler(gamma=.2, random_state=1)
feature_map_nystroem = Nystroem(gamma=.2, random_state=1)
fourier_approx_svm = pipeline.Pipeline([("feature_map", feature_map_fourier),
                                        ("svm", svm.LinearSVC())])
 
nystroem_approx_svm = pipeline.Pipeline([("feature_map", feature_map_nystroem),
                                        ("svm", svm.LinearSVC())])

# fit and predict using linear and kernel svm:
kernel_svm.fit(x_train_scaled, y_train)
y_pred = kernel_svm.predict(x_test_scaled)

confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])
PPV = (TP / (TP + FP))
cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
print(cm,'\n')
print('PPV:', format(PPV, '.2f'),'\n')
 
linear_svm.fit(x_train_scaled, y_train)
y_pred = linear_svm.predict(x_test_scaled)

confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])
PPV = (TP / (TP + FP))
cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print('Linear SVM\n')
print(cm,'\n')
print('PPV:', format(PPV, '.2f'),'\n')

Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 's_score', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

Predicted    0    1
Actual             
0          714  187
1          129  781 

PPV: 0.81 

Linear SVM

Predicted    0    1
Actual             
0          751  150
1          226  684 

PPV: 0.82 



In [15]:
#CLassify external data (D2 6LUQ pharmacophore models)
ext_df = pd.read_csv('data\D2_6LUQ_pharmacophores_binary.csv')
ext_df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
ext_df.fillna(-99999)

#use this line to only classify a sample
#ext_df = ext_df.sample(n=50)

x = ext_df.drop('quality', 1)
y = ext_df.quality

x_scaled = scaler.fit_transform(x)

#make prediction with kernel SVM
ext_pred = kernel_svm.predict(x_scaled)

confmat = confusion_matrix(y, ext_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y, ext_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predicted     0   1
Actual             
0          4832  21
1            78   0 

PPV: 0.00
