In [26]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from typing import Tuple
from sklearn import model_selection
from sklearn.metrics import classification_report

import pandas as pd

import csv

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale

In [3]:
#cluster test/train data
def get_clusters(X_train: pd.DataFrame, X_test: pd.DataFrame, n_clusters: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies k-means clustering to training data to find clusters and predicts them for the test set
    """
    clustering = KMeans(n_clusters=n_clusters, random_state=8675309)
    clustering.fit(X_train)
    # apply the labels
    train_labels = clustering.labels_
    X_train_clstrs = X_train.copy()
    X_train_clstrs['clusters'] = train_labels
    
    # predict labels on the test set
    test_labels = clustering.predict(X_test)
    X_test_clstrs = X_test.copy()
    X_test_clstrs['clusters'] = test_labels
    return X_train_clstrs, X_test_clstrs

#scale each feature
def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X_train.columns.values]
    scaler.fit(X_train[to_scale])
    X_train[to_scale] = scaler.transform(X_train[to_scale])
    
    # predict z-scores on the test set
    X_test[to_scale] = scaler.transform(X_test[to_scale])
    
    return X_train, X_test



In [29]:

df = pd.read_csv('..\..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
df.drop(['receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
df.fillna(-99999)
    
predictors = list(df.columns)
predictors = predictors[:-1]
    
print('Predictors:', predictors,'\n')
    
np.random.seed(42)

#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]
    
#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

#x is features, y is classes
x = df.drop('quality', 1)
y = df.quality
    
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
    
    
print("x_train Q ph4s:", y_train.sum(),'\n')
print("x_test Q ph4s:",y_test.sum(),'\n')
    
X_train_clstrs, X_test_clstrs = get_clusters(x_train, x_test, 4)

pd.set_option("display.max_rows", None, "display.max_columns", None)
#print(X_train_clstrs['clusters'].unique())
clusters = X_train_clstrs['clusters']

X_train_scaled, X_test_scaled = scale_features(X_train_clstrs, X_test_clstrs)
    
#print(X_train_scaled)
    
# to divide the df by cluster, we need to ensure we use the correct class labels, we'll use pandas to do that
train_clusters = X_train_scaled.copy()
test_clusters = X_test_scaled.copy()
train_clusters['y'] = y_train
test_clusters['y'] = y_test

#print(y_train)
#print(train_clusters['clusters'])

#print(type(clusters))
#print(type(train_clusters['clusters']))
#frames = [clusters, train_clusters['clusters']]
#df = pd.concat(frames, axis=1)
#print(type(df))
#print(df)
#df.to_csv('4clusters.csv')

# locate the "0" cluster
train_0 = train_clusters.loc[train_clusters.clusters < -2] # after scaling, 0 went to -2.187
test_0 = test_clusters.loc[test_clusters.clusters < -2]
y_train_0 = train_0.y.values
y_test_0 = test_0.y.values
# locate the "1" cluster
train_1 = train_clusters.loc[train_clusters.clusters < -0.5] # after scaling, 1 went to -0.62
train_1 - train_1.loc[train_1.clusters > -2] #remove 0 cluster data from 1 cluster data
test_1 = test_clusters.loc[test_clusters.clusters < -0.5]
test_1 = test_1.loc[test_1.clusters > -2] #remove 0 cluster data from 1 cluster test data
y_train_1 = train_1.y.values
y_test_1 = test_1.y.values

# locate the "2" cluster
train_2 = train_clusters.loc[train_clusters.clusters < -1] # after scaling, 3 went to 0.945
train_2 - train_2.loc[train_2.clusters > -0.5] #remove 0 cluster data from 1 cluster data
test_2 = test_clusters.loc[test_clusters.clusters < -1]
test_2 = test_2.loc[test_2.clusters > -0.5] #remove 0 cluster data from 1 cluster test data
y_train_2 = train_1.y.values
y_test_2 = test_1.y.values

# locate the "3" cluster
train_1 = train_clusters.loc[train_clusters.clusters < -0.5] # after scaling, 1 went to -0.62
train_1 - train_1.loc[train_1.clusters > -2] #remove 0 cluster data from 1 cluster data
test_1 = test_clusters.loc[test_clusters.clusters > 0]
test_1 = test_1.loc[test_1.clusters > -2] #remove 0 cluster data from 1 cluster test data
y_train_1 = train_1.y.values
y_test_1 = test_1.y.values

# drop the targets from the training set
X_train_0 = train_0.drop(columns=['y'])
X_test_0 = test_0.drop(columns=['y'])
X_train_1 = train_1.drop(columns=['y'])
X_test_1 = test_1.drop(columns=['y'])
    
#print(X_train_0)
#print(len(X_test_1))

    
#0 cluster LR model
sgdc0 = SGDClassifier(loss="log", penalty="l1", max_iter=1000, tol=1e-2, class_weight='balanced')
sgdc0.fit(X_train_0, y_train_0)
    
y_pred = (sgdc0.predict(X_test_0))
confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1])
#print(confmat)
    
confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])
    
PPV = (TP / (TP + FP))
    
cm = pd.crosstab(y_test_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
print('0 cluster model\n')
print(cm,'\n')
    
print('PPV:', format(PPV, '.2f'),'\n')
    
#1 cluster LR model
sgdc1 = SGDClassifier(loss="log", penalty="l1", max_iter=1000, tol=1e-2, class_weight='balanced')
sgdc1.fit(X_train_1, y_train_1)
    
y_pred = (sgdc1.predict(X_test_1))
confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1])
#print(confmat)
    
confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])
    
PPV = (TP / (TP + FP))
    
cm = pd.crosstab(y_test_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
print('1 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'))


Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 's_score', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

x_train Q ph4s: 2711 

x_test Q ph4s: 910 

<class 'pandas.core.frame.DataFrame'>
        clusters  clusters
10257          2  0.945432
89229          1 -0.620962
90509          2  0.945432
26023          1 -0.620962
25185          2  0.945432
88679          1 -0.620962
111700         0 -2.187357
57216          1 -0.620962
62000          1 -0.620962
8251           1 -0.620962
87779          1 -0.620962
2393           1 -0.620962
60579          1 -0.620962
57455          1 -0.620962
87504          1 -0.620962
108623         2  0.945432
104861         1 -0.620962
32473          2  0.945432
85728          1 -0.620962
61073          2  0.945432
5637           2  0.945432
57296          0 -2.187357
41281          1 -0.620962
111786         0 -2.187357
86166          1 -0.620962
54278          2  0.945432
87801         

104610         0 -2.187357


In [5]:
def scale_features_single(X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X.columns.values]
    scaler.fit(X[to_scale])
    X[to_scale] = scaler.transform(X[to_scale])
    
    return X

def classify_ext_data(subset):
    #CLassify external data (score based pharmacophore models)
    if subset == "moe":
        ext_df = pd.read_csv('..\..\data\score_based_moefrags_data_binary.csv')
    elif subset == "EF":
        ext_df = pd.read_csv('..\..\data\score_based_efdata_binary.csv')
    elif subset == "GH":
        ext_df = pd.read_csv('..\..\data\score_based_ghdata_binary.csv')
    elif subset == "all":
        ext_df = pd.read_csv('..\..\data\score_based_alldata_binary.csv')
    elif subset == "moe_searches":
        ext_df = pd.read_csv('..\..\data\moefrags_searches.csv')
        
    ext_df.drop(['Receptor', 'Score Type','Enrichment',  'hyd', 'don', 'acc', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc', 'don_prop', 'acc_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
    
    #drop extra column from data with searches
    if subset == "moe_searches":
        ext_df.drop(['search_features'], 1, inplace=True)
        
    ext_df.fillna(-99999)
    x = ext_df.drop('quality', 1)
    y = ext_df.quality
    
    predictors = list(ext_df.columns)
    predictors = predictors[:-1]
    print('Predictors:', predictors,'\n')
    
    print("score based Q ph4s:", y.sum(),'\n')
    
    clustering = KMeans(n_clusters=2, random_state=8675309)
    clustering.fit(x)
    
    train_labels = clustering.labels_
    
    X_clstrs = x.copy()
    X_clstrs['clusters'] = train_labels
    
    X_scaled = scale_features_single(X_clstrs)
    ext_clusters = X_scaled.copy()
    ext_clusters['y'] = y
    
    # locate the "0" cluster
    ext_0 = ext_clusters.loc[ext_clusters.clusters < 0] # after scaling, 0 went negtive
    y_ext_0 = ext_0.y.values
    
    # locate the "1" cluster
    ext_1 = ext_clusters.loc[ext_clusters.clusters > 0] # after scaling, 0 went negtive
    y_ext_1 = ext_1.y.values
    
    # drop the targets from the external set
    X_ext_0 = ext_0.drop(columns=['y'])
    X_ext_1 = ext_1.drop(columns=['y'])
    
    #predict based on 0 cluster model
    y_pred = (sgdc0.predict(X_ext_0))
    confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1])
    #print(confmat)

    confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])

    PPV = (TP / (TP + FP))

    cm = pd.crosstab(y_ext_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

    print('0 cluster model\n')
    print(cm,'\n')

    print('PPV:', format(PPV, '.2f'),'\n')

    #1 cluster LR model
    y_pred = (sgdc1.predict(X_ext_1))
    confmat = confusion_matrix(y_ext_1, y_pred, labels=[0,1])
    #print(confmat)

    confmat = confusion_matrix(y_ext_1, y_pred, labels=[0,1]).ravel()
    FP = (confmat[1])
    TP = (confmat[3])

    PPV = (TP / (TP + FP))

    cm = pd.crosstab(y_ext_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

    print('1 cluster model\n')
    print(cm,'\n')

    print('PPV:', format(PPV, '.2f'))

In [6]:
classify_ext_data("all")

Predictors: ['s_score', 'Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

score based Q ph4s: 57 

0 cluster model

Predicted  0  1
Actual         
0          9  2 

PPV: 0.00 

1 cluster model

Predicted   0   1
Actual           
0          55  33
1          33  24 

PPV: 0.42


In [7]:
classify_ext_data("moe")

Predictors: ['s_score', 'Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

score based Q ph4s: 16 

0 cluster model

Predicted   0   1
Actual           
0          16  15
1           6  10 

PPV: 0.40 

1 cluster model

Predicted  0  1
Actual         
0          2  3 

PPV: 0.00


In [8]:
classify_ext_data("EF")

Predictors: ['s_score', 'Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

score based Q ph4s: 20 

0 cluster model

Predicted   0   1
Actual           
0          13  17
1          14   6 

PPV: 0.26 

1 cluster model

Predicted  1
Actual      
0          2 

PPV: 0.00


In [9]:
classify_ext_data("GH")

Predictors: ['s_score', 'Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

score based Q ph4s: 21 

0 cluster model

Predicted   0   1
Actual           
0          12  15
1          10  11 

PPV: 0.42 

1 cluster model

Predicted  0  1
Actual         
0          3  1 

PPV: 0.00


In [75]:
classify_ext_data("moe_searches")

Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

score based Q ph4s: 16 



ValueError: X has 13 features per sample; expecting 14

In [24]:
#cluster data together 
df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
df.drop(['s_score','receptor','Active_Rate','Enrichment', 'GH', 'Actives', 'filename', 'fbase', 'hyd', 'don', 'acc', 'ani', 'cat', 'aro', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc','don_prop', 'acc_prop', 'ani_prop', 'cat_prop', 'aro_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
df.fillna(-99999)

df2 = pd.read_csv('..\..\data\score_based_ghdata_binary.csv')
df2.drop(['Receptor', 'Score Type','Enrichment',  'hyd', 'don', 'acc', 'donhyd', 'catdon', 'hydaro', 'aniacc', 'donacc', 'don_prop', 'acc_prop', 'donhyd_prop', 'hydaro_prop', 'donacc_prop'], 1, inplace=True)
df2.fillna(-99999)    
    
predictors = list(df.columns)
predictors = predictors[:-1]
    
print('Predictors:', predictors,'\n')
    
np.random.seed(42)

#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]
    
#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s))

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)
model_len = len(df)
print(model_len)

#merge model/external datasets
frames = [df, df2]
df = pd.concat(frames)

#print(df)
x = df.drop('quality', 1)
y = df.quality

#cluster
clustering = KMeans(n_clusters=2, random_state=999)
clustering.fit(x)
train_labels = clustering.labels_
print(sum(train_labels))
X_clstrs = x.copy()
#add clusters column to data
X_clstrs['clusters'] = train_labels
#add quality labels back to dataset
df = pd.concat([X_clstrs, y], axis = 1)
#split data back into model set/external set
model_df = df.iloc[:model_len,:]
ext_df = df.iloc[model_len:,:]

#print(model_df)
#print(ext_df)
#print(len(model_df))
#print(len(ext_df))

#drop quality labels from modeling df
x = model_df.drop('quality', 1)
y = model_df.quality

#train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

#scale model data
X_train_scaled, X_test_scaled = scale_features(x_train, x_test)

#pd.set_option("display.max_rows", None, "display.max_columns", None)
#print(X_train_scaled)

# to divide the df by cluster, we need to ensure we use the correct class labels, we'll use pandas to do that
train_clusters = X_train_scaled.copy()
test_clusters = X_test_scaled.copy()
train_clusters['y'] = y_train
test_clusters['y'] = y_test

#print(y_train)
#print(train_clusters)

# locate the "0" cluster
train_0 = train_clusters.loc[train_clusters.clusters < 0] # after scaling, 0 went negtive
test_0 = test_clusters.loc[test_clusters.clusters < 0]
y_train_0 = train_0.y.values
y_test_0 = test_0.y.values
# locate the "1" cluster
train_1 = train_clusters.loc[train_clusters.clusters > 0] # after scaling, 1 dropped slightly
test_1 = test_clusters.loc[test_clusters.clusters > 0]
y_train_1 = train_1.y.values
y_test_1 = test_1.y.values
# drop the targets from the training set
X_train_0 = train_0.drop(columns=['y'])
X_test_0 = test_0.drop(columns=['y'])
X_train_1 = train_1.drop(columns=['y'])
X_test_1 = test_1.drop(columns=['y'])
    
#print(X_train_0)
#print(len(X_test_0))
    
#0 cluster LR model
sgdc0 = SGDClassifier(loss="log", penalty="l1", max_iter=1000, tol=1e-3, class_weight='balanced')
sgdc0.fit(X_train_0, y_train_0)
    
y_pred = (sgdc0.predict(X_test_0))
confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1])
#print(confmat)
    
confmat = confusion_matrix(y_test_0, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])
    
PPV = (TP / (TP + FP))
    
cm = pd.crosstab(y_test_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
print('0 cluster model\n')
print(cm,'\n')
    
print('PPV:', format(PPV, '.2f'),'\n')
    
#1 cluster LR model
sgdc1 = SGDClassifier(loss="log", penalty="l1", max_iter=1000, tol=1e-3, class_weight='balanced')
sgdc1.fit(X_train_1, y_train_1)
    
y_pred = (sgdc1.predict(X_test_1))
confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1])
#print(confmat)
    
confmat = confusion_matrix(y_test_1, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])
    
PPV = (TP / (TP + FP))
    
cm = pd.crosstab(y_test_1, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
    
print('1 cluster model\n')
print(cm,'\n')

print('PPV:', format(PPV, '.2f'))

Predictors: ['Hits', 'max_feat', 'min_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'catdon_prop', 'aniacc_prop'] 

7242
4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[to_scale] = scaler.transform(X_train[to_scale])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[to_scale] = scaler.transform(X_test[to_scale])
A value is trying to be set on a copy of a slice from a Data

ValueError: Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required.