In [11]:
import numpy as np
import sklearn
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from typing import Tuple
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn import metrics

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import random, os
import csv

rng = 1

def seed_everything(seed=1):
    """"
    Seed everything.
    """   
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
#print('The scikit-learn version is {}.'.format(sklearn.__version__))

In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale

In [13]:
#cluster test/train data
def get_clusters(X_train: pd.DataFrame, X_test: pd.DataFrame, n_clusters: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies k-means clustering to training data to find clusters and predicts them for the test set
    """
    clustering = KMeans(n_clusters=n_clusters, random_state=rng)
    clustering.fit(X_train)
    # apply the labels to the training set
    train_labels = clustering.labels_
    X_train_clstrs = X_train.copy()
    X_train_clstrs['clusters'] = train_labels
    
    #write ext_clusters to csv
    #X_train_clstrs.to_csv('X_train_clstrs.csv')
    
    # predict labels on the test set
    test_labels = clustering.predict(X_test)
    X_test_clstrs = X_test.copy()
    X_test_clstrs['clusters'] = test_labels
    return X_train_clstrs, X_test_clstrs

#scale each feature
def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X_train.columns.values]
    scaler.fit(X_train[to_scale])
    X_train[to_scale] = scaler.transform(X_train[to_scale])
    
    # predict z-scores on the test set
    X_test[to_scale] = scaler.transform(X_test[to_scale])
    
    return X_train, X_test



In [14]:
#read in train/test data
df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
df = df[['Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 's_score', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop', 'quality']]
df.fillna(-99999)

predictors = list(df.columns)
predictors = predictors[:-1]

print('Predictors:', predictors,'\n')


#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]

#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s), random_state = rng)
#print(type(nq_ph4s))
#print(nq_ph4s['Hits'].head())
#print(nq_ph4s['Hits'].tail())

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

#x is features, y is classes
x = df.drop('quality', axis=1)
y = df.quality

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=rng)

print("x_train Q ph4s:", y_train.sum(),'\n')
print("x_test Q ph4s:",y_test.sum(),'\n')

X_train_scaled, X_test_scaled = scale_features(x_train, x_test)

#LR model
sgdc = SGDClassifier(loss="log", penalty="l1", max_iter=1000, tol=1e-3, class_weight='balanced', random_state = rng)
sgdc.fit(X_train_scaled, y_train)

y_pred = (sgdc.predict(X_test_scaled))
confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

#print('Result\n')
print(cm,'\n')

print("PPV:",format(metrics.precision_score(y_test, y_pred), '.2f'))
print("Accuracy:",format(metrics.accuracy_score(y_test, y_pred),'.2f'))
print("Recall:",format(metrics.recall_score(y_test, y_pred), '.2f'))
print("F1:",format(metrics.f1_score(y_test, y_pred), '.2f'),'\n')

Predictors: ['Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 's_score', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

x_train Q ph4s: 2739 

x_test Q ph4s: 882 

Predicted    0    1
Actual             
0          708  221
1           69  813 

PPV: 0.79
Accuracy: 0.84
Recall: 0.92
F1: 0.85 



In [15]:
def scale_features_single(X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X.columns.values]
    scaler.fit(X[to_scale])
    X[to_scale] = scaler.transform(X[to_scale])
    
    return X
      
def classify_ext_data(input_csv, subset = "none"):      
    ext_df = pd.read_csv(input_csv)
    ext_df.fillna(-99999)
    receptors = ext_df.Receptor
    hits_actual = ext_df.Hits
    score_types = ext_df['Score Type']
    subsets = ext_df.subset
    match_features = ext_df.match_features
    init_ext_df = ext_df
    
    #check if a 'quality' column exists. one will not exist if classifying data with unknown enrichments.
    if 'quality' not in ext_df:
        ext_df = ext_df[['s_score','Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop']]
        x = ext_df
    else:
        ext_df = ext_df[['s_score','Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop', 'quality']]   
        x = ext_df.drop('quality', axis=1)
        y = ext_df.quality
    
    predictors = list(ext_df.columns)
    predictors = predictors[:-1]
    print('Predictors:', predictors,'\n')
    
    if 'quality' in init_ext_df:
        print("score based Q ph4s:", y.sum(),'\n')
    
    X_scaled = scale_features_single(x)
    ext_0 = X_scaled.copy()
    if 'quality' in init_ext_df:
        ext_0['y'] = y
    
    #add receptors, hits_actual, score type, and subset columns back prior to 0/1/2/3 split
    ext_0['Receptor'] = receptors
    ext_0['hits_actual'] = hits_actual
    ext_0['Score Type'] = score_types
    ext_0['subset'] = subsets
    ext_0['match_features'] = match_features
    
    if 'quality' in init_ext_df:
        y_ext_0 = ext_0.y.values
    ext_0_receptors = ext_0.Receptor
    ext_0_hits_actual = ext_0.hits_actual
    ext_0_score_types = ext_0['Score Type']
    ext_0_subsets = ext_0.subset
    ext_0_match_features = ext_0.match_features

    # drop the targets from each external set (if classifying known external data)
    if 'quality' in init_ext_df:
        X_ext_0 = ext_0.drop(columns=['y', 'Receptor', 'hits_actual', 'Score Type', 'subset', 'match_features'])
    else:
        X_ext_0 = ext_0.drop(columns=['Receptor', 'hits_actual', 'Score Type', 'subset', 'match_features'])
    
    #predict based on 0 cluster model
    print('Classification Results\n')
    print('----------------------\n')
    if len(X_ext_0) == 0:
        print('No data.\n')
    else:
        y_pred = (sgdc.predict(X_ext_0))
        if 'quality' in init_ext_df:
            confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1])
            confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1]).ravel()
            FP = (confmat[1])
            TP = (confmat[3])
            PPV = (TP / (TP + FP))
            cm = pd.crosstab(y_ext_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
            print(cm,'\n')
            print('PPV:', format(PPV, '.2f'))
            print("Accuracy:",format(metrics.accuracy_score(y_ext_0, y_pred),'.2f'))
            print("Recall:",format(metrics.recall_score(y_ext_0, y_pred), '.2f'))
            print("F1:",format(metrics.f1_score(y_ext_0, y_pred), '.2f'),'\n')
        
        X_ext_0['Receptor'] = ext_0_receptors
        X_ext_0['hits_actual'] = ext_0_hits_actual
        X_ext_0['Score Type'] = ext_0_score_types
        X_ext_0['subset'] = ext_0_subsets
        X_ext_0['match_features'] = ext_0_match_features
        
        if 'quality' in init_ext_df:
            X_ext_0['quality'] = y_ext_0
        
        X_ext_0['quality_pred'] = y_pred
        if 'quality' in init_ext_df:
            X_ext_0.to_csv('results/'+subset+'/k1_0cluster_results.csv')
        if 'quality' not in init_ext_df:
            print(X_ext_0.loc[X_ext_0['quality_pred'] == 1], '\n')
            ph4_preds = X_ext_0.loc[X_ext_0['quality_pred'] == 1]
            ph4_preds.to_csv('predictions/k1_0cluster_ph4_preds.csv')

In [16]:
classify_ext_data('..\..\data\score_based_alldata_binary.csv', 'all')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 85 

Classification Results

----------------------

Predicted    0   1
Actual            
0          112  63
1           43  42 

PPV: 0.40
Accuracy: 0.59
Recall: 0.49
F1: 0.44 



In [17]:
classify_ext_data('..\..\data\hm_score_based_alldata_binary.csv', 'hm_all')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 58 

Classification Results

----------------------

Predicted    0   1
Actual            
0          132  70
1           33  25 

PPV: 0.26
Accuracy: 0.60
Recall: 0.43
F1: 0.33 

