In [1]:
import numpy as np
import sklearn
from sklearn import preprocessing, neighbors
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from typing import Tuple
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn import metrics

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import random, os
import csv

rng = 1

def seed_everything(seed=1):
    """"
    Seed everything.
    """   
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
#print('The scikit-learn version is {}.'.format(sklearn.__version__))

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale

In [3]:
#cluster test/train data
def get_clusters(X_train: pd.DataFrame, X_test: pd.DataFrame, n_clusters: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies k-means clustering to training data to find clusters and predicts them for the test set
    """
    clustering = KMeans(n_clusters=n_clusters, random_state=rng)
    clustering.fit(X_train)
    # apply the labels to the training set
    train_labels = clustering.labels_
    X_train_clstrs = X_train.copy()
    X_train_clstrs['clusters'] = train_labels
    
    #write ext_clusters to csv
    #X_train_clstrs.to_csv('X_train_clstrs.csv')
    
    # predict labels on the test set
    test_labels = clustering.predict(X_test)
    X_test_clstrs = X_test.copy()
    X_test_clstrs['clusters'] = test_labels
    return X_train_clstrs, X_test_clstrs

#scale each feature
def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X_train.columns.values]
    scaler.fit(X_train[to_scale])
    X_train[to_scale] = scaler.transform(X_train[to_scale])
    
    # predict z-scores on the test set
    X_test[to_scale] = scaler.transform(X_test[to_scale])
    
    return X_train, X_test



In [4]:
#read in train/test data
df = pd.read_csv('..\..\data\_All_Receptors_runs_1_2_3_binary.csv')
df = df[['Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 's_score', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop', 'quality']]
df.fillna(-99999)

predictors = list(df.columns)
predictors = predictors[:-1]

print('Predictors:', predictors,'\n')


#split data into quality/not quality sets
q_ph4s = df[df['quality'] == 1]
nq_ph4s = df[df['quality'] != 1]

#ensure that there is an equal number of nq ph4s
nq_ph4s = nq_ph4s.sample(n=1*len(q_ph4s), random_state = rng)
#print(type(nq_ph4s))
#print(nq_ph4s['Hits'].head())
#print(nq_ph4s['Hits'].tail())

#merge arrays prior to TTS
frames = [q_ph4s, nq_ph4s]
df = pd.concat(frames)

#x is features, y is classes
x = df.drop('quality', axis=1)
y = df.quality

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=rng)

print("x_train Q ph4s:", y_train.sum(),'\n')
print("x_test Q ph4s:",y_test.sum(),'\n')

X_train_scaled, X_test_scaled = scale_features(x_train, x_test)

#LR model
sgdc = SGDClassifier(loss="log", penalty="l1", max_iter=1000, tol=1e-3, class_weight='balanced', random_state = rng)
sgdc.fit(X_train_scaled, y_train)

y_pred = (sgdc.predict(X_test_scaled))
confmat = confusion_matrix(y_test, y_pred, labels=[0,1])
#print(confmat)

confmat = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
FP = (confmat[1])
TP = (confmat[3])

PPV = (TP / (TP + FP))

cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)

#print('Result\n')
print(cm,'\n')

print("PPV:",format(metrics.precision_score(y_test, y_pred), '.2f'))
print("Accuracy:",format(metrics.accuracy_score(y_test, y_pred),'.2f'))
print("Recall:",format(metrics.recall_score(y_test, y_pred), '.2f'))
print("F1:",format(metrics.f1_score(y_test, y_pred), '.2f'),'\n')

Predictors: ['Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 's_score', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

x_train Q ph4s: 2739 

x_test Q ph4s: 882 

Predicted    0    1
Actual             
0          708  221
1           69  813 

PPV: 0.79
Accuracy: 0.84
Recall: 0.92
F1: 0.85 



In [5]:
def scale_features_single(X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    applies standard scaler (z-scores) to training data and predicts z-scores for the test set
    """
    scaler = StandardScaler()
    to_scale = [col for col in X.columns.values]
    scaler.fit(X[to_scale])
    X[to_scale] = scaler.transform(X[to_scale])
    
    return X
      
def classify_ext_data(input_csv, subset = "none"):      
    ext_df = pd.read_csv(input_csv)
    ext_df.fillna(-99999)
    receptors = ext_df.Receptor
    hits_actual = ext_df.Hits
    score_types = ext_df['Score Type']
    subsets = ext_df.subset
    match_features = ext_df.match_features
    init_ext_df = ext_df
    
    #check if a 'quality' column exists. one will not exist if classifying data with unknown enrichments.
    if 'quality' not in ext_df:
        ext_df = ext_df[['s_score','Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop']]
        x = ext_df
    else:
        ext_df = ext_df[['s_score','Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop', 'quality']]   
        x = ext_df.drop('quality', axis=1)
        y = ext_df.quality
    
    predictors = list(ext_df.columns)
    predictors = predictors[:-1]
    print('Predictors:', predictors,'\n')
    
    if 'quality' in init_ext_df:
        print("score based Q ph4s:", y.sum(),'\n')
    
    X_scaled = scale_features_single(x)
    ext_0 = X_scaled.copy()
    if 'quality' in init_ext_df:
        ext_0['y'] = y
    
    #add receptors, hits_actual, score type, and subset columns back prior to 0/1/2/3 split
    ext_0['Receptor'] = receptors
    ext_0['hits_actual'] = hits_actual
    ext_0['Score Type'] = score_types
    ext_0['subset'] = subsets
    ext_0['match_features'] = match_features
    
    if 'quality' in init_ext_df:
        y_ext_0 = ext_0.y.values
    ext_0_receptors = ext_0.Receptor
    ext_0_hits_actual = ext_0.hits_actual
    ext_0_score_types = ext_0['Score Type']
    ext_0_subsets = ext_0.subset
    ext_0_match_features = ext_0.match_features

    # drop the targets from each external set (if classifying known external data)
    if 'quality' in init_ext_df:
        X_ext_0 = ext_0.drop(columns=['y', 'Receptor', 'hits_actual', 'Score Type', 'subset', 'match_features'])
    else:
        X_ext_0 = ext_0.drop(columns=['Receptor', 'hits_actual', 'Score Type', 'subset', 'match_features'])
    
    #predict based on 0 cluster model
    print('Classification Results\n')
    print('----------------------\n')
    if len(X_ext_0) == 0:
        print('No data.\n')
    else:
        y_pred = (sgdc.predict(X_ext_0))
        if 'quality' in init_ext_df:
            confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1])
            confmat = confusion_matrix(y_ext_0, y_pred, labels=[0,1]).ravel()
            FP = (confmat[1])
            TP = (confmat[3])
            PPV = (TP / (TP + FP))
            cm = pd.crosstab(y_ext_0, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
            print(cm,'\n')
            print('PPV:', format(PPV, '.2f'))
            print("Accuracy:",format(metrics.accuracy_score(y_ext_0, y_pred),'.2f'))
            print("Recall:",format(metrics.recall_score(y_ext_0, y_pred), '.2f'))
            print("F1:",format(metrics.f1_score(y_ext_0, y_pred), '.2f'),'\n')
        
        X_ext_0['Receptor'] = ext_0_receptors
        X_ext_0['hits_actual'] = ext_0_hits_actual
        X_ext_0['Score Type'] = ext_0_score_types
        X_ext_0['subset'] = ext_0_subsets
        X_ext_0['match_features'] = ext_0_match_features
        
        if 'quality' in init_ext_df:
            X_ext_0['quality'] = y_ext_0
        
        X_ext_0['quality_pred'] = y_pred
        if 'quality' in init_ext_df:
            X_ext_0.to_csv('results/'+subset+'/0cluster_results_singlemodel.csv')
        if 'quality' not in init_ext_df:
            print(X_ext_0.loc[X_ext_0['quality_pred'] == 1], '\n')
            ph4_preds = X_ext_0.loc[X_ext_0['quality_pred'] == 1]
            ph4_preds.to_csv('predictions/0cluster_ph4_preds_singlemodel.csv')

In [6]:
classify_ext_data('..\..\data\score_based_alldata_binary.csv', 'all')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 85 

Classification Results

----------------------

Predicted    0   1
Actual            
0          112  63
1           43  42 

PPV: 0.40
Accuracy: 0.59
Recall: 0.49
F1: 0.44 



In [7]:
classify_ext_data('..\..\data\score_based_moefrags_data_binary.csv', 'moe')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 16 

Classification Results

----------------------

Predicted   0   1
Actual           
0          24  12
1           7   9 

PPV: 0.43
Accuracy: 0.63
Recall: 0.56
F1: 0.49 



In [8]:
classify_ext_data('..\..\data\score_based_efdata_binary.csv', 'ef')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 20 

Classification Results

----------------------

Predicted   0   1
Actual           
0          20  12
1          11   9 

PPV: 0.43
Accuracy: 0.56
Recall: 0.45
F1: 0.44 



In [9]:
classify_ext_data('..\..\data\score_based_ghdata_binary.csv', 'gh')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 21 

Classification Results

----------------------

Predicted   0   1
Actual           
0          20  11
1          12   9 

PPV: 0.45
Accuracy: 0.56
Recall: 0.43
F1: 0.44 



In [10]:
classify_ext_data('..\..\data\score_based_recefdata_binary.csv', 'rec_ef')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 13 

Classification Results

----------------------

Predicted   0   1
Actual           
0          24  15
1           4   9 

PPV: 0.38
Accuracy: 0.63
Recall: 0.69
F1: 0.49 



In [11]:
classify_ext_data('..\..\data\score_based_recghdata_binary.csv', 'rec_gh')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 15 

Classification Results

----------------------

Predicted   0   1
Actual           
0          24  13
1          10   5 

PPV: 0.28
Accuracy: 0.56
Recall: 0.33
F1: 0.30 



In [12]:
classify_ext_data('..\..\data\hm_score_based_alldata_binary.csv', 'hm_all')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 58 

Classification Results

----------------------

Predicted    0   1
Actual            
0          132  70
1           33  25 

PPV: 0.26
Accuracy: 0.60
Recall: 0.43
F1: 0.33 



In [13]:
classify_ext_data('..\..\data\hm_score_based_moefrags_data_binary.csv', 'hm_moe')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 9 

Classification Results

----------------------

Predicted   0   1
Actual           
0          31  12
1           5   4 

PPV: 0.25
Accuracy: 0.67
Recall: 0.44
F1: 0.32 



In [14]:
classify_ext_data('..\..\data\hm_score_based_efdata_binary.csv', 'hm_ef')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 11 

Classification Results

----------------------

Predicted   0   1
Actual           
0          26  15
1           6   5 

PPV: 0.25
Accuracy: 0.60
Recall: 0.45
F1: 0.32 



In [15]:
classify_ext_data('..\..\data\hm_score_based_ghdata_binary.csv', 'hm_gh')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 11 

Classification Results

----------------------

Predicted   0   1
Actual           
0          29  12
1           7   4 

PPV: 0.25
Accuracy: 0.63
Recall: 0.36
F1: 0.30 



In [16]:
classify_ext_data('..\..\data\hm_score_based_recefdata_binary.csv', 'hm_rec_ef')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 15 

Classification Results

----------------------

Predicted   0   1
Actual           
0          25  12
1           9   6 

PPV: 0.33
Accuracy: 0.60
Recall: 0.40
F1: 0.36 



In [17]:
classify_ext_data('..\..\data\hm_score_based_recghdata_binary.csv', 'hm_rec_gh')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop', 'aniacc_prop'] 

score based Q ph4s: 12 

Classification Results

----------------------

Predicted   0   1
Actual           
0          27  13
1           7   5 

PPV: 0.28
Accuracy: 0.62
Recall: 0.42
F1: 0.33 



In [18]:
classify_ext_data('..\..\data\gpr101_data_binary_5feats.csv')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop'] 

Classification Results

----------------------

     s_score      Hits  max_feat  avg_feat  max_centr  min_centr  avg_centr  \
9   0.529199  1.867722 -0.433811 -0.772172   0.614168  -0.484577   0.668287   
21 -0.111182 -0.696069  1.111583  0.981812   0.880114   0.461244   1.079794   
24  0.872532 -0.501221  1.387788  1.793672   0.982683   1.138647   0.830567   
25  0.521481 -0.644793  1.452955  1.898732   0.684034   0.461244   0.476928   
26  1.108816  1.119095  0.453813  0.111265   1.208540   1.138647   1.409090   
27 -1.825249 -0.675559 -1.138959 -0.990683  -0.166530   1.446139   0.624721   
28  0.958453  1.478026  0.207928  0.386664   0.982683   0.795748   0.637593   
29 -0.067695 -0.706324  1.321852  0.493215   1.120982   0.899270   1.359525   
30  1.103565  2.195887 -0.924910 -0.929552   0.758520   1.138647  

In [19]:
classify_ext_data('..\..\data\gpr101_data_binary_6feats.csv')

Predictors: ['s_score', 'Hits', 'max_feat', 'avg_feat', 'max_centr', 'min_centr', 'avg_centr', 'features', 'all_same', 'hyd_prop', 'don_prop', 'catdon_prop', 'hydaro_prop'] 

Classification Results

----------------------

     s_score      Hits  max_feat  avg_feat  max_centr  min_centr  avg_centr  \
9   0.529199  1.756811 -0.433811 -0.772172   0.614168  -0.484577   0.668287   
21 -0.111182 -0.647949  1.111583  0.981812   0.880114   0.461244   1.079794   
24  0.872532 -0.247156  1.387788  1.793672   0.982683   1.138647   0.830567   
25  0.521481 -0.514352  1.452955  1.898732   0.684034   0.461244   0.476928   
26  1.108816  1.756811  0.453813  0.111265   1.208540   1.138647   1.409090   
27 -1.825249 -0.647949 -1.138959 -0.990683  -0.166530   1.446139   0.624721   
28  0.958453  1.222420  0.207928  0.386664   0.982683   0.795748   0.637593   
29 -0.067695 -0.647949  1.321852  0.493215   1.120982   0.899270   1.359525   
30  1.103565  3.760779 -0.924910 -0.929552   0.758520   1.138647  

In [20]:
#save models
import joblib
from joblib import dump, load

joblib.dump(sgdc0 , 'models/model_sgdc_singular')

NameError: name 'sgdc0' is not defined