In [2]:
# import libraries from cnn env
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

import os
SEED=56789
import numpy as np
np.random.seed(SEED)
import random as python_random
python_random.seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.metrics import roc_curve, confusion_matrix, auc, average_precision_score, precision_recall_curve
import tensorflow as tf
from numpy import mean
from numpy import absolute
from numpy import std
from collections import Counter
import itertools
from mlxtend.evaluate import bias_variance_decomp
import shap

import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('GPCR_Opt_ICHEM_FuzCav_Ligand.csv')
df.head()

Unnamed: 0,PDB_ID,poc_bit2,poc_bit4,poc_bit5,poc_bit6,poc_bit7,poc_bit9,poc_bit10,poc_bit11,poc_bit15,...,TM_7-7.43_HBond_PROT,TM_7-7.43_Hyd,TM_7-7.44_Hyd,TM_7-7.45_HBond_LIG,TM_7-7.45_Hyd,TM_7-7.46_HBond_PROT,TM_7-7.46_Hyd,TM_7-7.47_Hyd,TM_7-7.53_Hyd,Class
0,1f88,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,1gzm,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,1hzx,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,1jfp,0,0,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1l9h,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [4]:
def lr_classifier():

    X = df.iloc[:,1:-1]
    y = df.iloc[:,-1]
    
    estimator = LogisticRegression(random_state=123, solver='liblinear')
        
    print();print(format('Performing the K-Fold for Logistic Regression Classifier','*^82'))
    
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5678)
    
    acc_score_skf = []
    mcc_score_skf = []
    f1_score_skf = []
    kappa_score_skf = []
    recall_score_skf = []
    precision_score_skf = []
    roc_auc_score_skf = []
    auPR_score_skf = []
    cm_score_skf = []
    
    fold_no = 1
    for train_index, test_index in skf.split(X,y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = estimator
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_train_pred = model.predict(X_train)

        acc_skf = accuracy_score(y_test, y_pred)
        mcc_skf = metrics.matthews_corrcoef(y_test, y_pred)
        f1_skf = metrics.f1_score(y_test, y_pred, average='macro')
        kappa_skf = metrics.cohen_kappa_score(y_test, y_pred)
        recall_skf = metrics.recall_score(y_test, y_pred, average='macro')
        precision_skf = metrics.precision_score(y_test, y_pred, average='macro')
        
        ytests = tf.keras.utils.to_categorical(np.array(y_test), num_classes=2)
        ytest_preds = tf.keras.utils.to_categorical(np.array(y_pred), num_classes=2)        
        roc_auc_skf = metrics.roc_auc_score(ytests, ytest_preds, average='macro')
        auPR_skf = average_precision_score(ytests, ytest_preds, average='macro')
        cm_skf = confusion_matrix(y_test, y_pred)
        
        acc_score_skf.append(acc_skf)
        mcc_score_skf.append(mcc_skf)
        f1_score_skf.append(f1_skf)
        kappa_score_skf.append(kappa_skf)
        recall_score_skf.append(recall_skf)
        precision_score_skf.append(precision_skf)
        roc_auc_score_skf.append(roc_auc_skf)
        auPR_score_skf.append(auPR_skf)
        cm_score_skf.append(cm_skf)
        
        fold_no = fold_no + 1

    print();print('Logistic Regression Statistics for K-Fold: ')
    print("Accuracy_SKF: %.4f (%.4f)" % (mean(acc_score_skf), std(acc_score_skf)))
    print("MCC_SKF: %.4f (%.4f)" % (mean(mcc_score_skf), std(mcc_score_skf)))
    print("F1_Score_SKF: %.4f (%.4f)" % (mean(f1_score_skf), std(f1_score_skf)))
    print("Recall_SKF: %.4f (%.4f)" % (mean(recall_score_skf), std(recall_score_skf)))
    print("Precision_SKF: %.4f (%.4f)" % (mean(precision_score_skf), std(precision_score_skf)))
    print("ROC_AUC_SKF: %.4f (%.4f)" % (mean(roc_auc_score_skf), std(roc_auc_score_skf)))
    print("auPR_SKF: %.4f (%.4f)" % (mean(auPR_score_skf), std(auPR_score_skf)))
    print("Confusion_Matrix_SKF:", sum(cm_score_skf))
    

    print();print(format('Performing the Test Set prediction','*^82'))
    
    X_w = df.iloc[:,0:-1]
    y_w = df.iloc[:,-1]

    X_train_w, X_test_w, y_train, y_test = train_test_split(X_w, y_w, test_size=0.2, random_state=5678, stratify=y_w)

    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    X_train = X_train_w.iloc[:,1:-1]
    X_test = X_test_w.iloc[:,1:-1]
    X_train_id = X_train_w.iloc[:,0]
    X_test_id = X_test_w.iloc[:,0]

    
    model_TRTS = estimator
    model_TRTS.fit(X_train, y_train)
    
    y_pred_train = model_TRTS.predict(X_train)
    y_pred_test = model_TRTS.predict(X_test)

    acc_ts = accuracy_score(y_test, y_pred_test)
    mcc_ts = metrics.matthews_corrcoef(y_test, y_pred_test)
    f1_score_ts = metrics.f1_score(y_test, y_pred_test, average='macro')
    ytests = tf.keras.utils.to_categorical(np.array(y_test), num_classes=2)
    ytest_preds = tf.keras.utils.to_categorical(np.array(y_pred_test), num_classes=2)
    roc_auc_ts = metrics.roc_auc_score(ytests, ytest_preds, average='macro',multi_class='ovr')

    auPR_ts = average_precision_score(ytests, ytest_preds, average='macro')
    cm_ts = confusion_matrix(y_test, y_pred_test)
    
    print();print('Classifier Statistics for Test: ')
    print("Accuracy_Test: %.4f" % acc_ts)
    print("MCC_Test: %.4f" % mcc_ts)
    print("f1_score_Test: %.4f" % f1_score_ts)
    print("ROC_AUC_Test:", roc_auc_ts)
    print("auPR_Test:", auPR_ts)
    print("Confusion_Matrix_Test:", cm_ts)
    print(classification_report(y_test, y_pred_test))
    

    print();print(format('Performing the Bias-Variance','*^82'))

    # Bias-Variance Decomposition
    
    X_bv = df.iloc[:,1:-1]
    y_bv = df.iloc[:,-1]
    
    X_train, X_test, y_train, y_test = train_test_split(X_bv, y_bv, test_size=0.2, random_state=5678, stratify=y_bv)
    
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values
    
    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(model_TRTS, X_train, y_train, 
                                                            X_test, y_test,
                                                            loss='0-1_loss',random_seed=5678)
    
    print();print('Bias Variance Decomposition Analysis: ')
    print('Average expected loss: %.3f' % avg_expected_loss)
    print('Average bias: %.3f' % avg_bias)
    print('Average variance: %.3f' % avg_var)

In [5]:
lr_classifier()


*************Performing the K-Fold for Logistic Regression Classifier*************

Logistic Regression Statistics for K-Fold: 
Accuracy_SKF: 0.9215 (0.0367)
MCC_SKF: 0.8413 (0.0739)
F1_Score_SKF: 0.9183 (0.0384)
Recall_SKF: 0.9167 (0.0399)
Precision_SKF: 0.9248 (0.0357)
ROC_AUC_SKF: 0.9167 (0.0399)
auPR_SKF: 0.8864 (0.0515)
Confusion_Matrix_SKF: [[145  18]
 [ 13 220]]

************************Performing the Test Set prediction************************

Classifier Statistics for Test: 
Accuracy_Test: 0.9375
MCC_Test: 0.8719
f1_score_Test: 0.9358
ROC_AUC_Test: 0.9377820760799485
auPR_Test: 0.9072366994480914
Confusion_Matrix_Test: [[31  2]
 [ 3 44]]
              precision    recall  f1-score   support

           0       0.91      0.94      0.93        33
           1       0.96      0.94      0.95        47

    accuracy                           0.94        80
   macro avg       0.93      0.94      0.94        80
weighted avg       0.94      0.94      0.94        80


***************