In [None]:
#Load Libraries
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import cohen_kappa_score

In [None]:
#Statistics - Optional - Scikit libraries can be called instead
def accuracy(y_true, y_pred):
    """Return the accuracy of ``y_pred`` against ``y_true`` as a 2-decimal float.

    The raw score comes from ``accuracy_score``; it is formatted with ``.2f``
    and parsed back so the caller always receives a plain Python float.
    """
    raw_score = accuracy_score(y_true, y_pred)
    return float(format(raw_score, ".2f"))

def balanced_accuracy(y_true, y_pred):
    """Return the balanced accuracy of ``y_pred`` against ``y_true`` as a 2-decimal float.

    Wraps ``balanced_accuracy_score`` and rounds through a ``.2f`` format.
    """
    raw_score = balanced_accuracy_score(y_true, y_pred)
    return float(format(raw_score, ".2f"))

def sensitivity_specificity(y_true, y_pred):
    """Return ``(sensitivity, specificity)`` for a binary problem, each rounded to 2 decimals.

    Counts are read by position from ``confusion_matrix(y_true, y_pred)``
    (rows = true labels, columns = predicted labels): index 0 is treated as
    the negative class and index 1 as the positive class.

    Returns:
        tuple[float, float]: ``(TP / (TP + FN), TN / (TN + FP))``.
    """
    confusion = confusion_matrix(y_true, y_pred)
    true_pos = confusion[1, 1]
    true_neg = confusion[0, 0]
    false_pos = confusion[0, 1]
    false_neg = confusion[1, 0]

    # The previous version also computed an overall accuracy here; it was
    # never returned and shadowed the module-level accuracy() helper.
    sensitivity = float("{0:.2f}".format(true_pos / float(false_neg + true_pos)))
    specificity = float("{0:.2f}".format(true_neg / float(true_neg + false_pos)))
    return sensitivity, specificity

def f1_score(y_true, y_pred):
    """Return the binary F1 score, rounded to 2 decimals.

    Computed from ``confusion_matrix(y_true, y_pred)`` as
    ``TP / (TP + (FP + FN) / 2)``, with index 1 treated as the positive class.
    True negatives do not enter the formula, so they are not read.
    """
    confusion = confusion_matrix(y_true, y_pred)
    true_pos = confusion[1, 1]
    false_pos = confusion[0, 1]
    false_neg = confusion[1, 0]

    # Use a distinct local name so the function's own name is not shadowed.
    score = true_pos / float(true_pos + ((false_pos + false_neg) / 2.))
    return float("{0:.2f}".format(score))
    
def auc_roc(y_true, y_pred_prob):
    """Return the ROC AUC of the scores ``y_pred_prob`` against ``y_true`` as a 2-decimal float.

    Wraps ``roc_auc_score`` and rounds through a ``.2f`` format.
    """
    raw_auc = roc_auc_score(y_true, y_pred_prob)
    return float(format(raw_auc, ".2f"))

def kappa_score(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` as a 2-decimal float.

    Wraps ``cohen_kappa_score`` and rounds through a ``.2f`` format.
    """
    raw_kappa = cohen_kappa_score(y_true, y_pred)
    return float(format(raw_kappa, ".2f"))

In [None]:
# Load and segment data
# Column 0 of the CSV is used as the target; columns 1..38 are the descriptors.
dataset = pd.read_csv('Caspase_data.csv')  # change the data file name here
#dataset.drop(['Name'], axis=1)
X = dataset.iloc[:, 1:39].values           # descriptor matrix as a NumPy array
y = np.array(dataset.iloc[:, 0:1]).ravel()  # flatten the single target column to 1-D

# Split into training and test sets (33% held out; the split itself is fixed
# by random_state, the NumPy seed covers any other global-RNG use).
np.random.seed(43)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

# Optional: store the split data sets on disk
#pd.DataFrame(X_train).to_csv("train_x.csv", header=None, index=None)
#pd.DataFrame(X_test).to_csv("test_x.csv", header=None, index=None)
#pd.DataFrame(y_train).to_csv("train_y.csv", header=None, index=None)
#pd.DataFrame(y_test).to_csv("test_y.csv", header=None, index=None)
#print(X)

# Load the external validation data (same descriptor columns, kept as a DataFrame)
dataset3 = pd.read_csv('validation.csv', delimiter=',')
Validate = dataset3.iloc[:, 1:39]
print("loaded validation data: %s, %s" % Validate.shape)

In [None]:
#Random Forest
# fit model on training data
#Try different ones too
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc


# Fit a 300-tree random forest on the training split.
model3 = RandomForestClassifier(n_estimators = 300, random_state = 42)
model3.fit(X_train, y_train)

# Constant (all-zero) scores: the "No Skill" reference for the ROC plot.
ns_probs = [0 for _ in range(len(y_test))]

# Optional: save the fitted model to disk
#import pickle
#output = open('randomforest_classifier.pkl', 'wb')
#pickle.dump(model3, output)
#output.close()


# Predictions for the held-out test split.
# Column 1 of predict_proba is used as the positive-class score, rounded to 2 decimals.
y_pred = model3.predict(X_test)
y_pred_prob = model3.predict_proba(X_test).T[1]
y_pred_prob = y_pred_prob.ravel()
y_pred_prob = np.round(y_pred_prob, 2)


# Predictions for the external validation set.
# np.asarray() strips DataFrame column names so the input matches the plain
# arrays the model was fitted on (avoids scikit-learn's feature-name warning).
y_valid = model3.predict(np.asarray(Validate))
print(y_valid)
y_valid_prob = model3.predict_proba(np.asarray(Validate)).T[1]
y_valid_prob = y_valid_prob.ravel()
y_valid_prob = np.round(y_valid_prob, 2)
print(y_valid_prob)

# Performance metrics on the test split (all rounded to 2 decimals).
auc = auc_roc(y_test, y_pred_prob)
ba = balanced_accuracy(y_test, y_pred)
sens, spec = sensitivity_specificity(y_test, y_pred)
kappa = kappa_score(y_test, y_pred)
accuracy_model = accuracy(y_test, y_pred)
# f1_score may be bound to sklearn.metrics.f1_score at this point (the import
# earlier in this cell replaces the notebook helper), which does not round;
# round here so F1 is reported like every other metric.
f1_score_model = float("{0:.2f}".format(f1_score(y_test, y_pred)))

# ROC AUC for the baseline and the model.
ns_auc = roc_auc_score(y_test, ns_probs)
rf_auc = roc_auc_score(y_test, y_pred_prob)
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('RF: ROC AUC=%.3f' % (rf_auc))

# ROC curves.
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
rf_fpr, rf_tpr, _ = roc_curve(y_test, y_pred_prob)
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
# Legend label fixed: this curve is the random forest, not a logistic model.
pyplot.plot(rf_fpr, rf_tpr, marker='.', label='Random Forest')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

# Precision-recall curve; the no-skill line is the fraction of samples labelled 1.
rf_precision, rf_recall, _ = precision_recall_curve(y_test, y_pred_prob)
no_skill = len(y_test[y_test==1]) / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(rf_recall, rf_precision, marker='.', label='Random Forest')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()


#####################################################
confusion = confusion_matrix(y_test, y_pred)
print(confusion)
print('model performance')
print('AUC:\t%s' % auc)
print('BACC:\t%s' % ba)
print('Accuracy:\t%s' % accuracy_model)
print('F1-Score:\t%s' % f1_score_model)
print('Sensitivity:\t%s' % sens)
print('Specificity:\t%s' % spec)
print('Kappa:\t%s' % kappa)

In [None]:
#XGBoost
# Fit a gradient-boosted classifier on the training split.
model2 = XGBClassifier(n_estimators = 300, random_state = 42, objective='binary:logistic', learning_rate=0.05)
model2.fit(X_train, y_train)

# Optional: save the fitted model to disk
#import pickle
#output = open('XGB_classifier.pkl', 'wb')
#pickle.dump(model2, output)
#output.close()


# Predictions for the held-out test split.
y_pred = model2.predict(X_test)
# Optional: probability-based metrics (AUC / balanced accuracy)
#y_pred_prob = np.round(model2.predict_proba(X_test).T[1].ravel(), 2)
#auc = auc_roc(y_test, y_pred_prob)
#ba = balanced_accuracy(y_test, y_pred)

#####################################################
confusion = confusion_matrix(y_test, y_pred)
print(confusion)

# Predictions for the external validation set.
# Fixed: this previously called model3 (the random forest), so the printed
# validation labels did not come from the XGBoost model at all.
# np.asarray() strips DataFrame column names so the input matches the plain
# arrays the model was fitted on.
y_valid = model2.predict(np.asarray(Validate))
print(y_valid)

# Test-split metrics (rounded to 2 decimals).
sens, spec = sensitivity_specificity(y_test, y_pred)
kappa = kappa_score(y_test, y_pred)
accuracy_model = accuracy(y_test, y_pred)
# f1_score may be either the notebook helper or sklearn's unrounded function,
# depending on which cells ran before; rounding here keeps the output uniform.
f1_score_model = float("{0:.2f}".format(f1_score(y_test, y_pred)))

print('Accuracy:\t%s' % accuracy_model)
print('F1-Score:\t%s' % f1_score_model)
print('Sensitivity:\t%s' % sens)
print('Specificity:\t%s' % spec)
print('Kappa:\t%s' % kappa)

In [None]:
#Support Vectors
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline
SMALL_SIZE = 10
MEDIUM_SIZE = 12

plt.rc('font', size=SMALL_SIZE)
plt.rc('axes', titlesize=MEDIUM_SIZE)
plt.rc('axes', labelsize=MEDIUM_SIZE)
plt.rcParams['figure.dpi']=150


svc = SVC()
training_start = time.perf_counter()
svc.fit(X_train, y_train)
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = svc.predict(X_test)
prediction_end = time.perf_counter()
acc_svc = (preds == y_test).sum().astype(float) / len(preds)*100
svc_train_time = training_end-training_start
svc_prediction_time = prediction_end-prediction_start
print("Scikit-Learn's Support Vector Machine Classifier's prediction accuracy is: %3.2f" % (acc_svc))
print("Time consumed for training: %4.3f seconds" % (svc_train_time))
print("Time consumed for prediction: %6.5f seconds" % (svc_prediction_time))


print('model performance')
print('AUC:\t%s' % auc)
print('BACC:\t%s' % ba)
print('Accuracy:\t%s' % accuracy_model)
print('F1-Score:\t%s' % f1_score_model)
print('Sensitivity:\t%s' % sens)
print('Specificity:\t%s' % spec)
print('Kappa:\t%s' % kappa)


In [None]:
#Grid Search with DNN
import tensorflow as tf

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Switch TensorFlow to graph (non-eager) execution for this session.
tf.compat.v1.disable_eager_execution()

# Seed NumPy's global random generator.
np.random.seed(43)

# Early-stopping callback: watches the training loss, stops after 100 epochs
# without an improvement of at least 0.001, and restores the best weights.
monitor = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    mode='auto',
    min_delta=0.001,
    patience=100,
    restore_best_weights=True,
    verbose=0,
)

def create_model(dropout_rate=0.0, neurons=32, layers=1):
    """Build and compile a fully connected Keras network.

    Args:
        dropout_rate: Rate passed to the ``Dropout`` layer after each hidden layer.
        neurons: Number of units in every hidden ``Dense`` layer.
        layers: Number of hidden (Dense + Dropout) pairs to stack.

    Returns:
        A compiled ``Sequential`` model with a single linear output unit,
        trained with MSE loss and the Adam optimizer.
    """
    network = Sequential()

    # Stack the hidden blocks; count down on a local so the argument is untouched.
    hidden_remaining = layers
    while hidden_remaining > 0:
        network.add(Dense(neurons, kernel_initializer='he_uniform', activation='relu'))
        network.add(Dropout(dropout_rate))
        hidden_remaining -= 1

    # Single linear output unit.
    network.add(Dense(1, kernel_initializer='he_uniform', activation='linear'))
    network.compile(loss='mse', optimizer='adam')
    return network

# Standardise the features with statistics learned from the TRAINING split only,
# and reuse the same fitted scaler for the test and validation data.
# Fixed: a second, independent scaler used to be fitted on the test split, and
# the validation set was fed to the network unscaled.
# NOTE: X_train / X_test are rebound to their scaled versions (as before), so
# re-run the data-loading cell before re-running this one.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): tensorflow.keras.wrappers.scikit_learn is only shipped by older
# TensorFlow releases — confirm the pinned TF version still provides it.
model = KerasRegressor(build_fn=create_model, verbose=0)

# Hyper-parameter grid: every combination is cross-validated.
dropout_rate = [0.0, 0.1, 0.2, 0.3]
neurons = [64, 128, 256, 512]
layers = [8, 16, 32]

param_grid = dict(dropout_rate=dropout_rate, neurons=neurons, layers=layers)

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', n_jobs=-1, cv=5, verbose=2)
grid_result = grid.fit(X_train, y_train, callbacks=[monitor], batch_size=32, epochs=10000)

# Optional: save the fitted search object to disk
#import pickle
#output = open('dnn_2_classifier.pkl', 'wb')
#pickle.dump(grid, output)
#output.close()

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


# Predictions for the test split: continuous regression outputs.
predictions = np.array(grid.predict(X_test)).ravel()
# Threshold at 0.5 to obtain class labels. Plain rounding of an unbounded
# linear output could yield labels such as -1 or 2, which would add extra
# rows/columns to the confusion matrix and corrupt the positional metrics.
y_pred = (predictions > 0.5).astype(int)
predictions = np.round(predictions, 2)


# calculate performance metrics
confusion = confusion_matrix(y_test, y_pred)
print(confusion)
auc = auc_roc(y_test, predictions)
ba = balanced_accuracy(y_test, y_pred)
sens, spec = sensitivity_specificity(y_test, y_pred)
kappa = kappa_score(y_test, y_pred)
accuracy_model = accuracy(y_test, y_pred)
# f1_score may be either the notebook helper or sklearn's unrounded function,
# depending on which cells ran before; rounding here keeps the output uniform.
f1_score_model = float("{0:.2f}".format(f1_score(y_test, y_pred)))
#########################################################
print('model performance')
print('AUC:\t%s' % auc)
print('BACC:\t%s' % ba)
print('Accuracy:\t%s' % accuracy_model)
print('F1-Score:\t%s' % f1_score_model)
print('Sensitivity:\t%s' % sens)
print('Specificity:\t%s' % spec)
print('Kappa:\t%s' % kappa)

#Validation
# Apply the training-set scaling before predicting; np.asarray() drops the
# DataFrame column names so the input matches the arrays the scaler was fitted on.
y_valid = grid.predict(scaler.transform(np.asarray(Validate)))
print(y_valid)