In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'whole_sample_dataset_esm2_t12_35M_UR50D_unified_480_dimension.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# List of classifiers with default parameters
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Multilayer Perceptron': MLPClassifier()
}

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_scores):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (y_pred_scores > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)

        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    return best_threshold, best_mcc

# Evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"\nTraining {clf_name}...")

    # Train the classifier
    clf.fit(X_train, y_train)

    # Get predicted probabilities or decision scores
    if clf_name == 'Support Vector Machine':
        # Use decision_function for SVC as probability=True is not default
        y_pred_scores_test = clf.decision_function(X_test)
    else:
        # Use predict_proba for classifiers that support it
        y_pred_scores_test = clf.predict_proba(X_test)[:, 1]

    # Optimize the threshold based on MCC
    best_threshold_test, best_mcc_test = optimize_threshold(y_test, y_pred_scores_test)
    y_pred_test = (y_pred_scores_test > best_threshold_test).astype(int)

    # Calculate metrics
    accuracy_test = accuracy_score(y_test, y_pred_test)
    sensitivity_test = recall_score(y_test, y_pred_test)  # Sensitivity (Recall)
    TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, y_pred_test).ravel()
    specificity_test = TN_test / (TN_test + FP_test)
    MCC_test = matthews_corrcoef(y_test, y_pred_test)

    # Compute AUC
    if clf_name == 'Support Vector Machine':
        auc_test = roc_auc_score(y_test, y_pred_scores_test)  # Use decision_function scores
    else:
        auc_test = roc_auc_score(y_test, y_pred_scores_test)

    # Compute the correct balanced accuracy
    balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

    # Print results
    print(f"\n{clf_name} Test Dataset Results:")
    print(f"Accuracy (ACC): {accuracy_test}")
    print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
    print(f"Sensitivity (Sn): {sensitivity_test}")
    print(f"Specificity (Sp): {specificity_test}")
    print(f"MCC: {MCC_test}")
    print(f"AUC: {auc_test}")
    print(f"True Positives (TP): {TP_test}")
    print(f"False Positives (FP): {FP_test}")
    print(f"True Negatives (TN): {TN_test}")
    print(f"False Negatives (FN): {FN_test}")



Training Logistic Regression...

Logistic Regression Test Dataset Results:
Accuracy (ACC): 0.9114963503649635
Balanced Accuracy (BACC): 0.8789687076744472
Sensitivity (Sn): 0.8129496402877698
Specificity (Sp): 0.9449877750611247
MCC: 0.7644001726812346
AUC: 0.9469930168334769
True Positives (TP): 226
False Positives (FP): 45
True Negatives (TN): 773
False Negatives (FN): 52

Training Random Forest...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Random Forest Test Dataset Results:
Accuracy (ACC): 0.9142335766423357
Balanced Accuracy (BACC): 0.8867390195423124
Sensitivity (Sn): 0.8309352517985612
Specificity (Sp): 0.9425427872860636
MCC: 0.7734780390846248
AUC: 0.9504802026349579
True Positives (TP): 231
False Positives (FP): 47
True Negatives (TN): 771
False Negatives (FN): 47

Training Support Vector Machine...

Support Vector Machine Test Dataset Results:
Accuracy (ACC): 0.9096715328467153
Balanced Accuracy (BACC): 0.8492506728113842
Sensitivity (Sn): 0.7266187050359713
Specificity (Sp): 0.9718826405867971
MCC: 0.7524297529919032
AUC: 0.9528548310495857
True Positives (TP): 202
False Positives (FP): 23
True Negatives (TN): 795
False Negatives (FN): 76

Training K-Nearest Neighbors...

K-Nearest Neighbors Test Dataset Results:
Accuracy (ACC): 0.9206204379562044
Balanced Accuracy (BACC): 0.8791446060755308
Sensitivity (Sn): 0.7949640287769785
Specificity (Sp): 0.9633251833740831
MCC: 0.7851790658619568
AUC: 0.9395635081177112

In [4]:
#uses default parameters

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
import xgboost as xgb

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'whole_sample_dataset_esm2_t12_35M_UR50D_unified_480_dimension.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the XGBoost classifier with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False)

# Fit the model
xgb_model.fit(X_train, y_train)

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_probas):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_pred_probas > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)
        
        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold
    
    return best_threshold, best_mcc

# Predict probabilities for the test dataset
predicted_probas_test = xgb_model.predict_proba(X_test)[:, 1]
best_threshold_test, best_mcc_test = optimize_threshold(y_test, predicted_probas_test)
predicted_classes_test = (predicted_probas_test > best_threshold_test).astype(int)

# Calculate metrics for the test dataset with optimized threshold
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)  # Corrected Specificity calculation
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_probas_test)

# Compute the correct balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results:")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Print the total positive and total negative
total_positive = np.sum(y_test)
total_negative = len(y_test) - total_positive
print(f"Total Positive: {total_positive}")
print(f"Total Negative: {total_negative}")

# Evaluate on the external dataset (KELM)
dataset_external = pd.read_csv('kelm_dataset.csv', na_filter=False)
X_external_data_name = 'kelm_dataset_esm2_t12_35M_UR50D_unified_480_dimension.csv'
X_external_data = pd.read_csv(X_external_data_name, header=0, index_col=0, delimiter=',')
X_external = np.array(X_external_data)
y_external = np.array(dataset_external['label'])

# Normalize the external dataset
X_external_normalized = scaler.transform(X_external)

# Predict probabilities for external dataset
predicted_probas_ext = xgb_model.predict_proba(X_external_normalized)[:, 1]
best_threshold_ext, best_mcc_ext = optimize_threshold(y_external, predicted_probas_ext)
predicted_classes_ext = (predicted_probas_ext > best_threshold_ext).astype(int)

# Calculate metrics for the external dataset with optimized threshold
accuracy_ext = accuracy_score(y_external, predicted_classes_ext)
sensitivity_ext = recall_score(y_external, predicted_classes_ext)  # Sensitivity (Recall)
TN_ext, FP_ext, FN_ext, TP_ext = confusion_matrix(y_external, predicted_classes_ext).ravel()
specificity_ext = TN_ext / (TN_ext + FP_ext)  # Corrected Specificity calculation
MCC_ext = matthews_corrcoef(y_external, predicted_classes_ext)
auc_ext = roc_auc_score(y_external, predicted_probas_ext)

# Compute the correct balanced accuracy
balanced_accuracy_ext = (sensitivity_ext + specificity_ext) / 2

# Print the adjusted results for the external dataset
print("\nOptimized External Dataset (KELM) Results:")
print(f"Accuracy (ACC): {accuracy_ext}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_ext}")
print(f"Sensitivity (Sn): {sensitivity_ext}")
print(f"Specificity (Sp): {specificity_ext}")
print(f"MCC: {MCC_ext}")
print(f"AUC: {auc_ext}")
print(f"True Positives (TP): {TP_ext}")
print(f"False Positives (FP): {FP_ext}")
print(f"True Negatives (TN): {TN_ext}")
print(f"False Negatives (FN): {FN_ext}")

# Print the total positive and total negative
total_positive_ext = np.sum(y_external)
total_negative_ext = len(y_external) - total_positive_ext
print(f"Total Positive: {total_positive_ext}")
print(f"Total Negative: {total_negative_ext}")


Parameters: { "use_label_encoder" } are not used.




Optimized Test Dataset Results:
Accuracy (ACC): 0.9224452554744526
Balanced Accuracy (BACC): 0.8898656136215721
Sensitivity (Sn): 0.8237410071942446
Specificity (Sp): 0.9559902200488998
MCC: 0.7923563239145421
AUC: 0.961869624105117
True Positives (TP): 229
False Positives (FP): 36
True Negatives (TN): 782
False Negatives (FN): 49
Total Positive: 278
Total Negative: 818

Optimized External Dataset (KELM) Results:
Accuracy (ACC): 0.828125
Balanced Accuracy (BACC): 0.828125
Sensitivity (Sn): 0.6770833333333334
Specificity (Sp): 0.9791666666666666
MCC: 0.6884115395322729
AUC: 0.9019097222222223
True Positives (TP): 65
False Positives (FP): 2
True Negatives (TN): 94
False Negatives (FN): 31
Total Positive: 96
Total Negative: 96
