In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# List of classifiers with default parameters
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Multilayer Perceptron': MLPClassifier()
}

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_scores):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (y_pred_scores > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)

        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    return best_threshold, best_mcc

# Evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"\nTraining {clf_name}...")

    # Train the classifier
    clf.fit(X_train, y_train)

    # Get predicted probabilities or decision scores
    if clf_name == 'Support Vector Machine':
        # Use decision_function for SVC as probability=True is not default
        y_pred_scores_test = clf.decision_function(X_test)
    else:
        # Use predict_proba for classifiers that support it
        y_pred_scores_test = clf.predict_proba(X_test)[:, 1]

    # Optimize the threshold based on MCC
    best_threshold_test, best_mcc_test = optimize_threshold(y_test, y_pred_scores_test)
    y_pred_test = (y_pred_scores_test > best_threshold_test).astype(int)

    # Calculate metrics
    accuracy_test = accuracy_score(y_test, y_pred_test)
    sensitivity_test = recall_score(y_test, y_pred_test)  # Sensitivity (Recall)
    TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, y_pred_test).ravel()
    specificity_test = TN_test / (TN_test + FP_test)
    MCC_test = matthews_corrcoef(y_test, y_pred_test)

    # Compute AUC
    if clf_name == 'Support Vector Machine':
        auc_test = roc_auc_score(y_test, y_pred_scores_test)  # Use decision_function scores
    else:
        auc_test = roc_auc_score(y_test, y_pred_scores_test)

    # Compute the correct balanced accuracy
    balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

    # Print results
    print(f"\n{clf_name} Test Dataset Results:")
    print(f"Accuracy (ACC): {accuracy_test}")
    print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
    print(f"Sensitivity (Sn): {sensitivity_test}")
    print(f"Specificity (Sp): {specificity_test}")
    print(f"MCC: {MCC_test}")
    print(f"AUC: {auc_test}")
    print(f"True Positives (TP): {TP_test}")
    print(f"False Positives (FP): {FP_test}")
    print(f"True Negatives (TN): {TN_test}")
    print(f"False Negatives (FN): {FN_test}")



Training Logistic Regression...

Logistic Regression Test Dataset Results:
Accuracy (ACC): 0.8978102189781022
Balanced Accuracy (BACC): 0.8923589734569313
Sensitivity (Sn): 0.8812949640287769
Specificity (Sp): 0.9034229828850856
MCC: 0.7482237334653423
AUC: 0.9502515347135495
True Positives (TP): 245
False Positives (FP): 79
True Negatives (TN): 739
False Negatives (FN): 33

Training Random Forest...

Random Forest Test Dataset Results:
Accuracy (ACC): 0.9014598540145985
Balanced Accuracy (BACC): 0.8579972208052629
Sensitivity (Sn): 0.7697841726618705
Specificity (Sp): 0.9462102689486552
MCC: 0.7343055967120047
AUC: 0.9437081141932419
True Positives (TP): 214
False Positives (FP): 44
True Negatives (TN): 774
False Negatives (FN): 64

Training Support Vector Machine...

Support Vector Machine Test Dataset Results:
Accuracy (ACC): 0.9105839416058394
Balanced Accuracy (BACC): 0.8498619197551495
Sensitivity (Sn): 0.7266187050359713
Specificity (Sp): 0.9731051344743277
MCC: 0.7549939450996



In [3]:
#uses default parameters

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
import xgboost as xgb

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the XGBoost classifier with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False)

# Fit the model
xgb_model.fit(X_train, y_train)

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_probas):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_pred_probas > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)
        
        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold
    
    return best_threshold, best_mcc

# Predict probabilities for the test dataset
predicted_probas_test = xgb_model.predict_proba(X_test)[:, 1]
best_threshold_test, best_mcc_test = optimize_threshold(y_test, predicted_probas_test)
predicted_classes_test = (predicted_probas_test > best_threshold_test).astype(int)

# Calculate metrics for the test dataset with optimized threshold
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)  # Corrected Specificity calculation
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_probas_test)

# Compute the correct balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results:")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Print the total positive and total negative
total_positive = np.sum(y_test)
total_negative = len(y_test) - total_positive
print(f"Total Positive: {total_positive}")
print(f"Total Negative: {total_negative}")

# Evaluate on the external dataset (KELM)
dataset_external = pd.read_csv('kelm_dataset.csv', na_filter=False)
X_external_data_name = 'prot_bfd_per_protein_embeddings_kelm_dataset.csv'
X_external_data = pd.read_csv(X_external_data_name, header=0, index_col=0, delimiter=',')
X_external = np.array(X_external_data)
y_external = np.array(dataset_external['label'])

# Normalize the external dataset
X_external_normalized = scaler.transform(X_external)

# Predict probabilities for external dataset
predicted_probas_ext = xgb_model.predict_proba(X_external_normalized)[:, 1]
best_threshold_ext, best_mcc_ext = optimize_threshold(y_external, predicted_probas_ext)
predicted_classes_ext = (predicted_probas_ext > best_threshold_ext).astype(int)

# Calculate metrics for the external dataset with optimized threshold
accuracy_ext = accuracy_score(y_external, predicted_classes_ext)
sensitivity_ext = recall_score(y_external, predicted_classes_ext)  # Sensitivity (Recall)
TN_ext, FP_ext, FN_ext, TP_ext = confusion_matrix(y_external, predicted_classes_ext).ravel()
specificity_ext = TN_ext / (TN_ext + FP_ext)  # Corrected Specificity calculation
MCC_ext = matthews_corrcoef(y_external, predicted_classes_ext)
auc_ext = roc_auc_score(y_external, predicted_probas_ext)

# Compute the correct balanced accuracy
balanced_accuracy_ext = (sensitivity_ext + specificity_ext) / 2

# Print the adjusted results for the external dataset
print("\nOptimized External Dataset (KELM) Results:")
print(f"Accuracy (ACC): {accuracy_ext}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_ext}")
print(f"Sensitivity (Sn): {sensitivity_ext}")
print(f"Specificity (Sp): {specificity_ext}")
print(f"MCC: {MCC_ext}")
print(f"AUC: {auc_ext}")
print(f"True Positives (TP): {TP_ext}")
print(f"False Positives (FP): {FP_ext}")
print(f"True Negatives (TN): {TN_ext}")
print(f"False Negatives (FN): {FN_ext}")

# Print the total positive and total negative
total_positive_ext = np.sum(y_external)
total_negative_ext = len(y_external) - total_positive_ext
print(f"Total Positive: {total_positive_ext}")
print(f"Total Negative: {total_negative_ext}")

Parameters: { "use_label_encoder" } are not used.




Optimized Test Dataset Results:
Accuracy (ACC): 0.9206204379562044
Balanced Accuracy (BACC): 0.8886431197340416
Sensitivity (Sn): 0.8237410071942446
Specificity (Sp): 0.9535452322738386
MCC: 0.7878565195355035
AUC: 0.9567070060333152
True Positives (TP): 229
False Positives (FP): 38
True Negatives (TN): 780
False Negatives (FN): 49
Total Positive: 278
Total Negative: 818

Optimized External Dataset (KELM) Results:
Accuracy (ACC): 0.8333333333333334
Balanced Accuracy (BACC): 0.8333333333333334
Sensitivity (Sn): 0.6770833333333334
Specificity (Sp): 0.9895833333333334
MCC: 0.7018151412904202
AUC: 0.9098307291666666
True Positives (TP): 65
False Positives (FP): 1
True Negatives (TN): 95
False Negatives (FN): 31
Total Positive: 96
Total Negative: 96


In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=50, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

print(f"Number of selected features: {X_train.shape[1]}")

# List of classifiers with default parameters
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Multilayer Perceptron': MLPClassifier()
}

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_scores):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (y_pred_scores > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)

        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    return best_threshold, best_mcc

# Evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"\nTraining {clf_name}...")

    # Train the classifier
    clf.fit(X_train, y_train)

    # Get predicted probabilities or decision scores
    if clf_name == 'Support Vector Machine':
        # Use decision_function for SVC as probability=True is not default
        y_pred_scores_test = clf.decision_function(X_test)
    else:
        # Use predict_proba for classifiers that support it
        y_pred_scores_test = clf.predict_proba(X_test)[:, 1]

    # Optimize the threshold based on MCC
    best_threshold_test, best_mcc_test = optimize_threshold(y_test, y_pred_scores_test)
    y_pred_test = (y_pred_scores_test > best_threshold_test).astype(int)

    # Calculate metrics
    accuracy_test = accuracy_score(y_test, y_pred_test)
    sensitivity_test = recall_score(y_test, y_pred_test)  # Sensitivity (Recall)
    TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, y_pred_test).ravel()
    specificity_test = TN_test / (TN_test + FP_test)
    MCC_test = matthews_corrcoef(y_test, y_pred_test)

    # Compute AUC
    if clf_name == 'Support Vector Machine':
        auc_test = roc_auc_score(y_test, y_pred_scores_test)  # Use decision_function scores
    else:
        auc_test = roc_auc_score(y_test, y_pred_scores_test)

    # Compute the correct balanced accuracy
    balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

    # Print results
    print(f"\n{clf_name} Test Dataset Results:")
    print(f"Accuracy (ACC): {accuracy_test}")
    print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
    print(f"Sensitivity (Sn): {sensitivity_test}")
    print(f"Specificity (Sp): {specificity_test}")
    print(f"MCC: {MCC_test}")
    print(f"AUC: {auc_test}")
    print(f"True Positives (TP): {TP_test}")
    print(f"False Positives (FP): {FP_test}")
    print(f"True Negatives (TN): {TN_test}")
    print(f"False Negatives (FN): {FN_test}")



Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Number of selected features: 50

Training Logistic Regression...

Logistic Regression Test Dataset Results:
Accuracy (ACC): 0.8996350364963503
Balanced Accuracy (BACC): 0.842526956429966
Sensitivity (Sn): 0.7266187050359713
Specificity (Sp): 0.9584352078239609
MCC: 0.7251345095648278
AUC: 0.9377187736363477
True Positives (TP): 202
False Positives (FP): 34
True Negatives (TN): 784
False Negatives (FN): 76

Training Random Forest...

Random Forest Test Dataset Results:
Accuracy (ACC): 0.9023722627737226
Balanced Accuracy (BACC): 0.891853265553816
Sensitivity (Sn): 0.8705035971223022
Specificity (Sp): 0.91320293398533
MCC: 0.7549174530693232
AUC: 0.9499942833019648
True Positives (TP): 242
False Positives (FP): 71
True Negatives (TN): 747
False Negatives (FN): 36

Training Support Vector Machine...

Support Vector Machine Test Dataset Results:
Accuracy (ACC): 0.9087591240875912
Balanced Accuracy (BACC): 0.8569506253188158
Sensitivity (Sn): 0.7517985611510791
Specificity (Sp): 0.962102689

In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=100, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

print(f"Number of selected features: {X_train.shape[1]}")

# List of classifiers with default parameters
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Multilayer Perceptron': MLPClassifier()
}

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_scores):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (y_pred_scores > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)

        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    return best_threshold, best_mcc

# Evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"\nTraining {clf_name}...")

    # Train the classifier
    clf.fit(X_train, y_train)

    # Get predicted probabilities or decision scores
    if clf_name == 'Support Vector Machine':
        # Use decision_function for SVC as probability=True is not default
        y_pred_scores_test = clf.decision_function(X_test)
    else:
        # Use predict_proba for classifiers that support it
        y_pred_scores_test = clf.predict_proba(X_test)[:, 1]

    # Optimize the threshold based on MCC
    best_threshold_test, best_mcc_test = optimize_threshold(y_test, y_pred_scores_test)
    y_pred_test = (y_pred_scores_test > best_threshold_test).astype(int)

    # Calculate metrics
    accuracy_test = accuracy_score(y_test, y_pred_test)
    sensitivity_test = recall_score(y_test, y_pred_test)  # Sensitivity (Recall)
    TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, y_pred_test).ravel()
    specificity_test = TN_test / (TN_test + FP_test)
    MCC_test = matthews_corrcoef(y_test, y_pred_test)

    # Compute AUC
    if clf_name == 'Support Vector Machine':
        auc_test = roc_auc_score(y_test, y_pred_scores_test)  # Use decision_function scores
    else:
        auc_test = roc_auc_score(y_test, y_pred_scores_test)

    # Compute the correct balanced accuracy
    balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

    # Print results
    print(f"\n{clf_name} Test Dataset Results:")
    print(f"Accuracy (ACC): {accuracy_test}")
    print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
    print(f"Sensitivity (Sn): {sensitivity_test}")
    print(f"Specificity (Sp): {specificity_test}")
    print(f"MCC: {MCC_test}")
    print(f"AUC: {auc_test}")
    print(f"True Positives (TP): {TP_test}")
    print(f"False Positives (FP): {FP_test}")
    print(f"True Negatives (TN): {TN_test}")
    print(f"False Negatives (FN): {FN_test}")



Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Number of selected features: 100

Training Logistic Regression...

Logistic Regression Test Dataset Results:
Accuracy (ACC): 0.8996350364963503
Balanced Accuracy (BACC): 0.867460554783557
Sensitivity (Sn): 0.802158273381295
Specificity (Sp): 0.9327628361858191
MCC: 0.734921109567114
AUC: 0.9445920036586868
True Positives (TP): 223
False Positives (FP): 55
True Negatives (TN): 763
False Negatives (FN): 55

Training Random Forest...

Random Forest Test Dataset Results:
Accuracy (ACC): 0.9014598540145985
Balanced Accuracy (BACC): 0.8556225923906352
Sensitivity (Sn): 0.762589928057554
Specificity (Sp): 0.9486552567237164
MCC: 0.7334077331699584
AUC: 0.9442468030465603
True Positives (TP): 212
False Positives (FP): 42
True Negatives (TN): 776
False Negatives (FN): 66

Training Support Vector Machine...

Support Vector Machine Test Dataset Results:
Accuracy (ACC): 0.9105839416058394
Balanced Accuracy (BACC): 0.8522365481697771
Sensitivity (Sn): 0.7338129496402878
Specificity (Sp): 0.97066014

In [6]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=320, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

print(f"Number of selected features: {X_train.shape[1]}")

# List of classifiers with default parameters
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Multilayer Perceptron': MLPClassifier()
}

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_scores):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (y_pred_scores > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)

        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    return best_threshold, best_mcc

# Evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"\nTraining {clf_name}...")

    # Train the classifier
    clf.fit(X_train, y_train)

    # Get predicted probabilities or decision scores
    if clf_name == 'Support Vector Machine':
        # Use decision_function for SVC as probability=True is not default
        y_pred_scores_test = clf.decision_function(X_test)
    else:
        # Use predict_proba for classifiers that support it
        y_pred_scores_test = clf.predict_proba(X_test)[:, 1]

    # Optimize the threshold based on MCC
    best_threshold_test, best_mcc_test = optimize_threshold(y_test, y_pred_scores_test)
    y_pred_test = (y_pred_scores_test > best_threshold_test).astype(int)

    # Calculate metrics
    accuracy_test = accuracy_score(y_test, y_pred_test)
    sensitivity_test = recall_score(y_test, y_pred_test)  # Sensitivity (Recall)
    TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, y_pred_test).ravel()
    specificity_test = TN_test / (TN_test + FP_test)
    MCC_test = matthews_corrcoef(y_test, y_pred_test)

    # Compute AUC
    if clf_name == 'Support Vector Machine':
        auc_test = roc_auc_score(y_test, y_pred_scores_test)  # Use decision_function scores
    else:
        auc_test = roc_auc_score(y_test, y_pred_scores_test)

    # Compute the correct balanced accuracy
    balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

    # Print results
    print(f"\n{clf_name} Test Dataset Results:")
    print(f"Accuracy (ACC): {accuracy_test}")
    print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
    print(f"Sensitivity (Sn): {sensitivity_test}")
    print(f"Specificity (Sp): {specificity_test}")
    print(f"MCC: {MCC_test}")
    print(f"AUC: {auc_test}")
    print(f"True Positives (TP): {TP_test}")
    print(f"False Positives (FP): {FP_test}")
    print(f"True Negatives (TN): {TN_test}")
    print(f"False Negatives (FN): {FN_test}")



Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Number of selected features: 320

Training Logistic Regression...

Logistic Regression Test Dataset Results:
Accuracy (ACC): 0.8959854014598541
Balanced Accuracy (BACC): 0.8792633374962622
Sensitivity (Sn): 0.8453237410071942
Specificity (Sp): 0.91320293398533
MCC: 0.7356913487758071
AUC: 0.9488927195651792
True Positives (TP): 235
False Positives (FP): 71
True Negatives (TN): 747
False Negatives (FN): 43

Training Random Forest...

Random Forest Test Dataset Results:
Accuracy (ACC): 0.9114963503649635
Balanced Accuracy (BACC): 0.8682828798086226
Sensitivity (Sn): 0.7805755395683454
Specificity (Sp): 0.9559902200488998
MCC: 0.7605653363339069
AUC: 0.9454583032840231
True Positives (TP): 217
False Positives (FP): 36
True Negatives (TN): 782
False Negatives (FN): 61

Training Support Vector Machine...

Support Vector Machine Test Dataset Results:
Accuracy (ACC): 0.9133211678832117
Balanced Accuracy (BACC): 0.854070289001073
Sensitivity (Sn): 0.7338129496402878
Specificity (Sp): 0.9743276



In [7]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=500, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

print(f"Number of selected features: {X_train.shape[1]}")

# List of classifiers with default parameters
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Multilayer Perceptron': MLPClassifier()
}

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_scores):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (y_pred_scores > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)

        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    return best_threshold, best_mcc

# Evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"\nTraining {clf_name}...")

    # Train the classifier
    clf.fit(X_train, y_train)

    # Get predicted probabilities or decision scores
    if clf_name == 'Support Vector Machine':
        # Use decision_function for SVC as probability=True is not default
        y_pred_scores_test = clf.decision_function(X_test)
    else:
        # Use predict_proba for classifiers that support it
        y_pred_scores_test = clf.predict_proba(X_test)[:, 1]

    # Optimize the threshold based on MCC
    best_threshold_test, best_mcc_test = optimize_threshold(y_test, y_pred_scores_test)
    y_pred_test = (y_pred_scores_test > best_threshold_test).astype(int)

    # Calculate metrics
    accuracy_test = accuracy_score(y_test, y_pred_test)
    sensitivity_test = recall_score(y_test, y_pred_test)  # Sensitivity (Recall)
    TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, y_pred_test).ravel()
    specificity_test = TN_test / (TN_test + FP_test)
    MCC_test = matthews_corrcoef(y_test, y_pred_test)

    # Compute AUC
    if clf_name == 'Support Vector Machine':
        auc_test = roc_auc_score(y_test, y_pred_scores_test)  # Use decision_function scores
    else:
        auc_test = roc_auc_score(y_test, y_pred_scores_test)

    # Compute the correct balanced accuracy
    balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

    # Print results
    print(f"\n{clf_name} Test Dataset Results:")
    print(f"Accuracy (ACC): {accuracy_test}")
    print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
    print(f"Sensitivity (Sn): {sensitivity_test}")
    print(f"Specificity (Sp): {specificity_test}")
    print(f"MCC: {MCC_test}")
    print(f"AUC: {auc_test}")
    print(f"True Positives (TP): {TP_test}")
    print(f"False Positives (FP): {FP_test}")
    print(f"True Negatives (TN): {TN_test}")
    print(f"False Negatives (FN): {FN_test}")



Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Number of selected features: 500

Training Logistic Regression...

Logistic Regression Test Dataset Results:
Accuracy (ACC): 0.8996350364963503
Balanced Accuracy (BACC): 0.8828956394786371
Sensitivity (Sn): 0.8489208633093526
Specificity (Sp): 0.9168704156479217
MCC: 0.7442349882360484
AUC: 0.9495831207894322
True Positives (TP): 236
False Positives (FP): 68
True Negatives (TN): 750
False Negatives (FN): 42

Training Random Forest...

Random Forest Test Dataset Results:
Accuracy (ACC): 0.9032846715328468
Balanced Accuracy (BACC): 0.8758421135951875
Sensitivity (Sn): 0.8201438848920863
Specificity (Sp): 0.9315403422982885
MCC: 0.7464441221751931
AUC: 0.9434244780214948
True Positives (TP): 228
False Positives (FP): 56
True Negatives (TN): 762
False Negatives (FN): 50

Training Support Vector Machine...

Support Vector Machine Test Dataset Results:
Accuracy (ACC): 0.9096715328467153
Balanced Accuracy (BACC): 0.8492506728113842
Sensitivity (Sn): 0.7266187050359713
Specificity (Sp): 0.9718

In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=640, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

print(f"Number of selected features: {X_train.shape[1]}")

# List of classifiers with default parameters
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Multilayer Perceptron': MLPClassifier()
}

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_scores):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (y_pred_scores > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)

        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    return best_threshold, best_mcc

# Evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"\nTraining {clf_name}...")

    # Train the classifier
    clf.fit(X_train, y_train)

    # Get predicted probabilities or decision scores
    if clf_name == 'Support Vector Machine':
        # Use decision_function for SVC as probability=True is not default
        y_pred_scores_test = clf.decision_function(X_test)
    else:
        # Use predict_proba for classifiers that support it
        y_pred_scores_test = clf.predict_proba(X_test)[:, 1]

    # Optimize the threshold based on MCC
    best_threshold_test, best_mcc_test = optimize_threshold(y_test, y_pred_scores_test)
    y_pred_test = (y_pred_scores_test > best_threshold_test).astype(int)

    # Calculate metrics
    accuracy_test = accuracy_score(y_test, y_pred_test)
    sensitivity_test = recall_score(y_test, y_pred_test)  # Sensitivity (Recall)
    TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, y_pred_test).ravel()
    specificity_test = TN_test / (TN_test + FP_test)
    MCC_test = matthews_corrcoef(y_test, y_pred_test)

    # Compute AUC
    if clf_name == 'Support Vector Machine':
        auc_test = roc_auc_score(y_test, y_pred_scores_test)  # Use decision_function scores
    else:
        auc_test = roc_auc_score(y_test, y_pred_scores_test)

    # Compute the correct balanced accuracy
    balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

    # Print results
    print(f"\n{clf_name} Test Dataset Results:")
    print(f"Accuracy (ACC): {accuracy_test}")
    print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
    print(f"Sensitivity (Sn): {sensitivity_test}")
    print(f"Specificity (Sp): {specificity_test}")
    print(f"MCC: {MCC_test}")
    print(f"AUC: {auc_test}")
    print(f"True Positives (TP): {TP_test}")
    print(f"False Positives (FP): {FP_test}")
    print(f"True Negatives (TN): {TN_test}")
    print(f"False Negatives (FN): {FN_test}")



Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Number of selected features: 640

Training Logistic Regression...

Logistic Regression Test Dataset Results:
Accuracy (ACC): 0.9014598540145985
Balanced Accuracy (BACC): 0.8841181333661677
Sensitivity (Sn): 0.8489208633093526
Specificity (Sp): 0.9193154034229829
MCC: 0.7481352750694481
AUC: 0.9491873493869941
True Positives (TP): 236
False Positives (FP): 66
True Negatives (TN): 752
False Negatives (FN): 42

Training Random Forest...

Random Forest Test Dataset Results:
Accuracy (ACC): 0.9096715328467153
Balanced Accuracy (BACC): 0.8634984432991504
Sensitivity (Sn): 0.7697841726618705
Specificity (Sp): 0.9572127139364304
MCC: 0.7549013848153419
AUC: 0.9440159363951381
True Positives (TP): 214
False Positives (FP): 35
True Negatives (TN): 783
False Negatives (FN): 64

Training Support Vector Machine...

Support Vector Machine Test Dataset Results:
Accuracy (ACC): 0.9105839416058394
Balanced Accuracy (BACC): 0.8462999771332078
Sensitivity (Sn): 0.7158273381294964
Specificity (Sp): 0.9767

In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=800, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

print(f"Number of selected features: {X_train.shape[1]}")

# List of classifiers with default parameters
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Multilayer Perceptron': MLPClassifier()
}

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_scores):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (y_pred_scores > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)

        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    return best_threshold, best_mcc

# Evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"\nTraining {clf_name}...")

    # Train the classifier
    clf.fit(X_train, y_train)

    # Get predicted probabilities or decision scores
    if clf_name == 'Support Vector Machine':
        # Use decision_function for SVC as probability=True is not default
        y_pred_scores_test = clf.decision_function(X_test)
    else:
        # Use predict_proba for classifiers that support it
        y_pred_scores_test = clf.predict_proba(X_test)[:, 1]

    # Optimize the threshold based on MCC
    best_threshold_test, best_mcc_test = optimize_threshold(y_test, y_pred_scores_test)
    y_pred_test = (y_pred_scores_test > best_threshold_test).astype(int)

    # Calculate metrics
    accuracy_test = accuracy_score(y_test, y_pred_test)
    sensitivity_test = recall_score(y_test, y_pred_test)  # Sensitivity (Recall)
    TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, y_pred_test).ravel()
    specificity_test = TN_test / (TN_test + FP_test)
    MCC_test = matthews_corrcoef(y_test, y_pred_test)

    # Compute AUC
    if clf_name == 'Support Vector Machine':
        auc_test = roc_auc_score(y_test, y_pred_scores_test)  # Use decision_function scores
    else:
        auc_test = roc_auc_score(y_test, y_pred_scores_test)

    # Compute the correct balanced accuracy
    balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

    # Print results
    print(f"\n{clf_name} Test Dataset Results:")
    print(f"Accuracy (ACC): {accuracy_test}")
    print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
    print(f"Sensitivity (Sn): {sensitivity_test}")
    print(f"Specificity (Sp): {specificity_test}")
    print(f"MCC: {MCC_test}")
    print(f"AUC: {auc_test}")
    print(f"True Positives (TP): {TP_test}")
    print(f"False Positives (FP): {FP_test}")
    print(f"True Negatives (TN): {TN_test}")
    print(f"False Negatives (FN): {FN_test}")



Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Number of selected features: 800

Training Logistic Regression...

Logistic Regression Test Dataset Results:
Accuracy (ACC): 0.9032846715328468
Balanced Accuracy (BACC): 0.8734674851805597
Sensitivity (Sn): 0.8129496402877698
Specificity (Sp): 0.9339853300733496
MCC: 0.7451740930456007
AUC: 0.949578723329405
True Positives (TP): 226
False Positives (FP): 54
True Negatives (TN): 764
False Negatives (FN): 52

Training Random Forest...

Random Forest Test Dataset Results:
Accuracy (ACC): 0.9014598540145985
Balanced Accuracy (BACC): 0.873432305500343
Sensitivity (Sn): 0.8165467625899281
Specificity (Sp): 0.9303178484107579
MCC: 0.7416581042626176
AUC: 0.9484903519727006
True Positives (TP): 227
False Positives (FP): 57
True Negatives (TN): 761
False Negatives (FN): 51

Training Support Vector Machine...

Support Vector Machine Test Dataset Results:
Accuracy (ACC): 0.9087591240875912
Balanced Accuracy (BACC): 0.8462647974529911
Sensitivity (Sn): 0.7194244604316546
Specificity (Sp): 0.973105

In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=1000, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

print(f"Number of selected features: {X_train.shape[1]}")

# List of classifiers with default parameters
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Multilayer Perceptron': MLPClassifier()
}

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_scores):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (y_pred_scores > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)

        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    return best_threshold, best_mcc

# Evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"\nTraining {clf_name}...")

    # Train the classifier
    clf.fit(X_train, y_train)

    # Get predicted probabilities or decision scores
    if clf_name == 'Support Vector Machine':
        # Use decision_function for SVC as probability=True is not default
        y_pred_scores_test = clf.decision_function(X_test)
    else:
        # Use predict_proba for classifiers that support it
        y_pred_scores_test = clf.predict_proba(X_test)[:, 1]

    # Optimize the threshold based on MCC
    best_threshold_test, best_mcc_test = optimize_threshold(y_test, y_pred_scores_test)
    y_pred_test = (y_pred_scores_test > best_threshold_test).astype(int)

    # Calculate metrics
    accuracy_test = accuracy_score(y_test, y_pred_test)
    sensitivity_test = recall_score(y_test, y_pred_test)  # Sensitivity (Recall)
    TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, y_pred_test).ravel()
    specificity_test = TN_test / (TN_test + FP_test)
    MCC_test = matthews_corrcoef(y_test, y_pred_test)

    # Compute AUC
    if clf_name == 'Support Vector Machine':
        auc_test = roc_auc_score(y_test, y_pred_scores_test)  # Use decision_function scores
    else:
        auc_test = roc_auc_score(y_test, y_pred_scores_test)

    # Compute the correct balanced accuracy
    balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

    # Print results
    print(f"\n{clf_name} Test Dataset Results:")
    print(f"Accuracy (ACC): {accuracy_test}")
    print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
    print(f"Sensitivity (Sn): {sensitivity_test}")
    print(f"Specificity (Sp): {specificity_test}")
    print(f"MCC: {MCC_test}")
    print(f"AUC: {auc_test}")
    print(f"True Positives (TP): {TP_test}")
    print(f"False Positives (FP): {FP_test}")
    print(f"True Negatives (TN): {TN_test}")
    print(f"False Negatives (FN): {FN_test}")



Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Number of selected features: 1000

Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Test Dataset Results:
Accuracy (ACC): 0.9032846715328468
Balanced Accuracy (BACC): 0.8865279414610121
Sensitivity (Sn): 0.8525179856115108
Specificity (Sp): 0.9205378973105135
MCC: 0.7528287854385157
AUC: 0.9502163550333328
True Positives (TP): 237
False Positives (FP): 65
True Negatives (TN): 753
False Negatives (FN): 41

Training Random Forest...

Random Forest Test Dataset Results:
Accuracy (ACC): 0.9096715328467153
Balanced Accuracy (BACC): 0.8813081564088583
Sensitivity (Sn): 0.8237410071942446
Specificity (Sp): 0.9388753056234719
MCC: 0.7617141278957217
AUC: 0.9467049832017027
True Positives (TP): 229
False Positives (FP): 50
True Negatives (TN): 768
False Negatives (FN): 49

Training Support Vector Machine...

Support Vector Machine Test Dataset Results:
Accuracy (ACC): 0.9096715328467153
Balanced Accuracy (BACC): 0.8480633586040702
Sensitivity (Sn): 0.7230215827338129
Specificity (Sp): 0.9731051344743277
MCC: 0.7523636394833879
AUC: 0.9506517035760145
True 

In [14]:
#uses default parameters

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
import xgboost as xgb

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=1000, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

# Initialize the XGBoost classifier with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False)

# Fit the model
xgb_model.fit(X_train, y_train)

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_probas):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_pred_probas > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)
        
        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold
    
    return best_threshold, best_mcc

# Predict probabilities for the test dataset
predicted_probas_test = xgb_model.predict_proba(X_test)[:, 1]
best_threshold_test, best_mcc_test = optimize_threshold(y_test, predicted_probas_test)
predicted_classes_test = (predicted_probas_test > best_threshold_test).astype(int)

# Calculate metrics for the test dataset with optimized threshold
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)  # Corrected Specificity calculation
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_probas_test)

# Compute the correct balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results:")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Print the total positive and total negative
total_positive = np.sum(y_test)
total_negative = len(y_test) - total_positive
print(f"Total Positive: {total_positive}")
print(f"Total Negative: {total_negative}")


Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Optimized Test Dataset Results:
Accuracy (ACC): 0.9151459854014599
Balanced Accuracy (BACC): 0.8742898102056252
Sensitivity (Sn): 0.7913669064748201
Specificity (Sp): 0.9572127139364304
MCC: 0.7708483750283488
AUC: 0.9572039190163762
True Positives (TP): 220
False Positives (FP): 35
True Negatives (TN): 783
False Negatives (FN): 58
Total Positive: 278
Total Negative: 818


In [15]:
#uses default parameters

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
import xgboost as xgb

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=800, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

# Initialize the XGBoost classifier with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False)

# Fit the model
xgb_model.fit(X_train, y_train)

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_probas):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_pred_probas > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)
        
        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold
    
    return best_threshold, best_mcc

# Predict probabilities for the test dataset
predicted_probas_test = xgb_model.predict_proba(X_test)[:, 1]
best_threshold_test, best_mcc_test = optimize_threshold(y_test, predicted_probas_test)
predicted_classes_test = (predicted_probas_test > best_threshold_test).astype(int)

# Calculate metrics for the test dataset with optimized threshold
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)  # Corrected Specificity calculation
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_probas_test)

# Compute the correct balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results:")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Print the total positive and total negative
total_positive = np.sum(y_test)
total_negative = len(y_test) - total_positive
print(f"Total Positive: {total_positive}")
print(f"Total Negative: {total_negative}")


Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Optimized Test Dataset Results:
Accuracy (ACC): 0.9206204379562044
Balanced Accuracy (BACC): 0.9005162618071801
Sensitivity (Sn): 0.8597122302158273
Specificity (Sp): 0.941320293398533
MCC: 0.7927458740057995
AUC: 0.961632161263654
True Positives (TP): 239
False Positives (FP): 48
True Negatives (TN): 770
False Negatives (FN): 39
Total Positive: 278
Total Negative: 818


In [16]:
#uses default parameters

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
import xgboost as xgb

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=640, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

# Initialize the XGBoost classifier with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False)

# Fit the model
xgb_model.fit(X_train, y_train)

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_probas):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_pred_probas > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)
        
        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold
    
    return best_threshold, best_mcc

# Predict probabilities for the test dataset
predicted_probas_test = xgb_model.predict_proba(X_test)[:, 1]
best_threshold_test, best_mcc_test = optimize_threshold(y_test, predicted_probas_test)
predicted_classes_test = (predicted_probas_test > best_threshold_test).astype(int)

# Calculate metrics for the test dataset with optimized threshold
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)  # Corrected Specificity calculation
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_probas_test)

# Compute the correct balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results:")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Print the total positive and total negative
total_positive = np.sum(y_test)
total_negative = len(y_test) - total_positive
print(f"Total Positive: {total_positive}")
print(f"Total Negative: {total_negative}")


Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Optimized Test Dataset Results:
Accuracy (ACC): 0.9197080291970803
Balanced Accuracy (BACC): 0.8714094738878824
Sensitivity (Sn): 0.7733812949640287
Specificity (Sp): 0.969437652811736
MCC: 0.7815189619833018
AUC: 0.952419482506904
True Positives (TP): 215
False Positives (FP): 25
True Negatives (TN): 793
False Negatives (FN): 63
Total Positive: 278
Total Negative: 818


In [17]:
#uses default parameters

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
import xgboost as xgb

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=500, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

# Initialize the XGBoost classifier with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False)

# Fit the model
xgb_model.fit(X_train, y_train)

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_probas):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_pred_probas > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)
        
        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold
    
    return best_threshold, best_mcc

# Predict probabilities for the test dataset
predicted_probas_test = xgb_model.predict_proba(X_test)[:, 1]
best_threshold_test, best_mcc_test = optimize_threshold(y_test, predicted_probas_test)
predicted_classes_test = (predicted_probas_test > best_threshold_test).astype(int)

# Calculate metrics for the test dataset with optimized threshold
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)  # Corrected Specificity calculation
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_probas_test)

# Compute the correct balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results:")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Print the total positive and total negative
total_positive = np.sum(y_test)
total_negative = len(y_test) - total_positive
print(f"Total Positive: {total_positive}")
print(f"Total Negative: {total_negative}")


Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Optimized Test Dataset Results:
Accuracy (ACC): 0.9124087591240876
Balanced Accuracy (BACC): 0.8950150393132926
Sensitivity (Sn): 0.8597122302158273
Specificity (Sp): 0.9303178484107579
MCC: 0.7741976115612884
AUC: 0.9540685300170622
True Positives (TP): 239
False Positives (FP): 57
True Negatives (TN): 761
False Negatives (FN): 39
Total Positive: 278
Total Negative: 818


In [18]:
#uses default parameters

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
import xgboost as xgb

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=320, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

# Initialize the XGBoost classifier with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False)

# Fit the model
xgb_model.fit(X_train, y_train)

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_probas):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_pred_probas > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)
        
        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold
    
    return best_threshold, best_mcc

# Predict probabilities for the test dataset
predicted_probas_test = xgb_model.predict_proba(X_test)[:, 1]
best_threshold_test, best_mcc_test = optimize_threshold(y_test, predicted_probas_test)
predicted_classes_test = (predicted_probas_test > best_threshold_test).astype(int)

# Calculate metrics for the test dataset with optimized threshold
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)  # Corrected Specificity calculation
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_probas_test)

# Compute the correct balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results:")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Print the total positive and total negative
total_positive = np.sum(y_test)
total_negative = len(y_test) - total_positive
print(f"Total Positive: {total_positive}")
print(f"Total Negative: {total_negative}")


Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Optimized Test Dataset Results:
Accuracy (ACC): 0.906934306569343
Balanced Accuracy (BACC): 0.8937221860653287
Sensitivity (Sn): 0.8669064748201439
Specificity (Sp): 0.9205378973105135
MCC: 0.7637384831910247
AUC: 0.9551503051837258
True Positives (TP): 241
False Positives (FP): 65
True Negatives (TN): 753
False Negatives (FN): 37
Total Positive: 278
Total Negative: 818


In [19]:
#uses default parameters

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
import xgboost as xgb

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=100, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

# Initialize the XGBoost classifier with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False)

# Fit the model
xgb_model.fit(X_train, y_train)

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_probas):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_pred_probas > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)
        
        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold
    
    return best_threshold, best_mcc

# Predict probabilities for the test dataset
predicted_probas_test = xgb_model.predict_proba(X_test)[:, 1]
best_threshold_test, best_mcc_test = optimize_threshold(y_test, predicted_probas_test)
predicted_classes_test = (predicted_probas_test > best_threshold_test).astype(int)

# Calculate metrics for the test dataset with optimized threshold
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)  # Corrected Specificity calculation
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_probas_test)

# Compute the correct balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results:")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Print the total positive and total negative
total_positive = np.sum(y_test)
total_negative = len(y_test) - total_positive
print(f"Total Positive: {total_positive}")
print(f"Total Negative: {total_negative}")


Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Optimized Test Dataset Results:
Accuracy (ACC): 0.916970802919708
Balanced Accuracy (BACC): 0.8921347029955498
Sensitivity (Sn): 0.841726618705036
Specificity (Sp): 0.9425427872860636
MCC: 0.781506076507832
AUC: 0.9564211711315544
True Positives (TP): 234
False Positives (FP): 47
True Negatives (TN): 771
False Negatives (FN): 44
Total Positive: 278
Total Negative: 818


In [20]:
#uses default parameters

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
import xgboost as xgb

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'prot_t5_xl_bfd_per_protein_embeddings.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform feature selection with RFE
print("\nPerforming feature selection with RFE...")
base_estimator = LogisticRegression()  # Base estimator for RFE
rfe = RFE(estimator=base_estimator, n_features_to_select=50, step=10)  # Select top 50 features
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

# Initialize the XGBoost classifier with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False)

# Fit the model
xgb_model.fit(X_train, y_train)

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_probas):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_pred_probas > threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)
        
        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold
    
    return best_threshold, best_mcc

# Predict probabilities for the test dataset
predicted_probas_test = xgb_model.predict_proba(X_test)[:, 1]
best_threshold_test, best_mcc_test = optimize_threshold(y_test, predicted_probas_test)
predicted_classes_test = (predicted_probas_test > best_threshold_test).astype(int)

# Calculate metrics for the test dataset with optimized threshold
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)  # Corrected Specificity calculation
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_probas_test)

# Compute the correct balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results:")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Print the total positive and total negative
total_positive = np.sum(y_test)
total_negative = len(y_test) - total_positive
print(f"Total Positive: {total_positive}")
print(f"Total Negative: {total_negative}")


Performing feature selection with RFE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Optimized Test Dataset Results:
Accuracy (ACC): 0.9087591240875912
Balanced Accuracy (BACC): 0.8747603384285236
Sensitivity (Sn): 0.8057553956834532
Specificity (Sp): 0.9437652811735942
MCC: 0.7568516502186059
AUC: 0.9502119575733057
True Positives (TP): 224
False Positives (FP): 46
True Negatives (TN): 772
False Negatives (FN): 54
Total Positive: 278
Total Negative: 818


Parameters: { "use_label_encoder" } are not used.

