In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'whole_sample_dataset_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the k-NN classifier with GridSearch for hyperparameter tuning
def train_knn(X_train, y_train):
    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_neighbors': [3, 5, 7, 9],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    }
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    # Get the best model from GridSearchCV
    best_knn = grid_search.best_estimator_
    return best_knn

# Train the k-NN model
best_knn_model = train_knn(X_train, y_train)

# Make predictions on the test dataset
predicted_classes_test = best_knn_model.predict(X_test)

# Calculate metrics for the test dataset
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)  # Corrected Specificity calculation
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_classes_test)

# Compute the correct balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2


In [2]:
# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results (k-NN):")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Print the total positive and total negative
total_positive = np.sum(y_test)
total_negative = len(y_test) - total_positive
print(f"Total Positive: {total_positive}")
print(f"Total Negative: {total_negative}")


Optimized Test Dataset Results (k-NN):
Accuracy (ACC): 0.926094890510949
Balanced Accuracy (BACC): 0.886374030360064
Sensitivity (Sn): 0.8057553956834532
Specificity (Sp): 0.9669926650366748
MCC: 0.8001506427101813
AUC: 0.8863740303600639
True Positives (TP): 224
False Positives (FP): 27
True Negatives (TN): 791
False Negatives (FN): 54
Total Positive: 278
Total Negative: 818


In [3]:
# Evaluate on the external dataset (KELM)
dataset_external = pd.read_csv('kelm_dataset.csv', na_filter=False)
X_external_data_name = 'kelm_dataset_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_external_data = pd.read_csv(X_external_data_name, header=0, index_col=0, delimiter=',')
X_external = np.array(X_external_data)
y_external = np.array(dataset_external['label'])

# Normalize the external dataset
X_external_normalized = scaler.transform(X_external)

# Predict for the external dataset
predicted_classes_ext = best_knn_model.predict(X_external_normalized)

# Calculate metrics for the external dataset
accuracy_ext = accuracy_score(y_external, predicted_classes_ext)
sensitivity_ext = recall_score(y_external, predicted_classes_ext)  # Sensitivity (Recall)
TN_ext, FP_ext, FN_ext, TP_ext = confusion_matrix(y_external, predicted_classes_ext).ravel()
specificity_ext = TN_ext / (TN_ext + FP_ext)  # Corrected Specificity calculation
MCC_ext = matthews_corrcoef(y_external, predicted_classes_ext)
auc_ext = roc_auc_score(y_external, predicted_classes_ext)

# Compute the correct balanced accuracy
balanced_accuracy_ext = (sensitivity_ext + specificity_ext) / 2

# Print the adjusted results for the external dataset
print("\nOptimized External Dataset (KELM) Results (k-NN):")
print(f"Accuracy (ACC): {accuracy_ext}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_ext}")
print(f"Sensitivity (Sn): {sensitivity_ext}")
print(f"Specificity (Sp): {specificity_ext}")
print(f"MCC: {MCC_ext}")
print(f"AUC: {auc_ext}")
print(f"True Positives (TP): {TP_ext}")
print(f"False Positives (FP): {FP_ext}")
print(f"True Negatives (TN): {TN_ext}")
print(f"False Negatives (FN): {FN_ext}")

# Print the total positive and total negative
total_positive_ext = np.sum(y_external)
total_negative_ext = len(y_external) - total_positive_ext
print(f"Total Positive: {total_positive_ext}")
print(f"Total Negative: {total_negative_ext}")


Optimized External Dataset (KELM) Results (k-NN):
Accuracy (ACC): 0.8125
Balanced Accuracy (BACC): 0.8125
Sensitivity (Sn): 0.625
Specificity (Sp): 1.0
MCC: 0.6741998624632421
AUC: 0.8125
True Positives (TP): 60
False Positives (FP): 0
True Negatives (TN): 96
False Negatives (FN): 36
Total Positive: 96
Total Negative: 96


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'whole_sample_dataset_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Function to optimize the number of neighbors (k)
def optimize_k(X_train, y_train):
    param_grid = {'n_neighbors': range(1, 21), 'weights': ['uniform', 'distance']}
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, param_grid, scoring='accuracy', cv=5, verbose=1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_

# Train the k-NN model
def train_knn(X_train, y_train, X_test, y_test):
    # Optimize k and weights
    best_params = optimize_k(X_train, y_train)
    k = best_params['n_neighbors']
    weights = best_params['weights']

    knn = KNeighborsClassifier(n_neighbors=k, weights=weights)
    knn.fit(X_train, y_train)
    return knn, k, weights

# Evaluate the model with optimized threshold
def evaluate_model(knn, X_test, y_test):
    predicted_probas = knn.predict_proba(X_test)[:, 1]
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (predicted_probas > threshold).astype(int)
        mcc = matthews_corrcoef(y_test, y_pred)
        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    y_pred_final = (predicted_probas > best_threshold).astype(int)
    accuracy = accuracy_score(y_test, y_pred_final)
    sensitivity = recall_score(y_test, y_pred_final)
    TN, FP, FN, TP = confusion_matrix(y_test, y_pred_final).ravel()
    specificity = TN / (TN + FP)
    mcc = matthews_corrcoef(y_test, y_pred_final)
    auc = roc_auc_score(y_test, y_pred_final)
    balanced_accuracy = (sensitivity + specificity) / 2

    return accuracy, balanced_accuracy, sensitivity, specificity, mcc, auc, best_threshold

# Train the model
knn_model, best_k, best_weights = train_knn(X_train, y_train, X_test, y_test)

# Evaluate on test dataset
acc_test, bacc_test, sn_test, sp_test, mcc_test, auc_test, best_threshold_test = evaluate_model(knn_model, X_test, y_test)

# Print results for test dataset
print("\nTest Dataset Results:")
print(f"Accuracy (ACC): {acc_test}")
print(f"Balanced Accuracy (BACC): {bacc_test}")
print(f"Sensitivity (Sn): {sn_test}")
print(f"Specificity (Sp): {sp_test}")
print(f"MCC: {mcc_test}")
print(f"AUC: {auc_test}")
print(f"Best Threshold: {best_threshold_test}")

# Load and normalize external dataset
dataset_external = pd.read_csv('kelm_dataset.csv', na_filter=False)
X_external_data_name = 'kelm_dataset_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_external_data = pd.read_csv(X_external_data_name, header=0, index_col=0, delimiter=',')
X_external = np.array(X_external_data)
y_external = np.array(dataset_external['label'])

X_external_normalized = scaler.transform(X_external)

# Evaluate on external dataset
acc_ext, bacc_ext, sn_ext, sp_ext, mcc_ext, auc_ext, best_threshold_ext = evaluate_model(knn_model, X_external_normalized, y_external)

# Print results for external dataset
print("\nExternal Dataset Results:")
print(f"Accuracy (ACC): {acc_ext}")
print(f"Balanced Accuracy (BACC): {bacc_ext}")
print(f"Sensitivity (Sn): {sn_ext}")
print(f"Specificity (Sp): {sp_ext}")
print(f"MCC: {mcc_ext}")
print(f"AUC: {auc_ext}")
print(f"Best Threshold: {best_threshold_ext}")

Fitting 5 folds for each of 40 candidates, totalling 200 fits

Test Dataset Results:
Accuracy (ACC): 0.9306569343065694
Balanced Accuracy (BACC): 0.8965541503227735
Sensitivity (Sn): 0.8273381294964028
Specificity (Sp): 0.9657701711491442
MCC: 0.8133915993157494
AUC: 0.8965541503227735
Best Threshold: 0.45000000000000007

External Dataset Results:
Accuracy (ACC): 0.859375
Balanced Accuracy (BACC): 0.859375
Sensitivity (Sn): 0.7604166666666666
Specificity (Sp): 0.9583333333333334
MCC: 0.7332546199393071
AUC: 0.8593749999999999
Best Threshold: 0.20000000000000004


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
dataset = pd.read_excel('Final_non_redundant_sequences.xlsx', na_filter=False)
X_data_name = 'whole_sample_dataset_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)
y = np.array(dataset['label'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build and train k-NN model
def train_knn(X_train, y_train, n_neighbors=5):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(X_train, y_train)
    return knn

# Train the k-NN model
knn_model = train_knn(X_train, y_train)

# Function to optimize threshold based on MCC
def optimize_threshold(y_true, y_pred_probas):
    thresholds = np.arange(0.1, 1.0, 0.05)
    best_mcc = -1
    best_threshold = 0.5

    for threshold in thresholds:
        y_pred = (y_pred_probas >= threshold).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)

        if mcc > best_mcc:
            best_mcc = mcc
            best_threshold = threshold

    return best_threshold, best_mcc

# Evaluate on the test dataset
predicted_probas_test = knn_model.predict_proba(X_test)[:, 1]
best_threshold_test, best_mcc_test = optimize_threshold(y_test, predicted_probas_test)
predicted_classes_test = (predicted_probas_test >= best_threshold_test).astype(int)

# Calculate metrics for the test dataset with optimized threshold
accuracy_test = accuracy_score(y_test, predicted_classes_test)
sensitivity_test = recall_score(y_test, predicted_classes_test)  # Sensitivity (Recall)
TN_test, FP_test, FN_test, TP_test = confusion_matrix(y_test, predicted_classes_test).ravel()
specificity_test = TN_test / (TN_test + FP_test)
MCC_test = matthews_corrcoef(y_test, predicted_classes_test)
auc_test = roc_auc_score(y_test, predicted_probas_test)

# Compute the balanced accuracy
balanced_accuracy_test = (sensitivity_test + specificity_test) / 2

# Print the adjusted results for the test dataset
print("\nOptimized Test Dataset Results:")
print(f"Accuracy (ACC): {accuracy_test}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_test}")
print(f"Sensitivity (Sn): {sensitivity_test}")
print(f"Specificity (Sp): {specificity_test}")
print(f"MCC: {MCC_test}")
print(f"AUC: {auc_test}")
print(f"True Positives (TP): {TP_test}")
print(f"False Positives (FP): {FP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Negatives (FN): {FN_test}")

# Evaluate on the external dataset (KELM)
dataset_external = pd.read_csv('kelm_dataset.csv', na_filter=False)
X_external_data_name = 'kelm_dataset_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_external_data = pd.read_csv(X_external_data_name, header=0, index_col=0, delimiter=',')
X_external = np.array(X_external_data)
y_external = np.array(dataset_external['label'])

# Normalize the external dataset
X_external_normalized = scaler.transform(X_external)

# Predict probabilities for the external dataset
predicted_probas_ext = knn_model.predict_proba(X_external_normalized)[:, 1]
best_threshold_ext, best_mcc_ext = optimize_threshold(y_external, predicted_probas_ext)
predicted_classes_ext = (predicted_probas_ext >= best_threshold_ext).astype(int)

# Calculate metrics for the external dataset with optimized threshold
accuracy_ext = accuracy_score(y_external, predicted_classes_ext)
sensitivity_ext = recall_score(y_external, predicted_classes_ext)
TN_ext, FP_ext, FN_ext, TP_ext = confusion_matrix(y_external, predicted_classes_ext).ravel()
specificity_ext = TN_ext / (TN_ext + FP_ext)
MCC_ext = matthews_corrcoef(y_external, predicted_classes_ext)
auc_ext = roc_auc_score(y_external, predicted_probas_ext)

# Compute the balanced accuracy
balanced_accuracy_ext = (sensitivity_ext + specificity_ext) / 2

# Print the adjusted results for the external dataset
print("\nOptimized External Dataset (KELM) Results:")
print(f"Accuracy (ACC): {accuracy_ext}")
print(f"Balanced Accuracy (BACC): {balanced_accuracy_ext}")
print(f"Sensitivity (Sn): {sensitivity_ext}")
print(f"Specificity (Sp): {specificity_ext}")
print(f"MCC: {MCC_ext}")
print(f"AUC: {auc_ext}")
print(f"True Positives (TP): {TP_ext}")
print(f"False Positives (FP): {FP_ext}")
print(f"True Negatives (TN): {TN_ext}")
print(f"False Negatives (FN): {FN_ext}")



Optimized Test Dataset Results:
Accuracy (ACC): 0.926094890510949
Balanced Accuracy (BACC): 0.886374030360064
Sensitivity (Sn): 0.8057553956834532
Specificity (Sp): 0.9669926650366748
MCC: 0.8001506427101813
AUC: 0.9514432463808904
True Positives (TP): 224
False Positives (FP): 27
True Negatives (TN): 791
False Negatives (FN): 54

Optimized External Dataset (KELM) Results:
Accuracy (ACC): 0.8385416666666666
Balanced Accuracy (BACC): 0.8385416666666666
Sensitivity (Sn): 0.6979166666666666
Specificity (Sp): 0.9791666666666666
MCC: 0.7055637431113225
AUC: 0.8685438368055556
True Positives (TP): 67
False Positives (FP): 2
True Negatives (TN): 94
False Negatives (FN): 29
