In [7]:
import os
from dotenv import load_dotenv
import pandas as pd
from lib.utils import enzyme_split30_preprocessing, read_h5, apply_embedding, read_fasta, calculate_f1, bootstrap_statistic, round_to_significance

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
import joblib

In [6]:
mode = "test"

In [3]:
load_dotenv()
if mode == "train":
    path_to_non_enzymes = os.getenv("FASTA_NON_ENZYMES", "fasta not found")
    path_to_enzyme_csv = os.getenv("CSV30_ENZYMES", "csv not found")
    path_to_esm2_ne = os.getenv("ESM2_NON_ENZYMES", "esm2 ne not found")
    path_to_esm2 = os.getenv("ESM2_ENZYMES_SPLIT_30", "esm2 not found")
elif mode == "test":
    path_to_non_enzymes = os.getenv("TEST_FASTA_NON_ENZYMES", "fasta not found")
    path_to_enzyme_csv = os.getenv("TEST_CSV_ENZYMES", "csv not found")
    path_to_esm2_ne = os.getenv("TEST_ESM2_NON_ENZYMES", "esm2 ne not found")
    path_to_esm2 = os.getenv("TEST_ESM2_ENZYMES_SPLIT_30", "esm2 not found")

#path_to_enzyme_csv_esm2 = os.getenv("CSV30_ENZYMES_ESM25_APPLIED")
#path_to_non_enzymes_esm2 = os.getenv("NON_ENZYMES_ESM2_APPLIED")

In [4]:
enzymes=pd.read_csv(path_to_enzyme_csv, delimiter=",")
print("RAW ENZYMES SIZE: " + str(enzymes.count()))
enzymes = enzyme_split30_preprocessing(enzymes)
print("PREPROCESSED ENZYMES SIZE: " + str(enzymes.count()))
enzymes.head()

RAW ENZYMES SIZE: Entry        392
EC number    392
Sequence     392
dtype: int64
PREPROCESSED ENZYMES SIZE: ID              334
EC number       334
Sequence        334
Enzyme class    334
dtype: int64


Unnamed: 0,ID,EC number,Sequence,Enzyme class
0,E0VIU9,2.3.2.31,MSILEWFWNILCGMAQYLTFSKNLTNDNLVNIYVKSNVGGTISVNL...,2
1,Q838J7,4.2.1.113,MNIQSIETYQVRLPLKTPFVTSYGRLEEKAFDLFVITDEQGNQGFG...,4
2,B1VB82,2.7.1.177,MAVAQCPASCGELIQGWILGSEKLVSCPVEWYSTVEVTSGSPLTDE...,2
3,R9QMR1,4.2.3.121,MALVSAVPLNSKLCLCRTLFGFSHELKAIHSTVPNLGMCRGGKSIA...,4
4,R9QMW2,4.2.3.121,MALVSAVPLNSKLCLCRTLFGFSHELKAIHSTVPNLGMCRGGKSIA...,4


In [5]:
enzymes = apply_embedding(read_h5(path_to_esm2, False), enzymes)
enzymes.head()

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'esm2 not found', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
non_enzymes = read_fasta(path_to_non_enzymes)
print("NON-ENZYMES SIZE: " + str(non_enzymes.count()))
non_enzymes.head()

In [None]:
import numpy as np
# Split data
enzymes["Label"] = 1
non_enzymes["Label"] = 0

bin = pd.concat([enzymes[["Label", "Embedding"]], non_enzymes[["Label", "Embedding"]]], ignore_index=True)

bin = bin.sample(frac=1, random_state=42).reset_index(drop=True)

X = [value for value in bin["Embedding"]]
y = bin["Label"]
if mode == "train":
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    y_test = np.array(y_test)
elif mode == "test":
    X_test = X
    y_test = y
    y_test = np.array(y_test)

In [None]:
if mode == "train":
    k = 7
    knn_classifier = KNeighborsClassifier(n_neighbors=k)

    # Fit the classifier to the training data
    knn_classifier.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = knn_classifier.predict(X_test) # y_pred are predicted labels for embeddings at index
elif mode == "test":
    knn_classifier = joblib.load('./var/level0_esm2_knn.pkl')
    y_pred = knn_classifier.predict(X_test) # y_pred are predicted labels for embeddings at index


In [None]:
with open(f'./var/{mode}_knn_esm2_y_pred.txt', 'w') as file:
    for label in y_pred:
        file.write(f"{label}\n")
with open(f'./var/{mode}_knn_esm2_y_test.txt', 'w') as file:
    for label in y_test:
        file.write(f"{label}\n")

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
cm = confusion_matrix(y_test, y_pred)
# Define custom colors (e.g., green and purple)
colors = ['yellow', 'purple']

# Create a colormap using custom colors
cmap = mcolors.ListedColormap(colors)

# Plot the confusion matrix heatmap
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.colorbar()
plt.xticks([0, 1], ["Predicted 0", "Predicted 1"])
plt.yticks([0, 1], ["Actual 0", "Actual 1"])

plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')

for i in range(2):
    for j in range(2):
        plt.text(j, i, str(cm[i, j]), ha='center', va='center', color='white', fontsize=16)

plt.show()

In [None]:
initial_f1 = calculate_f1(np.array(y_test), y_pred)
mean_f1, se_f1, ci_95 = bootstrap_statistic(y_test, y_pred, calculate_f1)
rounded_mean_f1, rounded_se_f1 = round_to_significance(mean_f1, se_f1)

In [None]:
print(f"ESM2 KNN:")
print(f"  - Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"  - Initial F1 Score: {initial_f1:.2f}")
print(f"  - MCC: {matthews_corrcoef(y_test, y_pred)}")
print(f"  - Mean F1 ± SE F1: {rounded_mean_f1} ± {rounded_se_f1}")
print(f"  - 95% CI: [{ci_95[0]:.2f}, {ci_95[1]:.2f}]")

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# roc curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred)

# Calculate the AUC (Area Under the ROC Curve)
roc_auc = auc(fpr, tpr)
# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for KNN (k=7)')
plt.legend(loc='lower right')
plt.show()

In [None]:
if mode == "train":
    joblib.dump(knn_classifier, './var/level0_esm2_knn.pkl')