# Reading in ESM2 Embeddings

In [7]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import auc, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay
from dotenv import load_dotenv
import os
from data_manipulation.reading_util import *

load_dotenv()

True

In [2]:
def read_esm2(path_to_esm2:str, is_enzyme:bool) -> pd.DataFrame:
    """
    :param path_to_esm2: Absolute path to esm2 file
    :return: A dataframe
    """
    with h5py.File(path_to_esm2) as hdf_handle:
        headers = []
        embeddings = []

        for header, emb in hdf_handle.items():
            headers.append(header)
            embeddings.append(np.array(list(emb)))

        df = pd.DataFrame(data={"Entry": headers, "Embedding": embeddings})

        if is_enzyme:
            df["Label"] = 1
        else:
            df["Label"] = -1

        return df

In [3]:
def load_ml_df(path_to_enzyme_esm2:str,path_to_non_enzyme_esm2:str):

    enzymes = read_esm2(path_to_enzyme_esm2, True)
    non_enzymes = read_esm2(path_to_non_enzyme_esm2, False)

    print(len(enzymes))
    print(len(non_enzymes))

    return pd.concat([enzymes, non_enzymes])


In [4]:
esm2_enzymes_30 = os.getenv("ESM2_ENZYMES_SPLIT_30")
esm2_non_enzymes = os.getenv("ESM2_NON_ENZYMES")

ml_df = load_ml_df(path_to_enzyme_esm2=esm2_enzymes_30, path_to_non_enzyme_esm2=esm2_non_enzymes)
ml_df.head()

# TODO: We probably need to reduce the size of our non_enzyme embeddings

9204
39502


Unnamed: 0,Entry,Embedding,Label
0,A0A024RBG1,"[-0.015143169, 0.035552002, -0.02231326, 0.002...",1
1,A0A024SMV2,"[0.059097216, -0.034141198, 0.061426997, 0.006...",1
2,A0A060S684,"[-0.016934631, -0.04490467, 0.0054878886, -0.0...",1
3,A0A075TXZ3,"[-0.06815035, -0.034723807, 0.03341713, -0.057...",1
4,A0A077K8G3,"[-0.03639361, 0.046453245, 0.06140146, -0.0697...",1


# SVM

In [5]:

X = np.array(list(ml_df["Embedding"]))

# creating a training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, ml_df["Label"],random_state=42)

# TODO: enable param 'probability=True' for ROC
clf_svm = SVC(kernel='linear', random_state=42)
clf_svm.fit(X_train, y_train)
# ConfusionMatrixDisplay.from_estimator(clf_svm, np.array(X_test).reshape(-1, 1), np.array(y_test))

accuracy = clf_svm.score(X_test, y_test)

predictions = clf_svm.predict(X_test)
f1 = f1_score(y_test, predictions)
print("Accuracy:", accuracy)
print("F1: ", f1)


Accuracy: 0.934220251293422
F1:  0.8301886792452831


# Creating ROC curve

In [9]:
# Get predicted probabilities for the positive class
y_prob = clf_svm.predict_proba(X_test)[:, 1]  # Use positive class probabilities

fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()


AttributeError: predict_proba is not available when  probability=False