# Reading in ESM2 Embeddings

In [16]:
import h5py
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import os
from data_manipulation.reading_util import *

load_dotenv()

True

In [17]:
def read_esm2(path_to_esm2:str, is_enzyme:bool) -> pd.DataFrame:
    """
    Read ESM2 embeddings from an HDF5 file into a labelled dataframe.

    Every top-level item of the file becomes one row: the item's name is
    stored in "Entry" and its contents, read as a numpy array, in "Embedding".

    :param path_to_esm2: Absolute path to esm2 file
    :param is_enzyme: If True every row gets the label 1, otherwise -1
    :return: A dataframe with the columns "Entry", "Embedding" and "Label"
    """
    headers = []
    embeddings = []

    # Open read-only explicitly: the default mode differs between h5py
    # versions, and this function never needs to write to the file.
    with h5py.File(path_to_esm2, "r") as hdf_handle:
        for header, emb in hdf_handle.items():
            headers.append(header)
            # Read the whole dataset in a single call instead of iterating it
            # element by element through Python.
            # NOTE(review): assumes every top-level item is a dataset (not a
            # nested group) -- confirm against the files that are used.
            embeddings.append(np.asarray(emb[...]))

    df = pd.DataFrame(data={"Entry": headers, "Embedding": embeddings})
    df["Label"] = 1 if is_enzyme else -1

    return df

In [18]:

def load_ml_df(path_to_enzyme_esm2:str,path_to_non_enzyme_esm2:str):
    """
    Build one dataframe from an enzyme and a non-enzyme ESM2 file.

    Both files are read with read_esm2 (enzymes flagged True, non-enzymes
    flagged False), each resulting frame is passed through
    filter_unwanted_seqs with the same flag, the two remaining row counts are
    printed and the frames are concatenated with the enzymes first.

    :param path_to_enzyme_esm2: Path to the esm2 file holding the enzymes
    :param path_to_non_enzyme_esm2: Path to the esm2 file holding the non-enzymes
    :return: The concatenation of the filtered enzyme and non-enzyme frames
    """
    enzyme_flags = (True, False)
    sources = (path_to_enzyme_esm2, path_to_non_enzyme_esm2)

    # Read both files first, then filter, so the calls happen in the order
    # read, read, filter, filter.
    raw_frames = [read_esm2(src, flag) for src, flag in zip(sources, enzyme_flags)]
    kept_frames = [
        filter_unwanted_seqs(frame, flag)
        for frame, flag in zip(raw_frames, enzyme_flags)
    ]

    # Report how many rows remain: enzymes first, then non-enzymes.
    for frame in kept_frames:
        print(len(frame))

    return pd.concat(kept_frames)


In [None]:
# The paths of the two embedding files are taken from the environment
# (load_dotenv() has been called at the top of the notebook).
esm2_enzymes_30 = os.getenv("ESM2_ENZYMES_SPLIT_30")
esm2_non_enzymes_30 = os.getenv("ESM2_NON_ENZYMES_SPLIT_30")

# os.getenv returns None for an unset variable; fail here with a message that
# names the variable instead of passing None on as a file path.
_unset_vars = [
    var_name
    for var_name, var_value in (
        ("ESM2_ENZYMES_SPLIT_30", esm2_enzymes_30),
        ("ESM2_NON_ENZYMES_SPLIT_30", esm2_non_enzymes_30),
    )
    if not var_value
]
if _unset_vars:
    raise EnvironmentError(
        "Missing environment variable(s): " + ", ".join(_unset_vars)
    )

ml_df = load_ml_df(path_to_enzyme_esm2=esm2_enzymes_30, path_to_non_enzyme_esm2=esm2_non_enzymes_30)
ml_df.head()
