In [None]:
!pip install git+https://github.com/Profluent-AI/E1.git

In [None]:
import torch

if torch.cuda.is_available():
    cuda_capabilities = torch.cuda.get_device_capability(0)
    if cuda_capabilities[0] >= 8:
        print("CUDA 8.0 or higher detected; installing flash-attention")
        !pip install flash-attn --no-build-isolation
    else:
        print("CUDA capability lower than 8.0; will not be using flash attention")
else:
    print("CUDA not available")

### Embedding Adenylate Kinase variants

In this notebook, we will use the E1 model in single-sequence mode to compute embeddings of variants of a protein called Adenylate Kinase. Each variant of this protein has a specific structural feature called lid type, and we will see if the protein embeddings can be used to cluster the variants based on their lid type. The data for this is taken from the repository https://github.com/keiserlab/face-plm. We will use the `E1Predictor` tool to efficiently compute and return these embeddings.

NOTE: You can also use `python3 -m E1.tools.predict` to compute embeddings for a list of sequences stored in a FASTA file. Use `torchrun --nproc-per-node=gpu -m E1.tools.predict` to run it on multiple GPUs. See the `E1.tools.predict` module for more details.

In [None]:
import polars as pl

# CSV of Adenylate Kinase variants hosted in the face-plm repository.
ADK_DATASET_URL = (
    "https://raw.githubusercontent.com/keiserlab/face-plm/refs/heads/main/data/adk_evo-scale_dataset.csv"
)

data = pl.read_csv(ADK_DATASET_URL)
data.head()

In [None]:
import E1.dist as dist
from E1.modeling import E1ForMaskedLM
from E1.predictor import E1Predictor

# Checkpoint identifier handed to `E1ForMaskedLM.from_pretrained` when the model is loaded.
model_name = "Profluent-Bio/E1-300m"

In [None]:
# Load the pretrained checkpoint as float32, move it to the device reported by
# `dist.get_device()`, and put it in evaluation mode.
model = E1ForMaskedLM.from_pretrained(model_name, dtype=torch.float)
model = model.to(dist.get_device())
model = model.eval()

# Outputs requested from the predictor:
#   token_embeddings      -- shape (Sequence Length, Embedding Dim), one vector per position.
#   mean_token_embeddings -- shape (Embedding Dim, ), the mean of token_embeddings over all positions.
saved_fields = ["token_embeddings", "mean_token_embeddings"]

predictor = E1Predictor(
    model=model,
    max_batch_tokens=16384,  # Change to 4096 on Colab T4 GPU
    fields_to_save=saved_fields,
)

In [None]:
# One record per predicted sequence. The list gets its own name so that
# `sequence_embeddings` only ever refers to the final joined DataFrame.
embedding_records = []

for prediction in predictor.predict(
    sequences=data["sequence"].to_list(), sequence_ids=data["org_name"].to_list(), context_seqs=None
):
    # Note: predictions may not come back in the same order as the input sequences due to
    # batching by length. prediction["id"] carries the sequence id (here, the org_name) and
    # is what ties each prediction back to its input row.
    #
    # prediction["token_embeddings"] (Sequence Length, Embedding Dim) is also available for
    # per-position analyses; only the mean embedding is needed in this notebook.
    embedding_records.append(
        {
            "org_name": prediction["id"],
            # mean_token_embeddings has shape (Embedding Dim, ); move it to host memory as a NumPy array.
            "sequence_embedding": prediction["mean_token_embeddings"].cpu().numpy(),
        }
    )

# Attach each variant's lid type to its embedding via the shared org_name key.
sequence_embeddings = pl.DataFrame(embedding_records).join(
    data.select(["org_name", "lid_type"]), on="org_name", how="inner"
)

We will use UMAP to reduce the dimensionality of the embeddings to 2D and plot the results.

In [None]:
!pip install umap-learn matplotlib seaborn

In [None]:
import numpy as np
from umap import UMAP

# Fixed random_state keeps the 2D layout reproducible across runs.
umap = UMAP(n_components=2, random_state=314)

# Stack the per-sequence embeddings into a single array, one row per variant,
# and pull out the matching lid-type labels in the same row order.
embedding_column = sequence_embeddings["sequence_embedding"]
sequence_embeddings_array = np.array(embedding_column.to_list())
lid_types = sequence_embeddings["lid_type"].to_list()

# Project the embeddings down to two dimensions.
sequence_embeddings_2d = umap.fit_transform(sequence_embeddings_array)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw on an explicit Axes so the figure can carry a title and axis labels and
# remains understandable when the notebook is only skimmed.
fig, ax = plt.subplots()
sns.scatterplot(x=sequence_embeddings_2d[:, 0], y=sequence_embeddings_2d[:, 1], hue=lid_types, ax=ax)
ax.set(
    title="UMAP projection of E1 embeddings of Adenylate Kinase variants",
    xlabel="UMAP 1",
    ylabel="UMAP 2",
)
# Points are coloured by lid type; label the legend accordingly.
ax.legend(title="Lid type")
plt.show()