In [24]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch

# Set device to GPU with index 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Using device:", device)
print(torch.cuda.get_device_name(device))

Using device: cuda
NVIDIA A100-PCIE-40GB


In [25]:
pip install transformers torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [26]:
import torch
from transformers import AutoTokenizer, AutoModel, T5EncoderModel
import re
import numpy as np
from tqdm.auto import tqdm

In [27]:
def get_peptide_embeddings(sequences, model_name, device):
    """
    Generates peptide embeddings using a pretrained transformer model.

    Args:
        sequences (list): A list of peptide sequences (e.g., ['GAVW', 'LLNQELLLNPTHQIYPV']).
        model_name (str): The name of the Hugging Face model to use.
        device (str): 'cuda' or 'cpu'.

    Returns:
        np.array: A NumPy array of peptide embeddings.
    """
    
    # 1. Load the tokenizer and model
    if 't5' in model_name.lower():
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = T5EncoderModel.from_pretrained(model_name).to(device)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)
    
    print(model.eval())
    all_embeddings = []

    # 2. Pre-process sequences
    # Add a space between each amino acid for ProtBERT and ProtT5
    sequences_processed = [" ".join(list(re.sub(r"[UZOB]", "X", seq))) for seq in sequences]
    
    for seq in tqdm(sequences_processed, desc=f"Generating embeddings with {model_name}"):
        # 3. Tokenize and encode
        ids = tokenizer.batch_encode_plus([seq], add_special_tokens=True, padding=True, return_tensors='pt').to(device)
        
        with torch.no_grad():
            outputs = model(**ids)
        
        # 4. Get the last hidden state (the main embedding layer)
        # This gives you a per-residue embedding for each amino acid in the sequence.
        last_hidden_state = outputs.last_hidden_state
        
        # 5. Pool the per-residue embeddings to get a single peptide embedding
        # We take the mean of the embeddings for all amino acids in the peptide.
        # This is a common and effective pooling strategy.
        embedding = last_hidden_state[0, 1:-1].mean(dim=0).cpu().numpy()
        all_embeddings.append(embedding)

    return np.array(all_embeddings)


In [28]:
# ABCP_tokenisation
with open("ABCP_CDHIT90.fasta") as abcp:
    abcp_lines = [line.strip("\n") for line in abcp.readlines()[1::2]]

abcp_lines

['KWLRRVWRWWR',
 'KRLRRVWRRWR',
 'FLGMIPGLIGGLISAFK',
 'FLSLIPKLVKKIIKAFK',
 'FLGMIPKLIKKLIKAFK',
 'RGDLLRHVVKILEKYL',
 'RGDLLRHVVKILSKYL',
 'GLWSKIKEVGKEAAKAAAKAAGKAALGAVSEAV',
 'KVKVKVKVPPTKVKVKVK',
 'GLFGKLIKKFGRKAISYAVKKARGKH',
 'KWKSFAKTFKSAKKTVAHTALKAISS',
 'KWKSFLKTFKSAKKTVAHTAAKAISS',
 'KWKSFLKTFKSAKKTVLHTALKAISS',
 'KWKSFLKTFKSLKKTVLHTLLKAISS',
 'KQLIRFLKRLDRNGGGKLLLKLLKKLLKLLKKK',
 'THRPPMWSPVWPGGGKLLLKLLKKLLKLLKKK',
 'KAAKKWAKAAKKWAKAWKKAA',
 'KAAKKWAKAWKKAAKAWKKAA',
 'KAAKKAWKAWKKAAKAAWKKAA',
 'KAAKKAWKAAKKAAKWWKKAA',
 'KAAKKAWKWAKKAAKWAKKAA',
 'KWWKKAAKAAKKAAKAAKKWA',
 'KAAKKAWKAAKKAWKAAKKAA',
 'AWKKWAKAWKWAKAKWWAKAA',
 'AAWKWAWAKKWAKAKKWAKAA',
 'AAKKWAKAKWAKAKKWAKAA',
 'GRRKRKWLRRIGKGVKIIGGAALDHL',
 'RWGKWFKKATHVGKHVGKAALTAYL',
 'GWRTLLKKAEVKTVGKLALKHYL',
 'YHWYGYTPQNVIGGGKLLLKLLKKLLKLLKKK',
 'KWKLFKKIGAVLKVL',
 'ILPILSLIGGLLGK',
 'RAGLQFPVGRLLRRLLRRLLR',
 'VRRFPWWWPFLRR',
 'KKKFPWWWPFKKKCKKKFPWWWPFKKKC',
 'MRKEFHNVLSSGQLLADKRPARDYNRK',
 'MWKWFHNVLSSWQLLADKRPARDYNRK',
 'M

In [29]:
# ABCP_tokenisation
with open("Non-ABCP_CDHIT.fasta") as non_abcp:
    non_abcp_lines = [line.strip("\n") for line in non_abcp.readlines()[1::2]]

non_abcp_lines = [l for l in non_abcp_lines if len(l) <51]
non_abcp_lines

['AVQKIPVSSLSQEIDYTLEYGWHTNMPRLETRNYLDVFGHPTSP',
 'NGRKISLDLRAPLYKKIIKKLLES',
 'WHAQLSLNLAMLGSLTIVVAHHMYSMPPYPYLAIDYG',
 'PHLVIPEIEAIATQTLVEMEAEGLN',
 'GLPVCGETCFGGTCNTPGCSCTWPICTRD',
 'DPFFKVPVNKLAAAVSNFGYDLYRVRSSTSPTTN',
 'ALPKKLKYLNLFNDGFNYMGVV',
 'ISIALSLGCRVFTTVGSAEK',
 'RIWYDTLCDKT',
 'VTMCMRCQEPFNSITKRR',
 'DSHEERHHGRHGHHKYGRKFHEKHHSHRGYRSNYLYDN',
 'CFSSLVVDETYIPSPFSADKFI',
 'CLGEVENSQLFVGILGSR',
 'TINPNEVVINDNISDNLKRQINMVTETITVR',
 'NLGRYGSRGQQRSIALALKIGEAGLMRRRSGEAPVLLLDDVLSE',
 'DRVMQELTEYELVPEAWGGDTIFAPISAKFGEGL',
 'KVEASKELWGKIVGTIDTAEKFEAKRLTLARREWARMRAS',
 'HILTSKTRKNKRNLRKGGIVAASDHKNISCLIPYKMPKMKTHRGSAKR',
 'YVAAHHIIKAHAQAWHSYNNTWRSKQHGLVGISLNCDW',
 'HFGREHFPWEKTDKAQLLREAAGLKMRRLFT',
 'LAGTNRWTQYGALMVADA',
 'TFKELVYETVKVPGCAHQADSLYTYPVATECHCGKCDSDSTD',
 'FFQMMDPNVRYVAVVLSVTAGIIASQALITGAFTMVSEATGL',
 'DYKLALKAIEGGADKIRINPGNI',
 'VALQALTQRGIPCYFIHGNRDF',
 'RARNLVFHVNCFCC',
 'NAVMGQGPLQFYDKVPTKFAGYGEAAFKAGGYRVV',
 'FLSLIPHIVSGVAALAKHL',
 'QEPHRHSIFTPQTNPRADLEKN',
 'GHFSSKCR

In [30]:
# Define models
model_names = {
        'ProtBERT': 'Rostlab/prot_bert_bfd',
        'ESM-2': 'facebook/esm2_t33_650M_UR50D'
    }

# running embeddings for ABCPs
abcp_embedding_list = []
# --- Generate embeddings for each model ---
for model_alias, model_id in model_names.items():
        print(f"\n--- Generating embeddings with {model_alias} ---")
        embeddings = get_peptide_embeddings(abcp_lines, model_id, device)
        abcp_embedding_list.append(embeddings)
        print(f"Shape of embeddings for {model_alias}: {embeddings.shape}")
        print(f"Example embedding for the first peptide ('{abcp_lines[0]}'):")
        print(embeddings[0][:5]) # Print the first 5 dimensions as a sampl


--- Generating embeddings with ProtBERT ---
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30, 1024, padding_idx=0)
    (position_embeddings): Embedding(40000, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-29): 30 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=

Generating embeddings with Rostlab/prot_bert_bfd: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 462/462 [00:05<00:00, 77.68it/s]


Shape of embeddings for ProtBERT: (462, 1024)
Example embedding for the first peptide ('KWLRRVWRWWR'):
[ 0.06143294 -0.09303566  0.01488026  0.05878055 -0.10568105]

--- Generating embeddings with ESM-2 ---


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 1280, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-32): 33 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=1280, out_features=1280, bias=True)
            (key): Linear(in_features=1280, out_features=1280, bias=True)
            (value): Linear(in_features=1280, out_features=1280, bias=True)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=1280, out_features=1280, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
          (dense): Linear(in_features=1280, out_features=5120, bias=True)
        )
        (output): EsmOut

Generating embeddings with facebook/esm2_t33_650M_UR50D: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 462/462 [00:11<00:00, 38.76it/s]

Shape of embeddings for ESM-2: (462, 1280)
Example embedding for the first peptide ('KWLRRVWRWWR'):
[ 0.13043459  0.0113699   0.06345813  0.01201109 -0.1272963 ]





In [31]:
# Define models
model_names = {
        'ProtBERT': 'Rostlab/prot_bert_bfd',
        'ESM-2': 'facebook/esm2_t33_650M_UR50D'
    }
# running embeddings for ABCPs
non_abcp_embedding_list = []
# --- Generate embeddings for each model ---
for model_alias, model_id in model_names.items():
        print(f"\n--- Generating embeddings with {model_alias} ---")
        embeddings = get_peptide_embeddings(non_abcp_lines, model_id, device)
        non_abcp_embedding_list.append(embeddings)
        print(f"Shape of embeddings for {model_alias}: {embeddings.shape}")
        print(f"Example embedding for the first peptide ('{non_abcp_lines[0]}'):")
        print(embeddings[0][:5]) # Print the first 5 dimensions as a sampl


--- Generating embeddings with ProtBERT ---
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30, 1024, padding_idx=0)
    (position_embeddings): Embedding(40000, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-29): 30 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=

Generating embeddings with Rostlab/prot_bert_bfd: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 851/851 [00:10<00:00, 78.23it/s]


Shape of embeddings for ProtBERT: (851, 1024)
Example embedding for the first peptide ('AVQKIPVSSLSQEIDYTLEYGWHTNMPRLETRNYLDVFGHPTSP'):
[-0.06764774 -0.02303542 -0.0089743   0.02488636  0.03943023]

--- Generating embeddings with ESM-2 ---


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 1280, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-32): 33 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=1280, out_features=1280, bias=True)
            (key): Linear(in_features=1280, out_features=1280, bias=True)
            (value): Linear(in_features=1280, out_features=1280, bias=True)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=1280, out_features=1280, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
          (dense): Linear(in_features=1280, out_features=5120, bias=True)
        )
        (output): EsmOut

Generating embeddings with facebook/esm2_t33_650M_UR50D: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 851/851 [00:23<00:00, 36.87it/s]

Shape of embeddings for ESM-2: (851, 1280)
Example embedding for the first peptide ('AVQKIPVSSLSQEIDYTLEYGWHTNMPRLETRNYLDVFGHPTSP'):
[ 0.06806633  0.04276744  0.00315045  0.01909297 -0.07001569]





In [32]:
abcp_embed_ProtBERT = abcp_embedding_list[0]
abcp_embed_ESM2 = abcp_embedding_list[1]


In [33]:
non_abcp_embed_ProtBERT = non_abcp_embedding_list[0]
non_abcp_embed_ESM2 = non_abcp_embedding_list[1]


In [34]:
import pandas as pd

In [35]:
df_abcp_ProtBERT = pd.DataFrame(abcp_embed_ProtBERT)
df_abcp_ESM2 = pd.DataFrame(abcp_embed_ESM2)

df_non_abcp_ProtBERT = pd.DataFrame(non_abcp_embed_ProtBERT)
df_non_abcp_ESM2 = pd.DataFrame(non_abcp_embed_ESM2)

In [36]:
df_abcp_ProtBERT["Target"] = 1
df_abcp_ESM2["Target"] = 1

df_non_abcp_ProtBERT["Target"] = 0
df_non_abcp_ESM2["Target"] = 0

In [37]:
df_ProtBERT = pd.concat([df_abcp_ProtBERT,df_non_abcp_ProtBERT])
df_ESM2 = pd.concat([df_abcp_ESM2,df_non_abcp_ESM2])

In [38]:
df_ProtBERT.to_csv("ProtBERT_embeddings.csv",index=False)
df_ESM2.to_csv("ESM2_embeddings.csv",index=False)