In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import os
from os.path import join
import json
from selfpeptide.utils.constants import MIN_PEPTIDE_LEN, MAX_PEPTIDE_LEN


from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.utils.data import Dataset, DataLoader

2024-02-08 10:58:17.960883: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-08 10:58:18.170089: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-08 10:58:18.209468: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-02-08 10:58:18.209491: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudar

In [17]:
# Hyperparameters stored next to the fine-tuned PeptideESM checkpoint.
# Later cells read config["pretrained_model"] and config["batch_size"].
config_path = "../trained_models/PeptideESM/config.json"
with open(config_path, "r") as config_file:
    config = json.load(config_file)
config

{'accumulate_batches': 1,
 'batch_size': 64,
 'cool_down': 0.8,
 'data_folder': '/home/gvisona/SelfPeptides',
 'early_stopping': False,
 'experiment_group': 'FinetuneESM2',
 'experiment_name': 'ESM_sweep_3_cont',
 'force_restart': False,
 'lr': 3.69224483345e-07,
 'max_updates': 100000,
 'min_frac': 0.1,
 'mlm_fraction': 0.15,
 'momentum': 0.99,
 'nesterov_momentum': True,
 'patience': 10000,
 'peptides_dataframes': ['processed_data/Immunogenicity/Processed_TCell_IEDB_beta_summed.csv',
  'processed_data/Immunogenicity/DHLAP_immunogenicity_data.csv',
  'processed_data/Binding_Affinity/DHLAP_binding_affinity_data.csv',
  'processed_data/Binding_Affinity/HLA_Ligand_Atlas_processed.csv'],
 'pretrained_model': 'facebook/esm2_t12_35M_UR50D',
 'project_folder': '/fast/gvisona/SelfPeptides',
 'ramp_up': 0.1,
 'resume_checkpoint_path': '/lustre/fast/fast/gvisona/SelfPeptides/outputs/FinetuneESM2/ESM_sweep_3/proud-sweep-12/checkpoints/001_checkpoint.pt',
 'seed': 23217,
 'seed2': 93256,
 'test_r

In [15]:

class PeptidesDataset(Dataset):
    """Map-style dataset that wraps an indexable collection of peptides.

    Items are returned exactly as stored; no tokenisation or other
    transformation happens here.
    """

    def __init__(self, peptides):
        """Keep a reference to ``peptides`` (no copy is made)."""
        self.peptides = peptides
        super().__init__()

    def __len__(self):
        """Return how many peptides the wrapped collection holds."""
        n_items = len(self.peptides)
        return n_items

    def __getitem__(self, ix):
        """Return the peptide stored at position ``ix``."""
        return self.peptides[ix]


In [11]:
# Relative paths of the CSV tables the peptides are taken from.
# Only the two immunogenicity tables are enabled; the binding-affinity
# tables are kept below, commented out.
dataframes = [
    "processed_data/Immunogenicity/Processed_TCell_IEDB_beta_summed.csv",
    "processed_data/Immunogenicity/DHLAP_immunogenicity_data.csv",
    # "processed_data/Binding_Affinity/DHLAP_binding_affinity_data.csv",
    # "processed_data/Binding_Affinity/HLA_Ligand_Atlas_processed.csv",
]

In [91]:
# Gather the unique peptides from every selected table, then keep only those
# whose length lies within [MIN_PEPTIDE_LEN, MAX_PEPTIDE_LEN].
unique_peptides = set()
for dname in dataframes:
    dpath = join("..", dname)
    df = pd.read_csv(dpath)
    # dropna: a missing peptide is read as a float NaN, which can neither be
    # sorted together with strings nor measured with len().
    unique_peptides.update(df["Peptide"].dropna().values)

# NOTE: despite its name, `peptides_set` ends up as a sorted *list*; the name is
# kept because later cells read it. Sorting makes the order reproducible.
peptides_set = sorted(
    p for p in unique_peptides if MIN_PEPTIDE_LEN <= len(p) <= MAX_PEPTIDE_LEN
)
n_peptides = len(peptides_set)
print(f"Total number of peptides: {n_peptides}")

Total number of peptides: 18653
Total number of peptides: 17784


In [67]:
# `device` is only used as the map_location of the checkpoint in this cell;
# the model itself is not moved (the .to(device) call below is disabled).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Build the tokenizer and the masked-LM architecture named in the training
# config, then overwrite the pretrained weights with the fine-tuned ones.
tokenizer = AutoTokenizer.from_pretrained(config["pretrained_model"])
model = AutoModelForMaskedLM.from_pretrained(config["pretrained_model"])
# model.to(device)

# The checkpoint is passed directly to load_state_dict, i.e. it is a plain
# state dict. weights_only=True restricts unpickling to tensors and simple
# containers, so a tampered file cannot execute arbitrary code on load.
checkpoint = torch.load(
    "../trained_models/PeptideESM/checkpoints/001_checkpoint.pt",
    map_location=device,
    weights_only=True,
)
model.load_state_dict(checkpoint)
model

EsmForMaskedLM(
  (esm): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(33, 480, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1026, 480, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-11): 12 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=480, out_features=480, bias=True)
              (key): Linear(in_features=480, out_features=480, bias=True)
              (value): Linear(in_features=480, out_features=480, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
              (rotary_embeddings): RotaryEmbedding()
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=480, out_features=480, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((480,), eps=1e-05, elementwise

In [60]:
# The loader is created without shuffling, so batches follow the sorted order
# of `peptides`; drop_last=False keeps the final, possibly smaller, batch.
peptides = sorted(peptides_set)
dset = PeptidesDataset(peptides)
loader = DataLoader(dataset=dset, batch_size=config["batch_size"], drop_last=False)
len(loader)

278

In [71]:
# Two-way lookup between a peptide and its position in the sorted `peptides` list.
idx2peptide = dict(enumerate(peptides))
peptide2idx = {pep: idx for idx, pep in idx2peptide.items()}

In [142]:
# Embed every peptide as the mean of the last-layer hidden states over its
# residue tokens (padding and the <cls>/<eos> special tokens are excluded).
peptide_embeddings = []
model.eval()
# Inference only: without no_grad every forward pass would record an autograd
# graph for all returned hidden states and keep it alive needlessly.
with torch.no_grad():
    for batch in tqdm(loader):
        # Put the inputs wherever the model lives, so the cell works on CPU or GPU.
        encoded_batch = tokenizer(batch, return_tensors="pt", padding=True).to(model.device)
        outputs = model(**encoded_batch, output_hidden_states=True)
        last_hidden = outputs["hidden_states"][-1]

        for j in range(len(batch)):
            a_mask = encoded_batch["attention_mask"][j]
            # The attention mask drops the padding positions; [1:-1] then drops
            # the leading <cls> and the trailing <eos> token.
            sample_embedding = last_hidden[j][a_mask.bool()][1:-1]
            p_embedding = torch.mean(sample_embedding, dim=0)
            peptide_embeddings.append(p_embedding.cpu().numpy())
peptide_embeddings = np.vstack(peptide_embeddings)
# One row per dataset item, in loader order; anything else would misalign the
# rows with any index built from the same peptide list.
assert peptide_embeddings.shape[0] == len(loader.dataset)
peptide_embeddings

0it [00:00, ?it/s]

array([[ 0.32482213, -0.261438  ,  0.06593953, ..., -0.17181304,
        -0.02496444,  0.06054433],
       [ 0.22507007, -0.32995966,  0.0617824 , ..., -0.09029019,
        -0.06718121, -0.04255965],
       [ 0.257103  , -0.256661  ,  0.08057274, ..., -0.03303841,
        -0.05134671,  0.03509901],
       ...,
       [ 0.11365138, -0.0996558 ,  0.1326552 , ...,  0.00160837,
        -0.0227115 ,  0.26673147],
       [-0.04526203, -0.04576126,  0.15663567, ..., -0.07438464,
         0.08082509,  0.20639467],
       [ 0.07167291,  0.03618845,  0.06238944, ...,  0.02734216,
         0.12952381,  0.31710804]], dtype=float32)

In [143]:
# Sanity check: (number of rows, number of columns) of the embedding matrix.
np.shape(peptide_embeddings)

(17784, 480)

In [83]:
# Inspect the first sequence of the most recently tokenized batch to see which
# special/padding tokens the tokenizer adds around the residues.
first_input_ids = encoded_batch["input_ids"][0]
tokenizer.decode(first_input_ids)

'<cls> A A A A A I F V I <eos> <pad> <pad> <pad>'

In [145]:
# Persist the embedding matrix as a .npy file.
embeddings_path = "../processed_data/Peptide_embeddings/FinetunedESM2_imm_peptides_embeddings.npy"
np.save(file=embeddings_path, arr=peptide_embeddings)

In [146]:
# Store the index -> peptide lookup as JSON.
# Note: JSON object keys are always strings, so the integer indices come back
# as str when this file is read again.
with open("../processed_data/Peptide_embeddings/ESM2_idx2peptide.json", "w") as out_file:
    out_file.write(json.dumps(idx2peptide, indent=2))