In [3]:
# Reload the autoreload extension and use mode 2 so edited modules are re-imported
# automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

import os

# These are set before `import torch` below so they are already in the environment when
# the CUDA runtime is initialised. Devices are enumerated in PCI bus order and only the
# physical GPU with index 3 is exposed to this process (it then shows up as "cuda:0").
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"


from transformers import T5Tokenizer, T5EncoderModel
import torch
import re

# Use the (single) visible GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


---

## Prost


In [None]:
# Run on the first visible GPU when CUDA is usable, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer for the ProstT5 checkpoint; casing is preserved because upper-case letters
# denote amino acids and lower-case letters denote 3Di structure states.
tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False)

# The encoder weights are deliberately not loaded in this cell (only the tokenizer is
# exercised in the cells that follow). To load them here instead:
#   model = T5EncoderModel.from_pretrained("Rostlab/ProstT5").to(device)
# Half precision is only advisable on GPU; on CPU keep full precision (much slower):
#   model = model.float() if device.type == "cpu" else model.half()

# Example inputs: an upper-case amino-acid sequence ("PRTEINO") and a lower-case
# 3Di structure sequence ("strct").
sequence_examples = ["PRTEINO", "strct"]

# Map the rare/ambiguous amino acids U, Z, O and B to X (the pattern is upper-case only,
# so lower-case 3Di strings pass through unchanged) and put a space between all symbols.
sequence_examples = [" ".join(re.sub(r"[UZOB]", "X", seq)) for seq in sequence_examples]

# Prepend the translation-direction token: "<AA2fold>" for upper-case (amino-acid) input,
# "<fold2AA>" for everything else.
sequence_examples = [
    ("<AA2fold>" if spaced.isupper() else "<fold2AA>") + " " + spaced
    for spaced in sequence_examples
]

In [None]:
# Show the prefixed, space-separated inputs exactly as they will be handed to the tokenizer.
print(sequence_examples)

In [None]:
# Tokenize the whole batch in one call, padding every sequence to the longest one in the
# batch and returning PyTorch tensors. Calling the tokenizer object directly replaces the
# deprecated `batch_encode_plus` helper; the arguments and the returned BatchEncoding
# (with "input_ids" and "attention_mask") are the same.
ids = tokenizer(sequence_examples, add_special_tokens=True, padding=True, return_tensors="pt")
print(ids)

In [None]:
# Round-trip check: turn the token ids back into text (dropping special tokens) and
# strip the spaces that were inserted between the individual symbols.
decoded_sequences = [
    text.replace(" ", "")
    for text in tokenizer.batch_decode(ids["input_ids"], skip_special_tokens=True)
]

print("Decoded sequences:", decoded_sequences)


---

## Prot


In [None]:
# Load the encoder and its tokenizer from the same checkpoint so this cell is self-contained.
# NOTE(review): the ProstT5 cell earlier in this notebook prepends a "<AA2fold>"/"<fold2AA>"
# direction token to every input, whereas the inputs below carry no such prefix although the
# same "Rostlab/ProstT5" checkpoint is loaded. Confirm this is intended -- adding a prefix
# token would also shift the residue slices below by one position.
model = T5EncoderModel.from_pretrained("Rostlab/ProstT5").to(device)
tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False)

# Keep the untouched inputs so the residue counts can be derived from them instead of
# being hard-coded in the slices further down.
raw_sequences = ["PRTEINO", "SEQWENCE"]
seq_lens = [len(seq) for seq in raw_sequences]

# Replace the rare/ambiguous amino acids U, Z, O and B by X and put a space between residues.
sequence_examples = [" ".join(re.sub(r"[UZOB]", "X", seq)) for seq in raw_sequences]

# Tokenize and pad up to the longest sequence in the batch. The tokenizer is called directly
# (instead of the deprecated `batch_encode_plus`) and returns PyTorch tensors right away.
ids = tokenizer(sequence_examples, add_special_tokens=True, padding="longest", return_tensors="pt")
input_ids = ids["input_ids"].to(device)
attention_mask = ids["attention_mask"].to(device)

# Generate embeddings; no gradients are needed for inference.
with torch.no_grad():
    embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)

# Per-residue embeddings of the first sequence: keep only the first `seq_lens[0]` positions,
# which drops padding and the trailing special token (7 x 1024 for this example).
emb_0 = embedding_repr.last_hidden_state[0, : seq_lens[0]]
print(f"Shape of per-residue embedding of first sequences: {emb_0.shape}")
# Same for the second sequence, using its own length (8 x 1024 for this example).
emb_1 = embedding_repr.last_hidden_state[1, : seq_lens[1]]

# Per-protein embedding: average the per-residue vectors over the sequence dimension.
emb_0_per_protein = emb_0.mean(dim=0)

print(f"Shape of per-protein embedding of first sequences: {emb_0_per_protein.shape}")


In [None]:
# Display the full encoder output returned by the forward pass above; its
# `last_hidden_state` holds the per-token embeddings that were sliced in the previous cell.
embedding_repr