In [1]:
import os
import numpy as np
import pandas as pd
from Bio import SeqIO
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.patches import Patch
import mpl_stylesheet
import re
import gc
# Configure plotting through the project-local mpl_stylesheet helper
# (presentation preset: monospace font, size 20, 'banskt' colours, 300 dpi).
mpl_stylesheet.banskt_presentation(
    fontfamily='mono',
    fontsize=20,
    colors='banskt',
    dpi=300,
)

In [4]:
from transformers import T5Tokenizer, T5EncoderModel, T5Model, T5ForConditionalGeneration
import torch
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

## Download models from: 
## https://github.com/sacdallago/bio_embeddings/blob/develop/bio_embeddings/utilities/defaults.yml

# prottrans_t5_bfd:
#   model_directory: "http://data.bioembeddings.com/public/embeddings/embedding_models/t5/prottrans_t5_bfd.zip"
# prottrans_t5_uniref50:
#   model_directory: "http://data.bioembeddings.com/public/embeddings/embedding_models/t5/prottrans_t5_uniref50.zip"
# prottrans_t5_xl_u50:
#   model_directory: "http://data.bioembeddings.com/public/embeddings/embedding_models/t5/prottrans_t5_xl_u50.zip"
#   half_precision_model_directory: "http://data.bioembeddings.com/public/embeddings/embedding_models/t5/half_prottrans_t5_xl_u50.zip"


  torch.utils._pytree._register_pytree_node(


In [7]:
# Load the ProtT5 checkpoint as a conditional-generation model from the local
# weights directory, then move it to `device`.
# The directory has to contain a config.json next to the weights; the recorded
# OSError below is what this cell raises when it does not.
fullmodel = T5ForConditionalGeneration.from_pretrained(
    "/data/franco/datasets/prot_embedding_weights/prottrans_t5_xl_u50/"
)
fullmodel = fullmodel.to(device)

OSError: Can't load the configuration of '/data/franco/datasets/prot_embedding_weights/prottrans_t5_xl_u50/'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/data/franco/datasets/prot_embedding_weights/prottrans_t5_xl_u50/' is the correct path to a directory containing a config.json file

In [28]:
# Load the same checkpoint as a plain T5Model and move it to `device`.
# The recorded warning below reports that 'lm_head.weight' from the checkpoint
# is not used by this class.
fullmodel_nohead = T5Model.from_pretrained(
    "/data/franco/datasets/prot_embedding_weights/prottrans_t5_xl_u50/"
)
fullmodel_nohead = fullmodel_nohead.to(device)

Some weights of the model checkpoint at /data/franco/datasets/prot_embedding_weights/prottrans_t5_xl_u50/ were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Load the tokenizer that ships with the checkpoint; do_lower_case=False keeps
# the upper-case residue letters as they are.
tokenizer_dir = '/data/franco/datasets/prot_embedding_weights/prottrans_t5_xl_u50'
tokenizer = T5Tokenizer.from_pretrained(tokenizer_dir, do_lower_case=False)


In [5]:
# Load the model
#model = T5EncoderModel.from_pretrained("models/half_prottrans_t5_xl_u50").to(device)

In [6]:
# Display the torch device selected above (CPU in the recorded run).
device

device(type='cpu')

In [7]:
# Half precision is only supported on GPUs; on CPU the model has to stay in
# full precision (not recommended, much slower). The switch is left disabled:
#fullmodel.full() if device=='cpu' else fullmodel.half()
#gc.collect()

# Toy protein sequences that make up the input batch.
raw_sequences = ["PRTEINO", "SEQWENCE"]

# Map the rare/ambiguous residues U, Z, O and B to X, then put a single space
# between consecutive residues.
rare_residues = re.compile(r"[UZOB]")
sequence_examples = []
for raw_sequence in raw_sequences:
    sequence_examples.append(" ".join(rare_residues.sub("X", raw_sequence)))


In [8]:
# Display the pre-processed (X-substituted, space-separated) sequences.
sequence_examples

['P R T E I N X', 'S E Q W E N C E']

In [9]:

# Encode the whole batch in one call; shorter sequences are padded up to the
# longest sequence in the batch.
ids = tokenizer.batch_encode_plus(
    sequence_examples,
    add_special_tokens=True,
    padding="longest",
)

# Turn the token ids and the padding mask into tensors on `device`.
input_ids, attention_mask = (
    torch.tensor(ids[field]).to(device) for field in ('input_ids', 'attention_mask')
)

# Generate embeddings: forward pass without gradient tracking. The encoder
# input ids are passed as the decoder input ids as well.
model_inputs = {
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    "decoder_input_ids": input_ids,
}
with torch.no_grad():
    embedding_repr = fullmodel(**model_inputs)

## A better way to obtain the 'correct' embedding length
# features = [] 
# for seq_num in range(len(embedding)):
#     seq_len = (attention_mask[seq_num] == 1).sum()
#     seq_emd = embedding[seq_num][:seq_len-1]
#     features.append(seq_emd)
    
# # extract residue embeddings for the first ([0,:]) sequence in the batch and remove padded & special tokens ([0,:7]) 
# emb_0 = embedding_repr.last_hidden_state[0,:7] # shape (7 x 1024)
# # same for the second ([1,:]) sequence but taking into account different sequence lengths ([1,:8])
# emb_1 = embedding_repr.last_hidden_state[1,:8] # shape (8 x 1024)

# # if you want to derive a single representation (per-protein embedding) for the whole protein
# emb_0_per_protein = emb_0.mean(dim=0) # shape (1024)

In [24]:
# NOTE(review): the attribute name after `embedding_repr.` was missing in the
# source, which made this cell a SyntaxError. `.loss` is presumed because the
# recorded output is None and the forward pass above was run without labels --
# confirm which field was meant.
print(embedding_repr.loss)

None


In [140]:
import random

# Dedicated, seeded RNG: the masked position -- and with it every tokenization,
# loss and generation result derived from it below -- is the same on each run,
# and the global `random` state is left untouched.
MASK_SEED = 1
mask_rng = random.Random(MASK_SEED)

testsequence = "PROTEINA"
# Replace the rare/ambiguous residues U, Z, O, B by X and space-separate the residues.
input_test = [" ".join(re.sub(r"[UZOB]", "X", testsequence))]
print(input_test)

# Swap one residue token for the sentinel <extra_id_0>. The index is drawn over
# the token list that is actually indexed, so it is in range by construction.
tmp = input_test[0].split()
ix2replace = int(mask_rng.random() * len(tmp))
tmp[ix2replace] = "<extra_id_0>"
new_input_test = [" ".join(tmp)]
print(new_input_test)

# Encode the unmasked and the masked variant with identical settings, printing
# each encoding right after it is produced.
encoded_variants = []
for variant in (input_test, new_input_test):
    encoding = tokenizer.batch_encode_plus(variant, add_special_tokens=True, padding="longest")
    print(encoding)
    encoded_variants.append(encoding)

# ids1: original sequence, ids2: sequence with one residue replaced by the sentinel.
ids1, ids2 = encoded_variants

['P R X T E I N A']
['P <extra_id_0> X T E I N A']
{'input_ids': [[13, 8, 23, 11, 9, 12, 17, 3, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1]]}
{'input_ids': [[13, 127, 23, 11, 9, 12, 17, 3, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [91]:
# Tokenize a hand-written masked sequence by calling the tokenizer directly.
# A single string is passed, so the recorded result holds flat (unbatched) lists.
tokenizer("P R <extra_id_0> T E I N A")

{'input_ids': [13, 8, 127, 11, 9, 12, 17, 3, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [88]:
# Inspect the token -> id mapping of the tokenizer.
# In the recorded output: ids 0-2 are <pad>, </s>, <unk>; ids 3-27 are the
# residue letters (each prefixed with '▁'); from id 28 on come the <extra_id_N>
# sentinels, with N counting down as the id goes up.
tokenizer.get_vocab()

{'<pad>': 0,
 '</s>': 1,
 '<unk>': 2,
 '▁A': 3,
 '▁L': 4,
 '▁G': 5,
 '▁V': 6,
 '▁S': 7,
 '▁R': 8,
 '▁E': 9,
 '▁D': 10,
 '▁T': 11,
 '▁I': 12,
 '▁P': 13,
 '▁K': 14,
 '▁F': 15,
 '▁Q': 16,
 '▁N': 17,
 '▁Y': 18,
 '▁M': 19,
 '▁H': 20,
 '▁W': 21,
 '▁C': 22,
 '▁X': 23,
 '▁B': 24,
 '▁O': 25,
 '▁U': 26,
 '▁Z': 27,
 '<extra_id_99>': 28,
 '<extra_id_98>': 29,
 '<extra_id_97>': 30,
 '<extra_id_96>': 31,
 '<extra_id_95>': 32,
 '<extra_id_94>': 33,
 '<extra_id_93>': 34,
 '<extra_id_92>': 35,
 '<extra_id_91>': 36,
 '<extra_id_90>': 37,
 '<extra_id_89>': 38,
 '<extra_id_88>': 39,
 '<extra_id_87>': 40,
 '<extra_id_86>': 41,
 '<extra_id_85>': 42,
 '<extra_id_84>': 43,
 '<extra_id_83>': 44,
 '<extra_id_82>': 45,
 '<extra_id_81>': 46,
 '<extra_id_80>': 47,
 '<extra_id_79>': 48,
 '<extra_id_78>': 49,
 '<extra_id_77>': 50,
 '<extra_id_76>': 51,
 '<extra_id_75>': 52,
 '<extra_id_74>': 53,
 '<extra_id_73>': 54,
 '<extra_id_72>': 55,
 '<extra_id_71>': 56,
 '<extra_id_70>': 57,
 '<extra_id_69>': 58,
 '<extra_id_

In [141]:

# Build the model inputs from the masked encoding, directly on `device`.
input_ids = torch.tensor(ids2['input_ids'], device=device)
attention_mask = torch.tensor(ids2['attention_mask'], device=device)

# Both models get exactly the same inputs; the encoder input ids double as the
# decoder input ids.
forward_kwargs = dict(
    input_ids=input_ids,
    attention_mask=attention_mask,
    decoder_input_ids=input_ids,
)

# Inference only: no gradient tracking for either forward pass.
with torch.no_grad():
    embfull = fullmodel(**forward_kwargs)            # model with LM head
    embfull_nh = fullmodel_nohead(**forward_kwargs)  # plain T5Model

In [142]:
# Loss of the model when the masked input is used as its own target.
# The value is only read out with .item(), so the forward pass runs under
# torch.no_grad() like the other forward passes in this notebook; this avoids
# building an autograd graph for the whole model.
# NOTE(review): the labels are the masked input itself (sentinel included), not
# a separate sentinel-delimited target sequence -- confirm this is intended.
with torch.no_grad():
    loss = fullmodel(input_ids=input_ids, labels=input_ids).loss
loss.item()

1.3036208152770996

In [143]:
# Let the model generate an output sequence for the masked input, using the
# default generation settings (only input_ids is passed).
outputs = fullmodel.generate(input_ids=input_ids)

In [144]:
# Decode the first (only) generated sequence back to text. Special tokens are
# not stripped, so <pad> and </s> show up in the recorded string.
tokenizer.decode(outputs[0])

'<pad> P S X T E I N A</s>'

In [145]:
# Display the id tensor that was fed to the models, for comparison with the
# generated output.
input_ids

tensor([[ 13, 127,  23,  11,   9,  12,  17,   3,   1]])

In [146]:
# Display the tokenizer output for the masked sequence that `input_ids` was
# built from.
ids2

{'input_ids': [[13, 127, 23, 11, 9, 12, 17, 3, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1]]}