In [1]:

# UTILITIES
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# TORCH MODULES FOR METRICS COMPUTATION :
import torch
from torch.utils.data import Dataset
from torch import nn
from torch.utils.data import random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau


In [2]:
#import pandas as pd
from Bio import SeqIO

In [3]:
class config:
    """Namespace of notebook-wide settings, read as plain class attributes."""

    # FASTA file that the embedding cell parses with SeqIO.
    train_sequences_path = "/Train/1433_all_Interpro.fasta"

    # Not referenced in the visible cells; presumably hyper-parameters for a
    # later training stage -- TODO confirm against the rest of the notebook.
    num_labels = 500
    n_epochs = 5
    batch_size = 128
    lr = 1e-3

    # Use the GPU when CUDA is available, otherwise fall back to the CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
from transformers import T5Tokenizer, T5EncoderModel
import re
# Tokenizer and encoder are both loaded from the same "Rostlab/t5" checkpoint;
# the encoder is then moved to the device chosen in config.
tokenizer = T5Tokenizer.from_pretrained(
    "Rostlab/t5",
    do_lower_case=False,
)
model = T5EncoderModel.from_pretrained("Rostlab/t5")
model = model.to(config.device)

def get_bert_embedding(
    sequence : str,
    len_seq_limit : int
):
    """Embed a single sequence with the module-level T5 encoder.

    The residues are separated by single spaces, tokenized (truncated and
    padded to ``len_seq_limit`` tokens), and passed through ``model``. The
    last-layer hidden state at token position 0 is returned as the embedding.

    Parameters
    ----------
    sequence : str
        Residue string. Any object whose ``str()`` is the residue string
        (for example a ``Bio.Seq.Seq``) is accepted as well.
    len_seq_limit : int
        Token length used for both truncation and ``max_length`` padding.

    Returns
    -------
    numpy.ndarray
        1-D array of length 1024 (enforced by the assertion below).

    Raises
    ------
    AssertionError
        If the extracted vector does not have 1024 entries.
    """
    # str() lets plain strings and Seq-like objects take the same path.
    sequence_w_spaces = ' '.join(str(sequence))
    encoded_input = tokenizer(
        sequence_w_spaces,
        truncation=True,
        max_length=len_seq_limit,
        padding='max_length',
        return_tensors='pt').to(config.device)
    # Inference only: without no_grad every call records an autograd graph
    # that is never used, which wastes memory on each forward pass.
    with torch.no_grad():
        output = model(**encoded_input)
    # NOTE(review): index 0 selects the first token of the (single) sequence.
    # This looks like BERT-style CLS pooling; a T5 tokenizer presumably does
    # not prepend such a token, so this may be the first residue's state --
    # confirm this is the intended per-sequence representation.
    output_hidden = output['last_hidden_state'][:,0][0].detach().cpu().numpy()
    assert len(output_hidden)==1024
    return output_hidden

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  return self.fget.__get__(instance, owner)()


In [5]:
# Show which device config selected (the model and tokenized inputs are moved to it).
print(config.device)

cuda


In [6]:
### COLLECTING FOR TRAIN SAMPLES :
# Embed every record of the training FASTA file and persist ids + vectors as
# .npy files, re-saving periodically so a crash does not lose all progress.

CHECKPOINT_EVERY = 100  # number of records between intermediate saves


def _save_embeddings(ids, vectors):
    """Write the ids and embedding vectors collected so far to the two .npy files."""
    np.save('1433_all_Interpro_t5_train_ids.npy',np.array(ids))
    np.save('1433_all_Interpro_t5_train_embeddings.npy',np.array(vectors))


fasta_train = SeqIO.parse(config.train_sequences_path, "fasta")
ids_list = []
embed_vects_list = []
t0 = time.time()
for n_done, item in enumerate(tqdm(fasta_train), start=1):
    ids_list.append(item.id)
    # Pass a plain string, matching the `sequence : str` annotation.
    embed_vects_list.append(
        get_bert_embedding(sequence = str(item.seq), len_seq_limit = 1200))
    if n_done % CHECKPOINT_EVERY == 0:
        _save_embeddings(ids_list, embed_vects_list)

# Final save covers the records after the last full checkpoint interval.
_save_embeddings(ids_list, embed_vects_list)
# Built once from the complete lists: previously this frame was rebuilt at
# every checkpoint, was never used inside the loop, and ended up missing the
# trailing records (or undefined when fewer than 100 records were read).
df_res = pd.DataFrame(data={"id" : ids_list, "embed_vect" : embed_vects_list})
print('Total Elapsed Time:',time.time()-t0)

16027it [1:13:09,  3.65it/s]

Total Elapsed Time: 4389.590974569321



