In [1]:
from transformers import AutoTokenizer, AutoModel
from Bio import SeqIO
import torch
from torch.utils.data import DataLoader, TensorDataset
import random
import numpy as np
import torch.nn as nn
# https://huggingface.co/blog/AmelieSchreiber/esmbind
# the minimum for the ESM2 is 650M if we want better performance than ESM1b with 650M as well.
import pandas as pd
from pathlib import Path
from itertools import islice

In [2]:
dataset = TensorDataset(torch.arange(40, dtype=torch.float32).view(10, 4), torch.tensor([i for i in range(10)]))

In [3]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


In [14]:
torch.arange(40).shape

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39])

In [36]:
a = nn.Conv1d(1, 10, 2)
g = torch.Generator()
g.manual_seed(0)
dl = DataLoader(dataset, batch_size=4, worker_init_fn=seed_worker, generator=g, shuffle=True, num_workers=2)
for i in range(2):
    print("Epoch", i)
    for batch, label in dl:
       print(batch.unsqueeze(1).shape)
       print(batch.unsqueeze(1))
       o = a(batch.unsqueeze(1))
       print(o.shape)
       print(o)

Epoch 0
torch.Size([4, 1, 4])
tensor([[[12., 13., 14., 15.]],

        [[28., 29., 30., 31.]],

        [[20., 21., 22., 23.]],

        [[ 8.,  9., 10., 11.]]])
torch.Size([4, 10, 3])
tensor([[[  7.9515,   8.6100,   9.2686],
         [ -5.7440,  -6.2499,  -6.7559],
         [ -2.4015,  -2.5816,  -2.7617],
         [-12.2291, -13.2001, -14.1711],
         [ 10.4888,  11.3591,  12.2294],
         [  3.9718,   4.3172,   4.6625],
         [  1.2479,   1.3680,   1.4881],
         [ -0.7694,  -0.8526,  -0.9357],
         [-12.2041, -13.1510, -14.0978],
         [ -2.9194,  -3.1050,  -3.2906]],

        [[ 18.4879,  19.1464,  19.8049],
         [-13.8394, -14.3453, -14.8513],
         [ -5.2831,  -5.4632,  -5.6433],
         [-27.7649, -28.7359, -29.7069],
         [ 24.4138,  25.2841,  26.1544],
         [  9.4968,   9.8421,  10.1874],
         [  3.1696,   3.2897,   3.4098],
         [ -2.0995,  -2.1826,  -2.2658],
         [-27.3533, -28.3001, -29.2469],
         [ -5.8897,  -6.0753,  -6.

## Prepare the data and the device

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device="cpu"

In [3]:
with open('../data/whole_sequence.fasta', 'r') as f:
    seqs = list(SeqIO.parse(f, 'fasta'))
seq = {s.id:str(s.seq) for s in seqs}

In [4]:
a = iter(seq.items())

In [5]:
u = islice(a, 10)
dict(u)

{'EH1(72)': 'MLLPETRNLLDLMDAATRGGRPRLETLPHAVGRKAVDKMSEDGEADPPEVAEVANGGFAGPASEIRFRRYRPLGEAAGLLPTLIYYHGGGFVIGNIETHDSTCRRLANKSRCQVISIDYRLAPEHPFPAPIDDGIAAFRHIRDNAESFGADAARLAVGGDSAGGAMAAVVCQACRDAGETGPAFQMLIYPATDSSRESASRVAFAEGYFLSKALMDWFWEAYVPEDTDLTDLRLSPLLATDFTGLPPAFVLTAGYDPLRDEGRAYADRLIEAGIKTTYVNYPGTIHGFFSLTRFLSQGLKANDEAAAVMGAHFGT',
 'EH2(71)': 'MGLQKLIVRTLMKLPESWILKLAGGTPVEIDGRTMDPRIQLLAAQGAKAPSMTSMSIEDARKSADEGLALLDAKPRRTVSILSRTIPGPAGDLHVRIYTPAGATGPLPGIVYYHMGGCVIGNLETCNTFCSILADDCRAIVVSVDYRLAPEHKFPAAMDDAVASFDWVSENAAALGIDPTRLGVGGDSAGGWLSAVVCQTRKAEGKTQPKAQLLIYPATDLDAKEGSMQSCAEIYPLTAEIMDWFMQQFLNSPEDAKDLKASPAHSEDLSGLAPALIMTAGFDVLRDQGEAYGNRLRDAGVPVTYRCYDSLSHAYTAFSGAVPAARQACEEIARDMARALG',
 'EH3(69)': 'MPDTTSLNIADDVRMDPRLKAMLAAFPMMEQQTFQTREEQVANANTPEATAAREQLKMMMDMMDSEEFAPSDNLDISTREFTSSPDGNAIKIQFIRPKGKQKVPCVYYIHGGGMMIMSAFYGNYRAWGKMIANNGVAVAMVDFRNCLSPSSAPEVAPFPAGLNDCVSGLKWVSENADELSIDKNKIIIAGESGGGNLTLATGLKLKQDGNIDLVKGLYALCPYIAGKWPQDRFPSSSENNGIMIELHNNQGALAYGIEQLEAENPLAWPSFASAEDMQGLPPTVINVNECDPLRDEGID

## Initialize the tokenizer and the models

low_cpu_mem_usage: when loading try not to use more memory.

In [6]:
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", low_mem_usage=True)
tok = tokenizer(list(seq.values())[:2], padding=True, truncation=True, return_tensors="pt", is_split_into_words=False)
tok = tok.to(device)

In [14]:
model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D", add_pooling_layer=False, output_hidden_states=True, low_cpu_mem_usage=True, dtype=torch.float32)
model.to(device)

TypeError: EsmModel.__init__() got an unexpected keyword argument 'dtype'

In [12]:
model.get_memory_footprint() / 100_000

309.58004

In [22]:
output = model(**tok)
output.last_hidden_state

tensor([[[ 0.1419,  0.5839, -0.0722,  ...,  1.1745, -0.0934, -0.4214],
         [ 0.4127,  0.0940, -0.1655,  ...,  0.8397, -0.2216, -0.2433],
         [ 0.0768, -0.5086, -0.0294,  ...,  0.3882, -0.0305,  0.1067],
         ...,
         [-0.2578,  0.2489,  0.5511,  ...,  0.6463, -0.5266, -0.0942],
         [-0.3510,  0.2589,  0.5853,  ...,  0.4988, -0.5679, -0.1550],
         [-0.3296,  0.2368,  0.2401,  ...,  0.4682, -0.6849, -0.3094]],

        [[ 0.0936,  0.6978, -0.0490,  ...,  1.0340, -0.1707, -0.3038],
         [ 0.3237,  0.4791, -0.1456,  ...,  0.7708, -0.2135, -0.2907],
         [ 0.0065, -0.2744,  0.2615,  ..., -0.0429,  0.2885,  0.1101],
         ...,
         [-0.4722, -0.2125, -0.2723,  ...,  0.9457, -0.3602,  0.2324],
         [-0.0936, -0.4826,  0.0658,  ...,  0.3896, -0.1248, -0.1090],
         [-0.0047,  0.0668,  0.1097,  ...,  0.6548, -0.6016, -0.2547]]],
       grad_fn=<NativeLayerNormBackward0>)

In [42]:
output.last_hidden_state[1].shape

torch.Size([343, 320])

In [35]:
torch.nn.AvgPool1d(2)(output.last_hidden_state[-1]).shape

torch.Size([343, 160])

## Use the attention mask to remove the padding

In [52]:
results = {}
mask = tok["attention_mask"].bool()
for num, x in enumerate(output.last_hidden_state):
    masked_x = x[mask[num]]
    results[num] = masked_x.mean(dim=0).detach().cpu().numpy()
    # detach removes the tensor from the computation graph (the gradient won't be computed)

In [71]:
torch.max(results[0], dim=0)[0].shape

torch.Size([320])

In [103]:
path = "data2.csv"
embeddings = pd.DataFrame(results).T
embeddings.to_csv(path, mode='a', header=not Path(path).exists())

In [21]:
x[tok["attention_mask"].bool()[1]].shape

torch.Size([343, 320])

## Adapt the code to Load large datasets

In [22]:
from datasets import load_dataset, Dataset
from Bio import SeqIO
import pandas as pd
# https://huggingface.co/docs/datasets/loading

There are many ways to load files within datasets -> from local or remote files (in these formats json, csv, text, parquet)  
Since we have a fasta file that is not supported (because it will treat each line as a row so it will double the rows, but in fasta the first line is an id).  
So we can process it in-memory to pandas, generators, dictionaries or list of dictionaries and use Datasets instead of load_dataset.  
The load dataset returns a dataset dict with different splits (train, test, val) as keys and then a dataset object as values.

But we are using a dataset object directl

To load fasta files use from generator beacause it is in-memory and the file might be too large to process.

In [23]:
a = load_dataset("text", data_files="../data/whole_sequence.fasta")
a

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 294
    })
})

In [26]:
def fasta_generator(fasta_file: str="../data/whole_sequence.fasta"):
    with open(fasta_file, 'r') as f:
        seqs = SeqIO.parse(f, 'fasta')
        for seq in seqs:
            yield {"id":seq.id, "seq":str(seq.seq)}

with open("../data/whole_sequence.fasta", 'r') as f:
    seqs = SeqIO.parse(f, 'fasta')
    d = pd.Series({s.id:str(s.seq) for s in seqs}).to_frame()
    d.columns = ["sequences"]

In [27]:
b = Dataset.from_generator(fasta_generator, gen_kwargs={"fasta_file":"../data/whole_sequence.fasta"})
b

Dataset({
    features: ['id', 'seq'],
    num_rows: 147
})

## Process or tokenize

Use map to apply the tokenizer function to the entire dataset
The map will create and add the new columns ('input_ids', 'attention_mask') coming from the tokenizer to the datatset   
but you will have to change its format to torch tensors for the models to read it

In [34]:
dataset = b.map(lambda examples: tokenizer(examples["seq"], return_tensors="np",padding=True, truncation=True), batched=True)
type(dataset["input_ids"])

list

In [36]:
u = dataset.select_columns(["input_ids", "attention_mask"])
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"], device=device)

In [37]:
type(dataset["input_ids"])

torch.Tensor

Now to extract the embeddings use the dataloader from pytorch to create the batches for you  
It will only return the input_ids and the attention mask (the ids are lost, so yoou don't know which sequence is which)

In [40]:
dataloader = DataLoader(dataset, batch_size=4)
for batch in dataloader:
    u = batch
u

{'input_ids': tensor([[ 0, 20, 10,  ...,  1,  1,  1],
         [ 0, 20, 15,  ...,  1,  1,  1],
         [ 0, 20, 15,  ...,  1,  1,  1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [5]:
import BioML.deep.embeddings as emb

In [3]:
data = emb.TokenizeFasta(emb.LLMConfig()).tokenize("../data/whole_sequence.fasta")
embed = emb.ExtractEmbeddings(emb.LLMConfig())
seq_keys = list(data["id"])

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

In [4]:
batch_size = 8

In [8]:
for num, batch in enumerate(DataLoader(data, batch_size=batch_size)):
    batch_seq_keys = seq_keys[num*batch_size:(num+1)*batch_size]
    results = embed.extract(batch_seq_keys, batch)
    #embed.save(results, "../data/embeddings.csv")

## Other ways to create emebeddings

In [82]:
attention_weights = torch.nn.Linear(320, 1)

In [87]:
attention_scores = attention_weights(output.hidden_states[-1])
attention_weights = torch.softmax(attention_scores, -1)

TypeError: 'Tensor' object is not callable

In [89]:
attention_weights.shape

torch.Size([2, 343, 1])

In [29]:
_temp = output.hidden_states[-1].reshape(output.hidden_states[-1].shape[0], -1)
_temp.shape

torch.Size([2, 109760])

In [53]:
_temp[0]

tensor([ 0.1419,  0.5839, -0.0722,  ...,  0.4682, -0.6849, -0.3094],
       grad_fn=<SelectBackward0>)

In [32]:
(0, 2048 - _temp.shape[1])

(0, -107712)

In [66]:
o = torch.nn.functional.pad(_temp, (0, 2048 - _temp.shape[1]))

In [68]:
o[0][:10]

tensor([ 0.1419,  0.5839, -0.0722,  0.3390, -0.1853, -0.0982, -0.9235,  0.1019,
        -0.4527, -0.6959], grad_fn=<SliceBackward0>)

In [50]:
len(set(o[0].detach().numpy()).intersection(_temp[0].detach().numpy()))xx

109670

In [74]:
len(set(output.hidden_states[-1][0][0].detach().numpy()).intersection(_temp[0][:100].detach().numpy()))

100

# Test training using the embeddings

## Regression

In [76]:
import BioML.models.regression as regression

In [80]:
embeddings = pd.read_csv("../data/embeddings.csv", index_col=0)
label = list(range(len(embeddings)))

In [82]:
data = regression.DataParser("../data/embeddings.csv", label)
experiment = regression.PycaretInterface("regression", 200, scaler= "zscore", budget_time=20, best_model=3, 
                                        output_path="regression_training", optimize="RMSE")

regressor = regression.Regressor(test_size=0.2, optimize="RMSE")
training = regression.Trainer(experiment, regressor, 5, 30)


23-02-2024 12:07:16 INFO ------------------------------------------------------------------------------
23-02-2024 12:07:16 INFO PycaretInterface parameters
23-02-2024 12:07:16 INFO Seed: 200
23-02-2024 12:07:16 INFO Budget time: 20
23-02-2024 12:07:16 INFO The number of models to select: 3
23-02-2024 12:07:16 INFO Output path: regression_training
23-02-2024 12:07:16 INFO ----------------Trainer inputs-------------------------
23-02-2024 12:07:16 INFO Number of kfolds: 5
23-02-2024 12:07:16 INFO Number of iterations: 30


Split the data according to sequence similarity

In [83]:
c = regression.split.ClusterSpliter("../data/resultsDB_clu.tsv", 5, random_state=experiment.seed, test_size=0.2)
X_train, X_test = c.train_test_split(data.features)

In [85]:
results, models_dict = training.generate_training_results(X_train, data.label, True,
                                                          test_data=X_test, fold_strategy=c)

2024/02/23 12:18:52 INFO mlflow.tracking.fluent: Experiment with name 'Regression' does not exist. Creating a new experiment.
23-02-2024 12:18:53 INFO --------------------------------------------------------
23-02-2024 12:18:53 INFO Training regression models
23-02-2024 12:18:53 INFO The models used ['lr', 'lasso', 'ridge', 'en', 'lar', 'llar', 'omp', 'br', 'par', 'huber', 'svm', 'knn', 'dt', 'rf', 'et', 'gbr', 'mlp', 'xgboost', 'catboost', 'dummy']
23-02-2024 12:18:53 INFO Time budget is 20 minutes
23-02-2024 12:20:27 INFO Training over: Total runtime 1.565 minutes
23-02-2024 12:20:27 INFO Analyse the best models and plotting them
23-02-2024 12:20:27 INFO Analyse the top 1 model: catboost
23-02-2024 12:24:10 INFO Analyse the top 2 model: br
23-02-2024 12:24:13 INFO Analyse the top 3 model: rf
23-02-2024 12:24:27 INFO --------Stacking the best models--------
23-02-2024 12:24:27 INFO ----------Stacking the best models--------------
23-02-2024 12:27:58 INFO --------Creating an ensemble m