In [27]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from Bio import SeqIO
import torch
from torch.utils.data import DataLoader, TensorDataset
import random
import numpy as np
import torch.nn as nn
# https://huggingface.co/blog/AmelieSchreiber/esmbind
# the minimum for the ESM2 is 650M if we want better performance than ESM1b with 650M as well.
import pandas as pd
from pathlib import Path
from itertools import islice
import torch.nn.functional as F
import bitsandbytes as bnb
from bitsandbytes.nn import Linear8bitLt

In [4]:
dataset = TensorDataset(torch.arange(40, dtype=torch.float32).view(10, 4), torch.tensor([i for i in range(10)]))

In [5]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


In [6]:
torch.arange(40).shape

torch.Size([40])

In [7]:
a = nn.Conv1d(1, 10, 2)
g = torch.Generator()
g.manual_seed(0)
dl = DataLoader(dataset, batch_size=4, worker_init_fn=seed_worker, generator=g, shuffle=True)
for i in range(2):
    print("Epoch", i)
    for batch, label in dl:
       print(batch.unsqueeze(1).shape)
       print(batch.unsqueeze(1))
       o = a(batch.unsqueeze(1))
       print(o.shape)
       print(o)

Epoch 0
torch.Size([4, 1, 4])
tensor([[[12., 13., 14., 15.]],

        [[28., 29., 30., 31.]],

        [[20., 21., 22., 23.]],

        [[ 8.,  9., 10., 11.]]])
torch.Size([4, 10, 3])
tensor([[[ -4.5959,  -4.9393,  -5.2827],
         [  8.7039,   9.3886,  10.0732],
         [ -0.8663,  -0.8556,  -0.8448],
         [  6.3920,   6.9061,   7.4203],
         [  0.2911,   0.3711,   0.4510],
         [ -1.7599,  -1.8867,  -2.0135],
         [  2.6198,   2.8765,   3.1331],
         [  5.5991,   6.0079,   6.4167],
         [ -1.3713,  -1.3925,  -1.4138],
         [ -5.1378,  -5.5518,  -5.9657]],

        [[-10.0898, -10.4332, -10.7766],
         [ 19.6582,  20.3429,  21.0275],
         [ -0.6943,  -0.6836,  -0.6728],
         [ 14.6182,  15.1323,  15.6464],
         [  1.5701,   1.6500,   1.7300],
         [ -3.7889,  -3.9157,  -4.0425],
         [  6.7264,   6.9831,   7.2398],
         [ 12.1402,  12.5490,  12.9578],
         [ -1.7114,  -1.7326,  -1.7539],
         [-11.7614, -12.1754, -12.

## Prepare the data and the device

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device="cpu"

In [9]:
with open('../data/whole_sequence.fasta', 'r') as f:
    seqs = list(SeqIO.parse(f, 'fasta'))
seq = {s.id:str(s.seq) for s in seqs}

In [10]:
a = iter(seq.items())

In [11]:
u = islice(a, 10)
dict(u)

{'EH1(72)': 'MLLPETRNLLDLMDAATRGGRPRLETLPHAVGRKAVDKMSEDGEADPPEVAEVANGGFAGPASEIRFRRYRPLGEAAGLLPTLIYYHGGGFVIGNIETHDSTCRRLANKSRCQVISIDYRLAPEHPFPAPIDDGIAAFRHIRDNAESFGADAARLAVGGDSAGGAMAAVVCQACRDAGETGPAFQMLIYPATDSSRESASRVAFAEGYFLSKALMDWFWEAYVPEDTDLTDLRLSPLLATDFTGLPPAFVLTAGYDPLRDEGRAYADRLIEAGIKTTYVNYPGTIHGFFSLTRFLSQGLKANDEAAAVMGAHFGT',
 'EH2(71)': 'MGLQKLIVRTLMKLPESWILKLAGGTPVEIDGRTMDPRIQLLAAQGAKAPSMTSMSIEDARKSADEGLALLDAKPRRTVSILSRTIPGPAGDLHVRIYTPAGATGPLPGIVYYHMGGCVIGNLETCNTFCSILADDCRAIVVSVDYRLAPEHKFPAAMDDAVASFDWVSENAAALGIDPTRLGVGGDSAGGWLSAVVCQTRKAEGKTQPKAQLLIYPATDLDAKEGSMQSCAEIYPLTAEIMDWFMQQFLNSPEDAKDLKASPAHSEDLSGLAPALIMTAGFDVLRDQGEAYGNRLRDAGVPVTYRCYDSLSHAYTAFSGAVPAARQACEEIARDMARALG',
 'EH3(69)': 'MPDTTSLNIADDVRMDPRLKAMLAAFPMMEQQTFQTREEQVANANTPEATAAREQLKMMMDMMDSEEFAPSDNLDISTREFTSSPDGNAIKIQFIRPKGKQKVPCVYYIHGGGMMIMSAFYGNYRAWGKMIANNGVAVAMVDFRNCLSPSSAPEVAPFPAGLNDCVSGLKWVSENADELSIDKNKIIIAGESGGGNLTLATGLKLKQDGNIDLVKGLYALCPYIAGKWPQDRFPSSSENNGIMIELHNNQGALAYGIEQLEAENPLAWPSFASAEDMQGLPPTVINVNECDPLRDEGID

## Initialize the tokenizer and the models

low_cpu_mem_usage: when loading try not to use more memory.

In [12]:
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", low_mem_usage=True)
tok = tokenizer(list(seq.values())[:2], padding=True, truncation=True, return_tensors="pt", is_split_into_words=False)
tok = tok.to(device)

In [13]:
model_8bit = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D", output_hidden_states=True, device_map="auto", load_in_8bit=True)
#model.to(device)
#model.eval()
model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D", add_pooling_layer=False, device_map="cpu")
model_masked_16 = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D", output_hidden_states=True, device_map="cpu", torch_dtype=torch.float16)
model_16 = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D", add_pooling_layer=False, device_map="cpu", torch_dtype=torch.float16) 
# most models might not be able to do inference with float 16. Its errors is lower than bfloat16 but you cannot run it in CPUs maybe
model_masked = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D", output_hidden_states=True, device_map="cpu", torch_dtype=torch.float32)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [48]:
list(model.named_modules())[0]

('',
 EsmModel(
   (embeddings): EsmEmbeddings(
     (word_embeddings): Embedding(33, 320, padding_idx=1)
     (dropout): Dropout(p=0.0, inplace=False)
     (position_embeddings): Embedding(1026, 320, padding_idx=1)
   )
   (encoder): EsmEncoder(
     (layer): ModuleList(
       (0-5): 6 x EsmLayer(
         (attention): EsmAttention(
           (self): EsmSelfAttention(
             (query): Linear(in_features=320, out_features=320, bias=True)
             (key): Linear(in_features=320, out_features=320, bias=True)
             (value): Linear(in_features=320, out_features=320, bias=True)
             (dropout): Dropout(p=0.0, inplace=False)
             (rotary_embeddings): RotaryEmbedding()
           )
           (output): EsmSelfOutput(
             (dense): Linear(in_features=320, out_features=320, bias=True)
             (dropout): Dropout(p=0.0, inplace=False)
           )
           (LayerNorm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
         )
         (interme

In [32]:
torch.finfo(torch.float16)

finfo(resolution=0.001, min=-65504, max=65504, eps=0.000976562, smallest_normal=6.10352e-05, tiny=6.10352e-05, dtype=float16)

In [15]:
torch.set_printoptions(precision=30, sci_mode=False)

In [31]:
torch.tensor(1/3, dtype=torch.float32)

tensor(0.333333343267440795898437500000)

In [70]:
torch.tensor(1/3, dtype=torch.float16)

tensor(0.333251953125000000000000000000, dtype=torch.float16)

In [71]:
torch.tensor(1/3, dtype=torch.bfloat16) # los numeros que puede presentar antes de que sea 0 y la precision que puede presentar es diferente.

tensor(0.333984375000000000000000000000, dtype=torch.bfloat16)

In [22]:
output = model_masked(**tok)
output.logits# logits es lo que le das al softmax para que lo convierta en probabilidad -> softmax(logits) es el output the last linear model
# y que es last hidden state entonces? Es muy diferente a los logits? El shape es diferente -> para cada position devuelve la probabilidad de que sea uno de los tokens
# EL maskedLM y el automodel hidden state es lo mismo -> pero la logits cambia. Cual debería usar para el embedding?
#output.hidden_states[-1] # en el caso de MASkedLM si no le especifico de devolver hidden states, no los devuelve, pero en el caso de automodel si los devuelve aun sin lo del hidden state

tensor([[[ 15.2477,  -7.5770,  -6.3456,  ..., -15.4048, -15.6385,  -7.5723],
         [ -9.7658, -16.5052,  -9.5490,  ..., -15.9087, -16.1523, -16.4937],
         [-12.1465, -22.2140, -12.5623,  ..., -15.8097, -15.8878, -22.1935],
         ...,
         [ -5.5602,  -6.6518,  14.4635,  ..., -16.7472, -16.5519,  -6.6859],
         [ -5.3854,  -6.4475,  16.6540,  ..., -16.7547, -16.5586,  -6.4803],
         [ -5.4969,  -6.6457,  16.6575,  ..., -16.6794, -16.5023,  -6.6868]],

        [[ 16.1664,  -6.1153,  -6.5335,  ..., -15.2296, -15.4523,  -6.1025],
         [ -8.9726, -16.1343,  -7.6063,  ..., -15.9684, -16.2367, -16.1393],
         [-11.5151, -19.6775, -10.5682,  ..., -15.8286, -15.9348, -19.6637],
         ...,
         [-10.4293, -21.1545, -12.9438,  ..., -16.0008, -16.0468, -21.1782],
         [-12.7677, -21.3681, -12.9532,  ..., -16.2232, -16.3621, -21.3540],
         [ -6.0046,  -5.6860,  17.9882,  ..., -16.7294, -16.6201,  -5.7410]]],
       grad_fn=<AddBackward0>)

In [47]:
np.argmax(F.softmax(output.logits, dim=0).detach().numpy(), axis=-1) == np.argmax(F.softmax(output_16.logits, dim=0).detach().numpy(), axis=-1)

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True, False,  True,  True,  True,  True,  True,
         True,  True, False,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
      

In [46]:
np.argmax(F.softmax(output_16.logits, dim=0).detach().numpy(), axis=-1)

array([[21, 21,  4, 13, 14,  9, 11, 10, 17,  4,  4, 13,  4, 20, 13,  5,
         5, 11, 10,  6,  6, 10, 14, 10,  4,  9, 22,  4, 14, 21,  5,  7,
        22, 22, 15,  5,  7, 13, 15, 20,  8,  9, 13,  6, 18,  5, 13, 14,
         1,  9,  7,  5,  2,  7,  5,  2,  1,  6, 18,  5,  6, 14,  5,  6,
         9, 12,  1, 18, 10, 12, 19, 10, 14,  4,  6,  9,  5,  5,  6,  4,
         4, 14,  0,  4, 12, 19, 19, 21,  1,  6,  6, 18,  7, 12,  6, 17,
        12,  9, 11, 21, 13,  8, 22, 23, 10, 10,  4,  5,  9,  9,  8, 10,
        23, 16,  7, 12,  8, 12, 13, 19, 10,  4,  5, 14,  9, 21, 14, 18,
         1,  5,  1, 12, 13, 13,  6, 12,  2,  5, 18, 10, 22, 12, 10, 13,
        17,  5,  9,  8, 18,  6,  5, 13, 14,  5, 10,  4,  1,  7,  6,  6,
         6,  0,  5,  6,  1, 17, 20,  5,  5,  7,  7, 23, 16, 19, 23, 10,
        13, 13,  6, 19, 14, 14, 14, 10, 15, 16, 20,  4, 12, 19, 14, 22,
        11, 13, 10, 10, 14,  9,  8, 14, 14, 13, 14, 14, 18,  5,  9,  6,
        19,  9,  4, 13, 15,  9,  9, 20, 13, 22, 18, 22,  9,  2, 

In [42]:
output_16 = model_masked_16(**tok)

In [41]:
(output.hidden_states[-1] - output_16.hidden_states[-1]).max(axis=1)[0].max()

tensor(0.0912, grad_fn=<MaxBackward1>)

In [45]:
model_masked_16 = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D", output_hidden_states=True, 
                                                       device_map="cpu", torch_dtype=torch.float16)

model_masked = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D", output_hidden_states=True, 
                                                    device_map="cpu", torch_dtype=torch.float32)

output = model_masked(**tok)
output_16 = model_masked_16(**tok)

print("max difference:", (output.hidden_states[-1] - output_16.hidden_states[-1]).max().detach().numpy())
print("mean difference:", (output.hidden_states[-1] - output_16.hidden_states[-1]).abs().mean().detach().numpy())

max difference: 0.01247859
mean difference: 0.00063062285


In [54]:
output = model(**tok)
output # si no le especificas hidden states -> tampoco tiene hidden states, sino que tiene last_hidden state

torch.Size([2, 343, 320])

In [None]:
output_16 = model_16(**tok)
output_16.last_hidden_state

tensor([[[ 0.1421,  0.5840, -0.0724,  ...,  1.1748, -0.0931, -0.4209],
         [ 0.4133,  0.0937, -0.1658,  ...,  0.8398, -0.2219, -0.2437],
         [ 0.0767, -0.5088, -0.0298,  ...,  0.3887, -0.0302,  0.1070],
         ...,
         [-0.2573,  0.2483,  0.5522,  ...,  0.6455, -0.5259, -0.0944],
         [-0.3516,  0.2590,  0.5864,  ...,  0.4985, -0.5669, -0.1550],
         [-0.3301,  0.2368,  0.2407,  ...,  0.4675, -0.6846, -0.3098]],

        [[ 0.0927,  0.6987, -0.0489,  ...,  1.0352, -0.1703, -0.3040],
         [ 0.3230,  0.4792, -0.1464,  ...,  0.7700, -0.2129, -0.2917],
         [ 0.0061, -0.2739,  0.2622,  ..., -0.0435,  0.2888,  0.1104],
         ...,
         [-0.4729, -0.2120, -0.2720,  ...,  0.9448, -0.3599,  0.2312],
         [-0.0944, -0.4836,  0.0655,  ...,  0.3899, -0.1250, -0.1101],
         [-0.0054,  0.0670,  0.1100,  ...,  0.6562, -0.6016, -0.2551]]],
       dtype=torch.float16, grad_fn=<NativeLayerNormBackward0>)

In [None]:
model_float16.get_memory_footprint()/1e+6
#model.get_memory_footprint()/1e+6

15.483202

In [None]:
difference_16 = output.last_hidden_state - output_16.last_hidden_state

In [None]:
difference_16.max(axis=1).values.max()

tensor(0.0125, grad_fn=<MaxBackward1>)

In [None]:
output_8bit = model_8bit(**tok)
output_8bit.logits

tensor([[[ 15.0938,  -7.5859,  -6.3984,  ..., -15.4062, -15.6328,  -7.5781],
         [ -9.7422, -16.4844,  -9.3047,  ..., -15.8906, -16.1406, -16.4688],
         [-11.9688, -21.8438, -12.3438,  ..., -15.7969, -15.8750, -21.8281],
         ...,
         [ -5.5430,  -6.7969,  14.7812,  ..., -16.7656, -16.5781,  -6.8281],
         [ -5.4414,  -6.3516,  17.0000,  ..., -16.7031, -16.5156,  -6.3867],
         [ -5.4102,  -6.6562,  16.5625,  ..., -16.6719, -16.4844,  -6.6953]],

        [[ 16.1562,  -5.9922,  -6.4141,  ..., -15.2656, -15.4844,  -5.9805],
         [ -9.0000, -15.7500,  -7.1562,  ..., -15.9609, -16.2344, -15.7500],
         [-11.6250, -19.8281, -10.8594,  ..., -15.8203, -15.9297, -19.8125],
         ...,
         [-10.2578, -20.8750, -12.7188,  ..., -15.9688, -16.0312, -20.9062],
         [-12.6641, -21.1250, -12.9375,  ..., -16.2031, -16.3438, -21.1094],
         [ -6.0078,  -5.7773,  17.8594,  ..., -16.7188, -16.5938,  -5.8281]]],
       grad_fn=<ToCopyBackward0>)

In [27]:
difference = output.logits - output_8bit.logits
state = output.hidden_states[-1]

In [52]:
difference[0,:,-11].max()

tensor(0.4023, grad_fn=<MaxBackward1>)

In [35]:
difference.mean()

tensor(4.7880e-05, grad_fn=<MeanBackward0>)

In [35]:
torch.nn.AvgPool1d(2)(output.last_hidden_state[-1]).shape

torch.Size([343, 160])

## Use the attention mask to remove the padding

In [52]:
results = {}
mask = tok["attention_mask"].bool()
for num, x in enumerate(output.last_hidden_state):
    masked_x = x[mask[num]]
    results[num] = masked_x.mean(dim=0).detach().cpu().numpy()
    # detach removes the tensor from the computation graph (the gradient won't be computed)

In [71]:
torch.max(results[0], dim=0)[0].shape

torch.Size([320])

In [103]:
path = "data2.csv"
embeddings = pd.DataFrame(results).T
embeddings.to_csv(path, mode='a', header=not Path(path).exists())

In [21]:
x[tok["attention_mask"].bool()[1]].shape

torch.Size([343, 320])

## Adapt the code to Load large datasets

In [5]:
from datasets import load_dataset, Dataset
from Bio import SeqIO
import pandas as pd
# https://huggingface.co/docs/datasets/loading

There are many ways to load files within datasets -> from local or remote files (in these formats json, csv, text, parquet)  
Since we have a fasta file that is not supported (because it will treat each line as a row so it will double the rows, but in fasta the first line is an id).  
So we can process it in-memory to pandas, generators, dictionaries or list of dictionaries and use Datasets instead of load_dataset.  
The load dataset returns a dataset dict with different splits (train, test, val) as keys and then a dataset object as values.

But we are using a dataset object directl

To load fasta files use from generator beacause it is in-memory and the file might be too large to process.

In [48]:
a = load_dataset("text", data_files="../data/whole_sequence.fasta")
a

Generating train split: 294 examples [00:00, 6184.43 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 294
    })
})

In [49]:
def fasta_generator(fasta_file: str="../data/whole_sequence.fasta"):
    with open(fasta_file, 'r') as f:
        seqs = SeqIO.parse(f, 'fasta')
        for seq in seqs:
            yield {"id":seq.id, "seq":str(seq.seq)}

with open("../data/whole_sequence.fasta", 'r') as f:
    seqs = SeqIO.parse(f, 'fasta')
    d = pd.Series({s.id:str(s.seq) for s in seqs}).to_frame()
    d.columns = ["sequences"]

In [50]:
b = Dataset.from_generator(fasta_generator, gen_kwargs={"fasta_file":"../data/whole_sequence.fasta"})
b

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 147 examples [00:00, 21001.52 examples/s]


Dataset({
    features: ['id', 'seq'],
    num_rows: 147
})

## Process or tokenize

Use map to apply the tokenizer function to the entire dataset
The map will create and add the new columns ('input_ids', 'attention_mask') coming from the tokenizer to the datatset   
but you will have to change its format to torch tensors for the models to read it

In [65]:
dataset = b.map(lambda examples: tokenizer(examples["seq"], return_tensors="pt",padding=True, truncation=True), batched=True)

In [66]:
u = dataset.select_columns(["id","input_ids", "attention_mask"])
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"], device=device)

Now to extract the embeddings use the dataloader from pytorch to create the batches for you  
It will only return the input_ids and the attention mask (the ids are lost, so yoou don't know which sequence is which)

In [67]:
dataloader = DataLoader(dataset, batch_size=4)
for batch in dataloader:
    u = batch
u

{'input_ids': tensor([[ 0, 20, 10,  ...,  1,  1,  1],
         [ 0, 20, 15,  ...,  1,  1,  1],
         [ 0, 20, 15,  ...,  1,  1,  1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [1]:
import BioML.deep.embeddings as emb

In [2]:
data = emb.TokenizeFasta(emb.LLMConfig()).tokenize("../data/whole_sequence.fasta")
embed = emb.ExtractEmbeddings(emb.LLMConfig())
seq_keys = list(data["id"])

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

In [3]:
batch_size = 8

In [9]:
for num, batch in enumerate(DataLoader(data, batch_size=batch_size)):
    batch_seq_keys = seq_keys[num*batch_size:(num+1)*batch_size]
    results = embed.extract(batch_seq_keys, batch)
    #embed.save(results, "../data/embeddings.csv")

In [15]:
embeddings = pd.read_csv("../data/embeddings.csv", index_col=0)

## Other ways to create emebeddings

In [82]:
attention_weights = torch.nn.Linear(320, 1)

In [87]:
attention_scores = attention_weights(output.hidden_states[-1])
attention_weights = torch.softmax(attention_scores, -1)

TypeError: 'Tensor' object is not callable

In [89]:
attention_weights.shape

torch.Size([2, 343, 1])

In [29]:
_temp = output.hidden_states[-1].reshape(output.hidden_states[-1].shape[0], -1)
_temp.shape

torch.Size([2, 109760])

In [53]:
_temp[0]

tensor([ 0.1419,  0.5839, -0.0722,  ...,  0.4682, -0.6849, -0.3094],
       grad_fn=<SelectBackward0>)

In [32]:
(0, 2048 - _temp.shape[1])

(0, -107712)

In [66]:
o = torch.nn.functional.pad(_temp, (0, 2048 - _temp.shape[1]))

In [68]:
o[0][:10]

tensor([ 0.1419,  0.5839, -0.0722,  0.3390, -0.1853, -0.0982, -0.9235,  0.1019,
        -0.4527, -0.6959], grad_fn=<SliceBackward0>)

In [50]:
len(set(o[0].detach().numpy()).intersection(_temp[0].detach().numpy()))xx

109670

In [74]:
len(set(output.hidden_states[-1][0][0].detach().numpy()).intersection(_temp[0][:100].detach().numpy()))

100

# Test training using the embeddings

## Regression

In [76]:
import BioML.models.regression as regression

In [80]:
embeddings = pd.read_csv("../data/embeddings.csv", index_col=0)
label = list(range(len(embeddings)))

In [82]:
data = regression.DataParser("../data/embeddings.csv", label)
experiment = regression.PycaretInterface("regression", 200, scaler= "zscore", budget_time=20, best_model=3, 
                                        output_path="regression_training", optimize="RMSE")

regressor = regression.Regressor(test_size=0.2, optimize="RMSE")
training = regression.Trainer(experiment, regressor, 5, 30)


23-02-2024 12:07:16 INFO ------------------------------------------------------------------------------
23-02-2024 12:07:16 INFO PycaretInterface parameters
23-02-2024 12:07:16 INFO Seed: 200
23-02-2024 12:07:16 INFO Budget time: 20
23-02-2024 12:07:16 INFO The number of models to select: 3
23-02-2024 12:07:16 INFO Output path: regression_training
23-02-2024 12:07:16 INFO ----------------Trainer inputs-------------------------
23-02-2024 12:07:16 INFO Number of kfolds: 5
23-02-2024 12:07:16 INFO Number of iterations: 30


Split the data according to sequence similarity

In [83]:
c = regression.split.ClusterSpliter("../data/resultsDB_clu.tsv", 5, random_state=experiment.seed, test_size=0.2)
X_train, X_test = c.train_test_split(data.features)

In [85]:
results, models_dict = training.generate_training_results(X_train, data.label, True,
                                                          test_data=X_test, fold_strategy=c)

2024/02/23 12:18:52 INFO mlflow.tracking.fluent: Experiment with name 'Regression' does not exist. Creating a new experiment.
23-02-2024 12:18:53 INFO --------------------------------------------------------
23-02-2024 12:18:53 INFO Training regression models
23-02-2024 12:18:53 INFO The models used ['lr', 'lasso', 'ridge', 'en', 'lar', 'llar', 'omp', 'br', 'par', 'huber', 'svm', 'knn', 'dt', 'rf', 'et', 'gbr', 'mlp', 'xgboost', 'catboost', 'dummy']
23-02-2024 12:18:53 INFO Time budget is 20 minutes
23-02-2024 12:20:27 INFO Training over: Total runtime 1.565 minutes
23-02-2024 12:20:27 INFO Analyse the best models and plotting them
23-02-2024 12:20:27 INFO Analyse the top 1 model: catboost
23-02-2024 12:24:10 INFO Analyse the top 2 model: br
23-02-2024 12:24:13 INFO Analyse the top 3 model: rf
23-02-2024 12:24:27 INFO --------Stacking the best models--------
23-02-2024 12:24:27 INFO ----------Stacking the best models--------------
23-02-2024 12:27:58 INFO --------Creating an ensemble m