In [12]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, BitsAndBytesConfig, AutoModelForSequenceClassification
from Bio import SeqIO
import torch
from torch.utils.data import DataLoader, TensorDataset
import random
import numpy as np
import torch.nn as nn
# https://huggingface.co/blog/AmelieSchreiber/esmbind
# the minimum for the ESM2 is 650M if we want better performance than ESM1b with 650M as well.
import pandas as pd
from pathlib import Path
from itertools import islice
import torch.nn.functional as F
import bitsandbytes as bnb

In [13]:
dataset = TensorDataset(torch.arange(40, dtype=torch.float32).view(10, 4), torch.tensor([i for i in range(10)]))

In [14]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


In [15]:
torch.arange(40).shape

torch.Size([40])

In [16]:
a = nn.Conv1d(1, 10, 2)
g = torch.Generator()
g.manual_seed(0)
dl = DataLoader(dataset, batch_size=4, worker_init_fn=seed_worker, generator=g, shuffle=True)
for i in range(2):
    print("Epoch", i)
    for batch, label in dl:
       print(batch.unsqueeze(1).shape)
       print(batch.unsqueeze(1))
       o = a(batch.unsqueeze(1))
       print(o.shape)
       print(o)

Epoch 0
torch.Size([4, 1, 4])
tensor([[[12., 13., 14., 15.]],

        [[28., 29., 30., 31.]],

        [[20., 21., 22., 23.]],

        [[ 8.,  9., 10., 11.]]])
torch.Size([4, 10, 3])
tensor([[[ 12.9956,  14.0420,  15.0884],
         [ 11.4176,  12.3456,  13.2735],
         [ -9.6122, -10.3326, -11.0529],
         [ -1.6359,  -1.8254,  -2.0149],
         [  5.0438,   5.3842,   5.7246],
         [  0.7708,   0.8625,   0.9543],
         [  8.8450,   9.5876,  10.3303],
         [  8.6126,   9.3116,  10.0106],
         [  6.2343,   6.7019,   7.1694],
         [ -8.3430,  -9.0782,  -9.8133]],

        [[ 29.7380,  30.7844,  31.8308],
         [ 26.2647,  27.1926,  28.1205],
         [-21.1378, -21.8582, -22.5785],
         [ -4.6684,  -4.8579,  -5.0474],
         [ 10.4907,  10.8312,  11.1716],
         [  2.2389,   2.3306,   2.4224],
         [ 20.7272,  21.4699,  22.2125],
         [ 19.7959,  20.4949,  21.1938],
         [ 13.7148,  14.1824,  14.6499],
         [-20.1057, -20.8408, -21.

## Prepare the data and the device

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [18]:
with open('../data/whole_sequence.fasta', 'r') as f:
    seqs = list(SeqIO.parse(f, 'fasta'))
seq = {s.id:str(s.seq) for s in seqs}

In [19]:
a = iter(seq.items())

In [20]:
u = islice(a, 10)
dict(u)

{'EH1(72)': 'MLLPETRNLLDLMDAATRGGRPRLETLPHAVGRKAVDKMSEDGEADPPEVAEVANGGFAGPASEIRFRRYRPLGEAAGLLPTLIYYHGGGFVIGNIETHDSTCRRLANKSRCQVISIDYRLAPEHPFPAPIDDGIAAFRHIRDNAESFGADAARLAVGGDSAGGAMAAVVCQACRDAGETGPAFQMLIYPATDSSRESASRVAFAEGYFLSKALMDWFWEAYVPEDTDLTDLRLSPLLATDFTGLPPAFVLTAGYDPLRDEGRAYADRLIEAGIKTTYVNYPGTIHGFFSLTRFLSQGLKANDEAAAVMGAHFGT',
 'EH2(71)': 'MGLQKLIVRTLMKLPESWILKLAGGTPVEIDGRTMDPRIQLLAAQGAKAPSMTSMSIEDARKSADEGLALLDAKPRRTVSILSRTIPGPAGDLHVRIYTPAGATGPLPGIVYYHMGGCVIGNLETCNTFCSILADDCRAIVVSVDYRLAPEHKFPAAMDDAVASFDWVSENAAALGIDPTRLGVGGDSAGGWLSAVVCQTRKAEGKTQPKAQLLIYPATDLDAKEGSMQSCAEIYPLTAEIMDWFMQQFLNSPEDAKDLKASPAHSEDLSGLAPALIMTAGFDVLRDQGEAYGNRLRDAGVPVTYRCYDSLSHAYTAFSGAVPAARQACEEIARDMARALG',
 'EH3(69)': 'MPDTTSLNIADDVRMDPRLKAMLAAFPMMEQQTFQTREEQVANANTPEATAAREQLKMMMDMMDSEEFAPSDNLDISTREFTSSPDGNAIKIQFIRPKGKQKVPCVYYIHGGGMMIMSAFYGNYRAWGKMIANNGVAVAMVDFRNCLSPSSAPEVAPFPAGLNDCVSGLKWVSENADELSIDKNKIIIAGESGGGNLTLATGLKLKQDGNIDLVKGLYALCPYIAGKWPQDRFPSSSENNGIMIELHNNQGALAYGIEQLEAENPLAWPSFASAEDMQGLPPTVINVNECDPLRDEGID

## Initialize the tokenizer and the models

In [45]:
from torchmetrics.functional.classification import (
    accuracy, f1_score, precision, recall, auroc, average_precision, cohen_kappa, confusion_matrix, 
    matthews_corrcoef) 
from torchmetrics.functional.regression import (
    mean_absolute_error, mean_squared_error,  pearson_corrcoef, kendall_rank_corrcoef, r2_score,
    mean_absolute_percentage_error, mean_squared_log_error)

low_cpu_mem_usage: when loading try not to use more memory.

In [21]:
bnb_config = BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_use_double_quant=True,
                        bnb_4bit_quant_type="nf4",
                        bnb_4bit_compute_dtype=torch.bfloat16)


In [83]:
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", low_mem_usage=True)
tok = tokenizer(list(seq.values())[:3], padding=True, truncation=True, return_tensors="pt", is_split_into_words=False)
tok["input_ids"].shape

torch.Size([3, 350])

In [23]:
#model.to(device)
#model.eval()
model_masked_16 = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D", output_hidden_states=True, device_map="cpu", torch_dtype=torch.float16)
model_16 = AutoModelForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", device_map="cpu", torch_dtype=torch.float16) 
# most models might not be able to do inference with float 16. Its errors is lower than bfloat16 but you cannot run it in CPUs maybe
model_masked = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D", output_hidden_states=True, device_map="cpu", torch_dtype=torch.float32)

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
state3 = model_16(**tok, output_hidden_states=True)

In [110]:
state3["logits"]

tensor([[-0.0303, -0.0179],
        [-0.0428,  0.0095],
        [-0.0320, -0.0229]], dtype=torch.float16, grad_fn=<AddmmBackward0>)

In [93]:
pred = torch.softmax(state3["logits"], dim=-1)
target = torch.tensor([0, 1, 0])
arg_pred = torch.argmax(pred, dim=-1)
arg_pred

tensor([1, 1, 1])

In [108]:
pred

tensor([[0.4968, 0.5029],
        [0.4868, 0.5132],
        [0.4978, 0.5024]], dtype=torch.float16, grad_fn=<SoftmaxBackward0>)

In [135]:
precision(preds=pred, target=target, num_classes=2, task="multiclass")

tensor(0.3333)

In [127]:
precision(preds=arg_pred, target=target, num_classes=2, task="binary")

tensor(0.3333)

In [49]:
for x, num in enumerate(state3.hidden_states[-1]):
    print(x, num.shape)

0 torch.Size([113, 1280])


In [36]:
state3.hidden_states[0]

tensor([[[ 0.3842,  0.2355, -0.0170,  ..., -0.0648,  0.0435,  0.3384],
         [-0.3330,  0.1190, -0.1903,  ...,  0.0394,  0.6241,  0.1100],
         [ 0.0212,  0.1590,  0.2530,  ..., -0.6242, -0.3696, -0.0616],
         ...,
         [ 0.4657,  0.3142,  0.5557,  ..., -0.3871, -1.2033,  0.1668],
         [ 0.6945,  0.1695,  0.3626,  ..., -0.0984, -0.4144,  0.3217],
         [-0.6857,  0.2913,  1.1568,  ..., -0.8662, -0.8437, -0.0968]]],
       grad_fn=<AddBackward0>)

In [29]:
np.squeeze(state3.hidden_states[0].cpu().detach().numpy()).mean(axis=0).shape


(1280,)

In [1]:
from BioML.deep import embeddings

embeddings.generate_embeddings('../data/whole_sequence.fasta', model_name="ElnaggarLab/ankh-large-encoder", option="mean", save_path="embeddings.csv", 
                               mode="write", batch_size=1, from_flax=True)

flax_model.msgpack:   0%|          | 0.00/4.61G [00:00<?, ?B/s]

Loading a Flax model in PyTorch, requires both PyTorch and Flax to be installed. Please see https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for installation instructions.


ModuleNotFoundError: No module named 'jax'

In [26]:
quant_8 = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=1.0)
model_8bit = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D", add_pooling_layer=False, device_map="cuda", quantization_config=quant_8)

In [32]:
torch.finfo(torch.float16)

finfo(resolution=0.001, min=-65504, max=65504, eps=0.000976562, smallest_normal=6.10352e-05, tiny=6.10352e-05, dtype=float16)

In [3]:
torch.set_printoptions(precision=30, sci_mode=False)

In [4]:
torch.tensor(100_000, dtype=torch.float32)

tensor(100000.)

In [5]:
torch.tensor(100_000, dtype=torch.float16)

tensor(inf, dtype=torch.float16)

In [6]:
torch.tensor(100_000, dtype=torch.bfloat16) 
# los numeros que puede presentar antes de que sea 0 y la precision que puede presentar es diferente.

tensor(99840., dtype=torch.bfloat16)

In [13]:
model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D", add_pooling_layer=False, device_map="auto", 
                                  quantization_config=None)

#output.logits# logits es lo que le das al softmax para que lo convierta en probabilidad -> softmax(logits) es el output the last linear model
# y que es last hidden state entonces? Es muy diferente a los logits? El shape es diferente -> para cada position devuelve la probabilidad de que sea uno de los tokens
# EL maskedLM y el automodel hidden state es lo mismo -> pero la logits cambia. Cual debería usar para el embedding?
#output.hidden_states[-1] # en el caso de MASkedLM si no le especifico de devolver hidden states, no los devuelve, pero en el caso de automodel si los devuelve aun sin lo del hidden state

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

# Method 1: all linear layers will use the same quantization config
quant_config  = HqqConfig(nbits=8, group_size=64, quant_zero=False, quant_scale=False, axis=0) #axis=0 is used by default

model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    torch_dtype=torch.float16, 
    device_map="cuda", 
    quantization_config=quant_config
)

ImportError: cannot import name 'HqqConfig' from 'transformers' (/home/ruite/miniconda3/envs/bioml_pycaret/lib/python3.10/site-packages/transformers/__init__.py)

In [9]:
from transformers import AutoModel, AutoModelForMaskedLM, BitsAndBytesConfig

model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D", add_pooling_layer=False, device_map="cpu")
model_16 = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D", add_pooling_layer=False, device_map="cpu",  torch_dtype=torch.float16) 
# the threshold determines the values that are considered outliers and are calculated using 16 bit precision
# the smaller it is the less memory it will save because, at the eveyrhting will be calculated with 16 bits
quant_8 = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=1.0) 
model_8bit = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D", add_pooling_layer=False, device_map="cuda", 
                                       quantization_config=quant_8)
output = model(**tok) # full 32 precision
output_16 = model_16(**tok) # half 16 precision
output_8 = model_8bit(**tok) # quatized to 8 bits
data = {32: output.last_hidden_state, 16: output_16.last_hidden_state, 8: output_8.last_hidden_state}


RuntimeError: No GPU found. A GPU is needed for quantization.

In [10]:
output = model(**tok)

In [25]:
for num, x in enumerate(output.last_hidden_state):
    print(x.shape)


torch.Size([343, 320])
torch.Size([343, 320])


In [26]:
torch.flatten(x, start_dim=1).shape

torch.Size([343, 320])

In [36]:
print("mean of max:", (data[32] - data[16]).abs().max(axis=1)[0].mean().detach().numpy(), "mean:", 
      (data[32] - data[16]).abs().mean().detach().numpy())

mean of max: 0.0029924903 mean: 0.00063062285


In [37]:
print("mean of max:", (data[32] - data[8]).abs().max(axis=1)[0].mean().detach().numpy(), "mean:", 
      (data[32] - data[8]).abs().mean().detach().numpy())

mean of max: 0.053529274 mean: 0.012010117


In [13]:
output = model_masked(**tok)

In [33]:
soft = F.softmax(output.logits, dim=-1).detach()
soft.shape

torch.Size([2, 343, 33])

In [38]:
torch.argmax(soft, dim=1)

tensor([[  0, 336, 316, 336, 258, 145, 163, 114, 161, 124, 120,  98, 115, 256,
         123, 212, 185, 144, 127, 119,   1, 286, 140, 103, 165, 165, 165, 165,
         165, 165, 165, 165, 336],
        [  0, 281, 342, 281, 269, 172, 190, 144, 188, 151, 181, 219,  97, 283,
         150, 153, 212, 171, 154, 146,   1, 313, 167, 130, 192, 222, 192, 192,
         192, 192, 192, 192, 281]])

In [45]:
model_masked_16 = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D", output_hidden_states=True, 
                                                       device_map="cpu", torch_dtype=torch.float16)

model_masked = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D", output_hidden_states=True, 
                                                    device_map="cpu", torch_dtype=torch.float32)

output = model_masked(**tok)
output_16 = model_masked_16(**tok)

print("max difference:", (output.hidden_states[-1] - output_16.hidden_states[-1]).max().detach().numpy())
print("mean difference:", (output.hidden_states[-1] - output_16.hidden_states[-1]).abs().mean().detach().numpy())

max difference: 0.01247859
mean difference: 0.00063062285


In [54]:
output = model(**tok)
output # si no le especificas hidden states -> tampoco tiene hidden states, sino que tiene last_hidden state

torch.Size([2, 343, 320])

In [None]:
output_16 = model_16(**tok)
output_16.last_hidden_state

tensor([[[ 0.1421,  0.5840, -0.0724,  ...,  1.1748, -0.0931, -0.4209],
         [ 0.4133,  0.0937, -0.1658,  ...,  0.8398, -0.2219, -0.2437],
         [ 0.0767, -0.5088, -0.0298,  ...,  0.3887, -0.0302,  0.1070],
         ...,
         [-0.2573,  0.2483,  0.5522,  ...,  0.6455, -0.5259, -0.0944],
         [-0.3516,  0.2590,  0.5864,  ...,  0.4985, -0.5669, -0.1550],
         [-0.3301,  0.2368,  0.2407,  ...,  0.4675, -0.6846, -0.3098]],

        [[ 0.0927,  0.6987, -0.0489,  ...,  1.0352, -0.1703, -0.3040],
         [ 0.3230,  0.4792, -0.1464,  ...,  0.7700, -0.2129, -0.2917],
         [ 0.0061, -0.2739,  0.2622,  ..., -0.0435,  0.2888,  0.1104],
         ...,
         [-0.4729, -0.2120, -0.2720,  ...,  0.9448, -0.3599,  0.2312],
         [-0.0944, -0.4836,  0.0655,  ...,  0.3899, -0.1250, -0.1101],
         [-0.0054,  0.0670,  0.1100,  ...,  0.6562, -0.6016, -0.2551]]],
       dtype=torch.float16, grad_fn=<NativeLayerNormBackward0>)

In [None]:
model_float16.get_memory_footprint()/1e+6
#model.get_memory_footprint()/1e+6

15.483202

In [None]:
difference_16 = output.last_hidden_state - output_16.last_hidden_state

In [None]:
difference_16.max(axis=1).values.max()

tensor(0.0125, grad_fn=<MaxBackward1>)

In [None]:
output_8bit = model_8bit(**tok)
output_8bit.logits

tensor([[[ 15.0938,  -7.5859,  -6.3984,  ..., -15.4062, -15.6328,  -7.5781],
         [ -9.7422, -16.4844,  -9.3047,  ..., -15.8906, -16.1406, -16.4688],
         [-11.9688, -21.8438, -12.3438,  ..., -15.7969, -15.8750, -21.8281],
         ...,
         [ -5.5430,  -6.7969,  14.7812,  ..., -16.7656, -16.5781,  -6.8281],
         [ -5.4414,  -6.3516,  17.0000,  ..., -16.7031, -16.5156,  -6.3867],
         [ -5.4102,  -6.6562,  16.5625,  ..., -16.6719, -16.4844,  -6.6953]],

        [[ 16.1562,  -5.9922,  -6.4141,  ..., -15.2656, -15.4844,  -5.9805],
         [ -9.0000, -15.7500,  -7.1562,  ..., -15.9609, -16.2344, -15.7500],
         [-11.6250, -19.8281, -10.8594,  ..., -15.8203, -15.9297, -19.8125],
         ...,
         [-10.2578, -20.8750, -12.7188,  ..., -15.9688, -16.0312, -20.9062],
         [-12.6641, -21.1250, -12.9375,  ..., -16.2031, -16.3438, -21.1094],
         [ -6.0078,  -5.7773,  17.8594,  ..., -16.7188, -16.5938,  -5.8281]]],
       grad_fn=<ToCopyBackward0>)

In [27]:
difference = output.logits - output_8bit.logits
state = output.hidden_states[-1]

In [52]:
difference[0,:,-11].max()

tensor(0.4023, grad_fn=<MaxBackward1>)

In [35]:
difference.mean()

tensor(4.7880e-05, grad_fn=<MeanBackward0>)

In [35]:
torch.nn.AvgPool1d(2)(output.last_hidden_state[-1]).shape

torch.Size([343, 160])

## Use the attention mask to remove the padding

In [52]:
results = {}
mask = tok["attention_mask"].bool()
for num, x in enumerate(output.last_hidden_state):
    masked_x = x[mask[num]]
    results[num] = masked_x.mean(dim=0).detach().cpu().numpy()
    # detach removes the tensor from the computation graph (the gradient won't be computed)

In [71]:
torch.max(results[0], dim=0)[0].shape

torch.Size([320])

In [103]:
path = "data2.csv"
embeddings = pd.DataFrame(results).T
embeddings.to_csv(path, mode='a', header=not Path(path).exists())

In [21]:
x[tok["attention_mask"].bool()[1]].shape

torch.Size([343, 320])

## Adapt the code to Load large datasets

In [1]:
from datasets import load_dataset, Dataset
from Bio import SeqIO
import pandas as pd
# https://huggingface.co/docs/datasets/loading

There are many ways to load files within datasets -> from local or remote files (in these formats json, csv, text, parquet)  
Since we have a fasta file that is not supported (because it will treat each line as a row so it will double the rows, but in fasta the first line is an id).  
So we can process it in-memory to pandas, generators, dictionaries or list of dictionaries and use Datasets instead of load_dataset.  
The load dataset returns a dataset dict with different splits (train, test, val) as keys and then a dataset object as values.

But we are using a dataset object directl

To load fasta files use from generator beacause it is in-memory and the file might be too large to process.

In [2]:
a = load_dataset("text", data_files="../data/whole_sequence.fasta")
a

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 294
    })
})

In [3]:
def fasta_generator(fasta_file: str="../data/whole_sequence.fasta"):
    with open(fasta_file, 'r') as f:
        seqs = SeqIO.parse(f, 'fasta')
        for seq in seqs:
            yield {"id":seq.id, "seq":str(seq.seq)}

with open("../data/whole_sequence.fasta", 'r') as f:
    seqs = SeqIO.parse(f, 'fasta')
    d = pd.Series({s.id:str(s.seq) for s in seqs}).to_frame()
    d.columns = ["sequences"]

In [18]:
b = Dataset.from_generator(fasta_generator, gen_kwargs={"fasta_file":"../data/whole_sequence.fasta"})
b

Dataset({
    features: ['id', 'seq'],
    num_rows: 147
})

## Process or tokenize

Use map to apply the tokenizer function to the entire dataset
The map will create and add the new columns ('input_ids', 'attention_mask') coming from the tokenizer to the datatset   
but you will have to change its format to torch tensors for the models to read it

In [19]:
dataset = b.map(lambda examples: tokenizer(examples["seq"], return_tensors="pt",padding=True, truncation=True), batched=True)

In [20]:
u = dataset.select_columns(["id","input_ids", "attention_mask"])
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"], device=device)

In [21]:
dataset

Dataset({
    features: ['id', 'seq', 'input_ids', 'attention_mask'],
    num_rows: 147
})

Now to extract the embeddings use the dataloader from pytorch to create the batches for you  
It will only return the input_ids and the attention mask (the ids are lost, so yoou don't know which sequence is which)

In [24]:
dataloader = DataLoader(dataset, batch_size=4)
for batch in dataloader:
    u = batch
u["input_ids"]

tensor([[ 0, 20, 10,  ...,  1,  1,  1],
        [ 0, 20, 15,  ...,  1,  1,  1],
        [ 0, 20, 15,  ...,  1,  1,  1]])

In [1]:
import BioML.deep.embeddings as emb

In [2]:
data = emb.TokenizeFasta(emb.LLMConfig()).tokenize("../data/whole_sequence.fasta")
embed = emb.ExtractEmbeddings(emb.LLMConfig())
seq_keys = list(data["id"])

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

In [3]:
batch_size = 8

In [9]:
for num, batch in enumerate(DataLoader(data, batch_size=batch_size)):
    batch_seq_keys = seq_keys[num*batch_size:(num+1)*batch_size]
    results = embed.extract(batch_seq_keys, batch)
    #embed.save(results, "../data/embeddings.csv")

In [15]:
embeddings = pd.read_csv("../data/embeddings.csv", index_col=0)

## Other ways to create emebeddings

In [82]:
attention_weights = torch.nn.Linear(320, 1)

In [87]:
attention_scores = attention_weights(output.hidden_states[-1])
attention_weights = torch.softmax(attention_scores, -1)

TypeError: 'Tensor' object is not callable

In [89]:
attention_weights.shape

torch.Size([2, 343, 1])

In [29]:
_temp = output.hidden_states[-1].reshape(output.hidden_states[-1].shape[0], -1)
_temp.shape

torch.Size([2, 109760])

In [53]:
_temp[0]

tensor([ 0.1419,  0.5839, -0.0722,  ...,  0.4682, -0.6849, -0.3094],
       grad_fn=<SelectBackward0>)

In [32]:
(0, 2048 - _temp.shape[1])

(0, -107712)

In [66]:
o = torch.nn.functional.pad(_temp, (0, 2048 - _temp.shape[1]))

In [68]:
o[0][:10]

tensor([ 0.1419,  0.5839, -0.0722,  0.3390, -0.1853, -0.0982, -0.9235,  0.1019,
        -0.4527, -0.6959], grad_fn=<SliceBackward0>)

In [50]:
len(set(o[0].detach().numpy()).intersection(_temp[0].detach().numpy()))xx

109670

In [74]:
len(set(output.hidden_states[-1][0][0].detach().numpy()).intersection(_temp[0][:100].detach().numpy()))

100

# Test training using the embeddings

## Regression

In [76]:
import BioML.models.regression as regression

In [80]:
embeddings = pd.read_csv("../data/embeddings.csv", index_col=0)
label = list(range(len(embeddings)))

In [82]:
data = regression.DataParser("../data/embeddings.csv", label)
experiment = regression.PycaretInterface("regression", 200, scaler= "zscore", budget_time=20, best_model=3, 
                                        output_path="regression_training", optimize="RMSE")

regressor = regression.Regressor(test_size=0.2, optimize="RMSE")
training = regression.Trainer(experiment, regressor, 5, 30)


23-02-2024 12:07:16 INFO ------------------------------------------------------------------------------
23-02-2024 12:07:16 INFO PycaretInterface parameters
23-02-2024 12:07:16 INFO Seed: 200
23-02-2024 12:07:16 INFO Budget time: 20
23-02-2024 12:07:16 INFO The number of models to select: 3
23-02-2024 12:07:16 INFO Output path: regression_training
23-02-2024 12:07:16 INFO ----------------Trainer inputs-------------------------
23-02-2024 12:07:16 INFO Number of kfolds: 5
23-02-2024 12:07:16 INFO Number of iterations: 30


Split the data according to sequence similarity

In [83]:
c = regression.split.ClusterSpliter("../data/resultsDB_clu.tsv", 5, random_state=experiment.seed, test_size=0.2)
X_train, X_test = c.train_test_split(data.features)

In [85]:
results, models_dict = training.generate_training_results(X_train, data.label, True,
                                                          test_data=X_test, fold_strategy=c)

2024/02/23 12:18:52 INFO mlflow.tracking.fluent: Experiment with name 'Regression' does not exist. Creating a new experiment.
23-02-2024 12:18:53 INFO --------------------------------------------------------
23-02-2024 12:18:53 INFO Training regression models
23-02-2024 12:18:53 INFO The models used ['lr', 'lasso', 'ridge', 'en', 'lar', 'llar', 'omp', 'br', 'par', 'huber', 'svm', 'knn', 'dt', 'rf', 'et', 'gbr', 'mlp', 'xgboost', 'catboost', 'dummy']
23-02-2024 12:18:53 INFO Time budget is 20 minutes
23-02-2024 12:20:27 INFO Training over: Total runtime 1.565 minutes
23-02-2024 12:20:27 INFO Analyse the best models and plotting them
23-02-2024 12:20:27 INFO Analyse the top 1 model: catboost
23-02-2024 12:24:10 INFO Analyse the top 2 model: br
23-02-2024 12:24:13 INFO Analyse the top 3 model: rf
23-02-2024 12:24:27 INFO --------Stacking the best models--------
23-02-2024 12:24:27 INFO ----------Stacking the best models--------------
23-02-2024 12:27:58 INFO --------Creating an ensemble m