# How to finetune HuggingFace models on text data of any size and format with custom splitting (not random)

A way to handle text data of any size and format with a custom (non-random) split, because random splitting is not recommended for protein sequences.

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
import torch
import numpy as np
from Bio import SeqIO
from datasets import Dataset, DatasetDict
from BioML.utilities import split_methods
from BioML.deep.embeddings import LLMConfig, TokenizeFasta
from BioML.deep.utils import set_seed
from peft import get_peft_model, LoraConfig
## https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb
## https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb

## Load data

The target column must be named `labels` so that `Trainer` can recognize it.
`Dataset` can be used for any use case with large files; it does not depend on transformers.  
You would, however, need a PyTorch `DataLoader` to turn it into batches (open question: it returns input ids and attention masks, but will it also return the labels?).

In [2]:
def fasta_generator(fasta_file: str="../data/whole_sequence.fasta"):
    """Yield one ``{"id", "seq"}`` dictionary per record of a FASTA file.

    Args:
        fasta_file: Path of the FASTA file to read.

    Yields:
        dict: ``"id"`` is the record identifier, ``"seq"`` the sequence converted to ``str``.
    """
    with open(fasta_file, 'r') as handle:
        # The file stays open while records are consumed, so nothing has to be
        # collected into a list first.
        yield from (
            {"id": record.id, "seq": str(record.seq)}
            for record in SeqIO.parse(handle, 'fasta')
        )

# Build the dataset by streaming records out of fasta_generator.
b = Dataset.from_generator(fasta_generator, gen_kwargs={"fasta_file":"../data/whole_sequence.fasta"})
# Placeholder binary labels for the demo. A seeded generator keeps them identical
# across "Restart & Run All", so the stratified split and the metrics are reproducible.
rng = np.random.default_rng(42)
y = rng.integers(0, 2, size=len(b))
# The target column has to be called "labels" so that Trainer picks it up.
dataset = b.add_column("labels", y)

In [28]:
# Tokenize the same FASTA file and attach `y` as an extra "labels" column.
fasta_path = "../data/whole_sequence.fasta"
label_column = ["labels", y]
tok = TokenizeFasta()
tokens = tok.tokenize(fasta_path, (label_column,))

## Custom splitting with indices

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Build the splitter from the cluster table (presumably a clustering result TSV — TODO confirm format).
cluster = split_methods.ClusterSpliter("../data/resultsDB_clu.tsv")
# Split row indices of `dataset`, passing each sequence id as its group.
train, test = cluster.train_test_split(range(len(dataset)), groups=dataset["id"])

In [None]:
# NOTE(review): this stratified *random* split overwrites the cluster-based `train`/`test`
# from the previous cell; run only one of the two cells.
train, test = train_test_split(range(len(dataset)), stratify=dataset["labels"], test_size=0.2,
                               random_state=42) # random splitting, seeded so re-runs give the same split

In [None]:
# Turn the two index lists into a train/test DatasetDict.
new = DatasetDict({"train":dataset.select(train), "test":dataset.select(test)})

In [None]:
# Second split, on the training subset only, to carve out validation indices.
# The resulting indices are relative to new["train"], not to the full `dataset`.
train_, validation = cluster.train_test_split(range(len(new["train"])), groups=new["train"]["id"])

In [None]:
# `train_`/`validation` index into new["train"], while `test` still indexes the full `dataset`.
new_2 = DatasetDict({"train":new["train"].select(train_), "test":dataset.select(test), "validation": new["train"].select(validation)})

## Load the protein language model

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): this override discards the detection above and forces CPU. The TrainingArguments
# cell compares `device == "cpu"`, so it expects a plain string here — confirm before removing.
device="cpu"

In [None]:
def model_init(): # 0 or 1 parameters ( the trial hyperparameters)
    """Return a freshly loaded two-label sequence classifier from the ESM-2 checkpoint."""
    checkpoint = "facebook/esm2_t6_8M_UR50D"
    # torch_dtype=torch.bfloat16 to load in bfloat16 which is accepted by CPUs unlike float16
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    return classifier

def model_init2(): # 0 or 1 parameters ( the trial hyperparameters)
    """Return a freshly loaded two-label classifier, loaded with ``low_cpu_mem_usage=True``.

    ``num_labels=2`` is passed explicitly so this factory agrees with ``model_init`` and
    with the module-level ``model`` instead of relying on the library default.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        "facebook/esm2_t6_8M_UR50D", num_labels=2, low_cpu_mem_usage=True
    )

# Module-level model and tokenizer, both loaded from the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels=2, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def _tokenize_batch(examples):
    """Tokenize the "seq" column of one batch, padding within the batch."""
    return tokenizer(examples["seq"], return_tensors="np",padding=True, truncation=True)

# Same tokenization for both splits; each split is replaced by its tokenized version.
for split_name in ("train", "test"):
    new[split_name] = new[split_name].map(_tokenize_batch, batched=True)

Map:   0%|          | 0/117 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

### Create the training arguments

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

In [None]:
lr = 8e-5  # learning rate passed to TrainingArguments
bs = 1  # per-device training batch size; evaluation uses bs*2
epochs = 4  # number of training epochs

Set `use_cpu` to False when you want to use GPUs (they will then be used automatically); when `fp16` is True it will only use GPUs.

In [None]:
# `device` is compared against the literal "cpu": half precision is enabled only
# when not on CPU, and use_cpu is set only when on CPU.
on_cpu = bool(device == "cpu")
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.2,
    lr_scheduler_type='cosine',
    fp16=not on_cpu,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to=['mlflow'],
    load_best_model_at_end=True,
    metric_for_best_model="matthews_correlation",
    save_total_limit=2,
    save_strategy="epoch",
    seed=3242342,
    gradient_accumulation_steps=4,
    use_cpu=on_cpu,
)

## The warmup ratio together with the cosine learning rate scheduler approximates a one-cycle learning rate schedule
## weight_decay is the weight decay for Adam (AdamW) -> this is what fast.ai does
## fp16 is half precision -> mixed-precision training (using fp32 and fp16)
## save_total_limit is set to 2 -> so only 2 checkpoints will be kept
## save_strategy="epoch" -> a checkpoint is saved at the end of every epoch (the default would be every 500 steps)
## report_to=['mlflow'] -> the report is saved to MLflow
# How to evaluate mlflow?
# LR finder does not give reliable results for Transformers models https://github.com/huggingface/transformers/issues/16013



## Train the model using several evaluation metrics

In [None]:
import evaluate
import mlflow

ModuleNotFoundError: No module named 'evaluate'

You can use your own function as an evaluation metric; it then has to return a dict.  
Or you can use the `evaluate` library from Hugging Face to load different metrics: [evaluate](https://huggingface.co/docs/evaluate/a_quick_tour)


In [None]:
_CLASSIFICATION_METRIC_NAMES = ("accuracy", "f1", "matthews_correlation", "precision", "recall")
# Metric objects are loaded once and reused: the function runs at every evaluation
# step, and reloading five metrics each time is pure overhead.
_classification_metric_cache = {}


def compute_classification_metrics(eval_pred):
    """Compute classification metrics from the ``(logits, labels)`` pair given by Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``. For evaluation Trainer only hands over
            logits and labels (not loss, attentions or hidden states).

    Returns:
        dict: One entry per metric name in ``_CLASSIFICATION_METRIC_NAMES``.
    """
    logits, labels = eval_pred
    # The model outputs logits; the predicted class is the index of the largest one.
    predictions = np.argmax(logits, axis=-1)

    results = {}
    for name in _CLASSIFICATION_METRIC_NAMES:
        if name not in _classification_metric_cache:
            _classification_metric_cache[name] = evaluate.load(name)
        scorer = _classification_metric_cache[name]
        results[name] = scorer.compute(predictions=predictions, references=labels)[name]
    return results

def compute_regression_metrics(eval_pred):
    """Compute regression metrics from the ``(logits, labels)`` pair given by Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``; for regression the logits are the predictions.

    Returns:
        dict: ``"mse"``, ``"mae"``, ``"r2"`` and ``"rmse"``.
    """
    logits, labels = eval_pred
    predictions = np.asarray(logits)
    # A single-output regression head presumably yields shape (n, 1) while labels are (n,);
    # drop the trailing axis so both line up.
    if predictions.ndim > 1 and predictions.shape[-1] == 1:
        predictions = predictions.squeeze(-1)

    results = {}
    for metric in ("mse", "mae"):
        scorer = evaluate.load(metric)
        results[metric] = scorer.compute(predictions=predictions, references=labels)[metric]
    results["r2"] = evaluate.load("r_squared").compute(predictions=predictions, references=labels)
    # Derive RMSE from the MSE already computed instead of a second compute call with
    # `squared=False`, a keyword that recent scikit-learn releases no longer accept.
    results["rmse"] = float(np.sqrt(results["mse"]))
    return results

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose classification loss is computed explicitly, as a hook for custom losses."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: Called as ``model(**inputs)``; its output must expose ``.logits``.
            inputs: Batch mapping with a ``"labels"`` entry, which is removed before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
            num_items_in_batch: Accepted so that Trainer versions which pass this
                argument keep working; it is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Compute the loss directly: calling self.compute_loss(logits, labels) here would
        # re-enter this very method with the wrong arguments.
        loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Trainer around the pre-built `model`, with early stopping (patience of 2).
trainer = Trainer(model, args, train_dataset=new['train'], eval_dataset=new['test'], # we need to pass tokenized datasets
                  tokenizer=tokenizer, compute_metrics=compute_classification_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=2)])

# NOTE(review): DataLoaderConfiguration is not imported in any visible cell (presumably it is
# accelerate.utils.DataLoaderConfiguration), and `dataloader_config` is not used afterwards —
# confirm the import or remove this line.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Tuning the hyperparameters learning rate and batch_size

In [None]:
def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from an Optuna trial.

    Args:
        trial: Object providing ``suggest_float`` and ``suggest_categorical``.

    Returns:
        dict: Sampled ``"learning_rate"`` (log scale, 1e-6 to 1e-2) and
        ``"gradient_accumulation_steps"`` (one of 2, 4, 8, 16).
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-2, log=True)
    accumulation = trial.suggest_categorical("gradient_accumulation_steps", [2, 4, 8, 16])
    return {"learning_rate": learning_rate, "gradient_accumulation_steps": accumulation}


def compute_classification_objective(metrics: dict[str, float]) -> tuple[float, float]:
    """Pick the two search objectives for classification: evaluation loss and MCC."""
    loss = metrics["eval_loss"]
    mcc = metrics["eval_matthews_correlation"]
    return loss, mcc

def compute_regression_objective(metrics: dict[str, float]) -> tuple[float, float]:
    """Pick the two search objectives for regression: evaluation loss and R2."""
    loss = metrics["eval_loss"]
    r2 = metrics["eval_r2"]
    return loss, r2

In [None]:
# No model instance is passed here: Trainer builds one through model_init2, so that
# each hyperparameter-search trial can start from a freshly initialised model.
trainer = Trainer(None, args, model_init=model_init2, train_dataset=new['train'], eval_dataset=new['test'], # we need to pass tokenized datasets
                  tokenizer=tokenizer, 
                  compute_metrics=compute_classification_metrics, 
                  callbacks=[EarlyStoppingCallback(early_stopping_patience=2)])

RuntimeError: model_init should have 0 or 1 argument.

In [None]:
# Close any MLflow run that is still active before the search starts.
mlflow.end_run()

# Two-objective Optuna search. The directions follow the order of the tuple returned by
# compute_classification_objective: minimise eval_loss, maximise MCC.
# The study is stored in a local SQLite file and reused if it already exists.
best_trials = trainer.hyperparameter_search(
	direction=["minimize", "maximize"],
	backend="optuna",
	hp_space=optuna_hp_space,
	n_trials=1,
	compute_objective=compute_classification_objective,
    storage='sqlite:///my_optuna_studies.db',
    load_if_exists=True
)


[I 2024-05-05 18:27:53,211] A new study created in RDB with name: no-name-305c8b7d-b552-4cd6-b1d9-2e327a424486
Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.6917587518692017, 'eval_accuracy': 0.5666666666666667, 'eval_f1': 0.0, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 18.5898, 'eval_samples_per_second': 1.614, 'eval_steps_per_second': 0.807, 'epoch': 0.99}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7241240739822388, 'eval_accuracy': 0.5, 'eval_f1': 0.4444444444444444, 'eval_matthews_correlation': -0.008988968316207744, 'eval_precision': 0.42857142857142855, 'eval_recall': 0.46153846153846156, 'eval_runtime': 17.8967, 'eval_samples_per_second': 1.676, 'eval_steps_per_second': 0.838, 'epoch': 1.98}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7805582880973816, 'eval_accuracy': 0.5, 'eval_f1': 0.4444444444444444, 'eval_matthews_correlation': -0.008988968316207744, 'eval_precision': 0.42857142857142855, 'eval_recall': 0.46153846153846156, 'eval_runtime': 18.3333, 'eval_samples_per_second': 1.636, 'eval_steps_per_second': 0.818, 'epoch': 2.97}


[I 2024-05-05 18:36:15,996] Trial 0 finished with values: [0.7805582880973816, -0.008988968316207744] and parameters: {'learning_rate': 7.813286994811102e-05, 'gradient_accumulation_steps': 4}. 


{'train_runtime': 501.8248, 'train_samples_per_second': 0.933, 'train_steps_per_second': 0.231, 'train_loss': 0.6711522244859016, 'epoch': 2.97}


In [None]:
# Copy the hyperparameters of the first returned trial onto the Trainer arguments.
best_hp = best_trials[0].hyperparameters
trainer.args.learning_rate = best_hp["learning_rate"]
trainer.args.gradient_accumulation_steps = best_hp["gradient_accumulation_steps"]

In [None]:
# Train with the arguments currently stored on `trainer.args`.
trainer.train()

  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7221236824989319, 'eval_accuracy': 0.5333333333333333, 'eval_f1': 0.6956521739130436, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.5333333333333333, 'eval_recall': 1.0, 'eval_runtime': 31.508, 'eval_samples_per_second': 0.952, 'eval_steps_per_second': 0.476, 'epoch': 0.99}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7303746938705444, 'eval_accuracy': 0.5333333333333333, 'eval_f1': 0.6956521739130436, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.5333333333333333, 'eval_recall': 1.0, 'eval_runtime': 31.2142, 'eval_samples_per_second': 0.961, 'eval_steps_per_second': 0.481, 'epoch': 1.98}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7445932030677795, 'eval_accuracy': 0.5333333333333333, 'eval_f1': 0.6956521739130436, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.5333333333333333, 'eval_recall': 1.0, 'eval_runtime': 31.2265, 'eval_samples_per_second': 0.961, 'eval_steps_per_second': 0.48, 'epoch': 2.97}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.753971517086029, 'eval_accuracy': 0.6, 'eval_f1': 0.7272727272727273, 'eval_matthews_correlation': 0.2857142857142857, 'eval_precision': 0.5714285714285714, 'eval_recall': 1.0, 'eval_runtime': 31.5365, 'eval_samples_per_second': 0.951, 'eval_steps_per_second': 0.476, 'epoch': 3.97}
{'train_runtime': 618.7594, 'train_samples_per_second': 0.756, 'train_steps_per_second': 0.187, 'train_loss': 0.6006371070598734, 'epoch': 3.97}


TrainOutput(global_step=116, training_loss=0.6006371070598734, metrics={'train_runtime': 618.7594, 'train_samples_per_second': 0.756, 'train_steps_per_second': 0.187, 'train_loss': 0.6006371070598734, 'epoch': 3.97})

We make sure that we get the same results by evaluating the results once more

In [None]:
# Re-run evaluation on the evaluation set and show the resulting metric dict.
metrics = trainer.evaluate()
print(metrics)

## Search for hyperparameters like the learning rate which is the most important

Well, it is actually batch size and learning rate -> smaller batch sizes tend to work better than large batch sizes -> but the learning rate is affected by the batch size as well -> a higher batch size needs a higher learning rate.

Fix everything else and tune the learning rate -> the learning rate finder doesn't seem to work very well for transformers?  
But the idea of a learning rate finder is just to test different learning rates -> so can I not test them myself?

ktrain: a wrapper for many tasks that has a learning rate finder: [ktrain](https://github.com/amaiya/ktrain)

Perhaps use PyTorch Lightning: [pytorch_lightning_huggingface](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb)

In [None]:
#model = AutoModel.from_pretrained("bigscience/T0pp", device_map="auto")

config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/44.5G [00:00<?, ?B/s]

KeyboardInterrupt: 

## Parameter efficient fine tuning

In [None]:
from dataclasses import dataclass
from torch.optim import AdamW, Optimizer
from torch.optim.lr_scheduler import OneCycleLR
from lightning import LightningModule, LightningDataModule
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.loggers import MLFlowLogger
from lightning import Trainer
from torchmetrics.functional.classification import (
    accuracy,
    f1_score,
    precision,
    recall,
    auroc,
    average_precision,
    cohen_kappa,
    confusion_matrix,
    matthews_corrcoef
) 

from torchmetrics.functional.regression import (
    mean_absolute_error,
    mean_squared_error,
    pearson_corrcoef,
    kendall_rank_corrcoef,
    r2_score,
    mean_absolute_percentage_error,
    mean_squared_log_error)

from torch.utils.data import DataLoader
from datasets import Dataset
from BioML.deep.train_config import LLMConfig
from BioML.utilities import split_methods as split

In [None]:
def calculate_classification_metrics(split: str, loss: torch.Tensor, preds: torch.Tensor, 
                                     target: torch.Tensor, num_classes: int=2, threshold: float=0.5):
    """Compute the classification metrics for one data split.

    Args:
        split: Prefix for every key of the result (for example ``"Val"`` gives ``"Val_MCC"``).
        loss: Loss value, stored unchanged under ``"{split}_Loss"``.
        preds: Predictions handed to every torchmetrics function.
        target: Ground-truth labels handed to every torchmetrics function.
        num_classes: Number of classes; 2 selects the ``"binary"`` task, anything else ``"multiclass"``.
        threshold: Decision threshold. It is forwarded to every metric that accepts one, so
            F1, precision and recall use the same cut-off as accuracy, MCC, the confusion
            matrix and Cohen's kappa.

    Returns:
        dict: Metric name (prefixed with ``split``) mapped to its value.
    """
    task = "binary" if num_classes == 2 else "multiclass"
    # Arguments shared by every metric call.
    shared = {"preds": preds, "target": target, "task": task, "num_classes": num_classes}
    metrics = {
        f"{split}_Loss": loss,
        f"{split}_Acc": accuracy(**shared, threshold=threshold, average="weighted"),
        f"{split}_F1": f1_score(**shared, threshold=threshold, average="weighted"),
        f"{split}_Precision": precision(**shared, threshold=threshold, average="weighted"),
        f"{split}_Recall": recall(**shared, threshold=threshold, average="weighted"),
        f"{split}_MCC": matthews_corrcoef(**shared, threshold=threshold),
        f"{split}_Confusion_Matrix": confusion_matrix(**shared, threshold=threshold, normalize="true"),
        # AUROC and average precision are threshold-free; `thresholds=None` is kept as before.
        f"{split}_AUROC": auroc(**shared, thresholds=None, average="weighted"),
        f"{split}_Average_Precision": average_precision(**shared, average="weighted"),
        f"{split}_Cohen_Kappa": cohen_kappa(**shared, threshold=threshold),
    }
    return metrics

def calculate_regression_metrics(split: str, loss: torch.tensor, preds: torch.tensor, 
                                 target: torch.tensor):
    """Compute the regression metrics for one data split.

    Args:
        split: Prefix for every key of the result (for example ``"Val"`` gives ``"Val_R2"``).
        loss: Loss value, stored unchanged under ``"{split}_Loss"``.
        preds: Predictions, passed first to every metric function.
        target: Ground-truth values, passed second to every metric function.

    Returns:
        dict: Metric name (prefixed with ``split``) mapped to its value.
    """
    # (key suffix, metric function, extra keyword arguments), in output order.
    scorers = (
        ("MAE", mean_absolute_error, {}),
        ("MSE", mean_squared_error, {}),
        ("RMSE", mean_squared_error, {"squared": False}),
        ("R2", r2_score, {}),
        ("Pearson", pearson_corrcoef, {}),
        ("Kendall", kendall_rank_corrcoef, {}),
        ("MAPE", mean_absolute_percentage_error, {}),
        ("MSLE", mean_squared_log_error, {}),
    )
    metrics = {f"{split}_Loss": loss}
    for suffix, scorer, extra in scorers:
        metrics[f"{split}_{suffix}"] = scorer(preds, target, **extra)
    return metrics


In [None]:
# Candidate rank values (presumably LoRA `r` for a sweep — not used in the visible cells; confirm).
r_values = (8, 16, 32, 64, 128, 256)

In [None]:
from functools import partial

def test(x, y):
    return x + y
class Test:
    """Callable wrapper around ``test`` with ``y`` pre-bound to 2 via ``partial``."""

    def __init__(self):
        adder = test
        self.x = adder
        # Freeze the keyword argument once, at construction time.
        self.tes_ = partial(adder, y=2)

    def __call__(self, x):
        bound = self.tes_
        return bound(x)

# Calling the instance forwards to the partial: test(3, y=2).
t = Test()
t(3)

5

In [None]:
# Reload the base model and tokenizer; this rebinds `model` and `tokenizer` defined earlier.
model = AutoModelForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels=2, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def get_target_module_names_for_peft(model, filter_="key"):
    """List the names of the model's modules, optionally filtered by name component.

    A module is kept when at least one dot-separated component of its name is equal
    to one of the entries of ``filter_`` (whole-component match, not substring).

    Args:
        model: Object exposing ``named_modules()`` that yields ``(name, module)`` pairs.
        filter_: A single component name, an iterable of component names, or a falsy
            value (``None``, empty list) to keep every module.

    Returns:
        list[str]: Matching module names, in ``named_modules()`` order.
    """
    if isinstance(filter_, str):
        filter_ = [filter_] # if it is a string, convert it to a list
    # Build the lookup set once instead of once per module.
    wanted = set(filter_) if filter_ else set()
    return [
        name
        for name, _ in model.named_modules()
        if not wanted or wanted.intersection(name.split("."))
    ]

# NOTE(review): the saved output lists `base_model.model...lora_*` names, i.e. it was produced
# after `model` had been wrapped by get_peft_model in a later cell; a fresh top-to-bottom run
# will print different names.
names = get_target_module_names_for_peft(model, filter_="output")
names

['base_model.model.esm.encoder.layer.0.attention.output',
 'base_model.model.esm.encoder.layer.0.attention.output.dense',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.base_layer',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_dropout',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_dropout.default',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_A',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_A.default',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_B',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_B.default',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_embedding_A',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_embedding_B',
 'base_model.model.esm.encoder.layer.0.attention.output.dropout',
 'base_model.model.esm.encoder.layer.0.output',
 'base_model.model.esm.encoder.layer.0.output.dense',
 'base_mode

In [None]:
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training, replace_lora_weights_loftq

In [None]:
# LoRA configuration for training (inference_mode=False): rank 8, alpha 16, dropout 0.1,
# with the "all-linear" target-module setting.
peft_config = LoraConfig(inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1, 
                         target_modules="all-linear")

In [None]:
# NOTE(review): rebinding `model` makes this cell non-idempotent — re-running it passes the
# already wrapped model to get_peft_model again.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 285,144 || all params: 8,125,907 || trainable%: 3.509072894878073


## Train model

In [1]:
import pandas as pd
from BioML.deep import finetuning as ft
from BioML.deep.utils import load_adapter
from datasets import Dataset
from Bio import SeqIO
from safetensors import SafetensorError
import torch
from peft import replace_lora_weights_loftq, AutoPeftModelForSequenceClassification, AutoPeftModel
from lightning.pytorch.tuner import Tuner
from transformers import AutoTokenizer, AutoModel
import numpy as np

In [2]:
def fasta_generator(fasta_file: str="whole_sequence.fasta"):
    """Yield one ``{"id", "seq"}`` dictionary per record of a FASTA file.

    Args:
        fasta_file: Path of the FASTA file to read.

    Yields:
        dict: ``"id"`` is the record identifier, ``"seq"`` the sequence converted to ``str``.
    """
    with open(fasta_file, 'r') as handle:
        records = SeqIO.parse(handle, 'fasta')
        # Records are produced while the file is still open.
        yield from ({"id": entry.id, "seq": str(entry.seq)} for entry in records)


In [3]:
# Input files.
fasta_file = "../data/whole_sequence.fasta"
label = "../data/esterase_labels.csv"
lab = pd.read_csv(label, index_col=0)

# Configuration objects from BioML's finetuning module.
split_config = ft.SplitConfig()
llm_config = ft.LLMConfig()
train_config = ft.TrainConfig(2, batch_size=2, max_epochs=1, lora_rank=16, optimize="Val_MCC")

# Dummy regression targets 0.0, 1.0, ... (one float per row of `lab`).
label_regre = np.arange(len(lab), dtype=float)

In [4]:
# Override the target modules on the config (presumably the layers that receive adapters — confirm).
train_config.target_modules = ['query', 'key', 'value', 'attention.output.dense']
# Displayed as the cell output.
train_config.objective

'classification'

In [5]:
# Tokenize the FASTA file and attach the flattened label values as a "labels" column.
tokenizer = ft.TokenizeFasta(llm_config)
data = tokenizer.tokenize(fasta_file, add_columns=[("labels", lab.to_numpy().flatten())])

In [6]:
# Data side: splitter settings come from split_config; the DataModule receives the same
# flattened labels that were passed to the tokenizer.
splitter = ft.PrepareSplit(split_config.cluster_file, split_config.shuffle, split_config.random_seed, 
                            split_config.splitting_strategy, 
                            split_config.num_split, False)
data_module = ft.DataModule(splitter, fasta_file, lab.to_numpy().flatten(), llm_config, train_config.batch_size)
# Model side: PEFT-wrapped model (third argument "pissa") inside the Lightning module.
# NOTE(review): the variable name `peft` is the same as the `peft` package name.
peft = ft.PreparePEFT(train_config, llm_config, "pissa")
model = peft.prepare_model()
light_mod = ft.TransformerModule(model, train_config, lr=1e-3)

# The doubled braces survive the f-string, so the template becomes e.g. "{epoch}-{Val_MCC:.2f}".
filename = f"{{epoch}}-{{{train_config.optimize}:.2f}}"
# Checkpointing and early stopping both monitor the metric named in train_config.optimize.
checkpoint_callback = ft.ModelCheckpoint(filename=filename, monitor=train_config.optimize, 
                                              mode=train_config.optimize_mode, verbose=True, save_top_k=1)
early_callback = ft.EarlyStopping(monitor=train_config.optimize, min_delta=train_config.min_delta, 
                                       patience=train_config.patience, verbose=True, mode=train_config.optimize_mode)


Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 356,802 || all params: 8,197,565 || trainable%: 4.3525


In [7]:
# Second PEFT model, built without the third ("pissa") argument, for comparison.
peft = ft.PreparePEFT(train_config, llm_config)
model2 = peft.prepare_model()

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 233,922 || all params: 8,074,685 || trainable%: 2.8970


In [25]:
# Print name and values of every parameter of `model` whose name contains "lora".
for name, params in model.named_parameters():
    if "lora" in name:
        print(name, params)

base_model.model.esm.encoder.layer.0.attention.self.query.lora_A.default.weight Parameter containing:
tensor([[ 1.7043e-01, -8.2647e-01,  2.4838e-02,  ...,  4.7189e-02,
         -8.3786e-02,  2.5851e-02],
        [-3.5719e-01,  2.8952e-02,  8.5006e-02,  ...,  5.7942e-02,
          2.0922e-01, -2.6888e-01],
        [ 7.5063e-02,  1.2485e-01,  6.2840e-02,  ..., -5.7912e-02,
          8.7211e-02,  5.9190e-02],
        ...,
        [ 6.2899e-03,  2.6713e-02, -3.9605e-02,  ...,  1.0640e-01,
          7.5123e-04, -5.9767e-02],
        [ 4.4162e-02,  4.6904e-03,  8.3011e-02,  ...,  1.4559e-01,
          2.7426e-02, -3.2084e-02],
        [ 1.7775e-04, -1.3213e-02, -1.0379e-02,  ..., -9.3273e-02,
          1.4039e-01, -3.1797e-02]], device='cuda:0', requires_grad=True)
base_model.model.esm.encoder.layer.0.attention.self.query.lora_B.default.weight Parameter containing:
tensor([[ 0.0285, -0.0048, -0.0054,  ...,  0.0129, -0.0562,  0.0302],
        [ 0.1384,  0.0217, -0.0380,  ...,  0.0082, -0.046

In [24]:
# NOTE(review): `a` is not defined in any visible cell, so this fails on a fresh kernel —
# confirm where it came from or remove the cell.
for name, params in a.named_parameters():
    if "lora" in name:
        print(name, params)

base_model.model.esm.encoder.layer.0.attention.self.query.lora_A.initial.weight Parameter containing:
tensor([[ 1.7043e-01, -8.2647e-01,  2.4838e-02,  ...,  4.7189e-02,
         -8.3786e-02,  2.5851e-02],
        [-3.5719e-01,  2.8952e-02,  8.5006e-02,  ...,  5.7942e-02,
          2.0922e-01, -2.6888e-01],
        [ 7.5063e-02,  1.2485e-01,  6.2840e-02,  ..., -5.7912e-02,
          8.7211e-02,  5.9190e-02],
        ...,
        [ 6.2899e-03,  2.6713e-02, -3.9605e-02,  ...,  1.0640e-01,
          7.5123e-04, -5.9767e-02],
        [ 4.4162e-02,  4.6904e-03,  8.3011e-02,  ...,  1.4559e-01,
          2.7426e-02, -3.2084e-02],
        [ 1.7775e-04, -1.3213e-02, -1.0379e-02,  ..., -9.3273e-02,
          1.4039e-01, -3.1797e-02]], device='cuda:0', requires_grad=True)
base_model.model.esm.encoder.layer.0.attention.self.query.lora_B.initial.weight Parameter containing:
tensor([[ 0.0285, -0.0048, -0.0054,  ...,  0.0129, -0.0562,  0.0302],
        [ 0.1384,  0.0217, -0.0380,  ...,  0.0082, -0.046

In [10]:
# Same dump for the model held by the Lightning module.
for name, params in light_mod.model.named_parameters():
    if "lora" in name:
        print(name, params)

base_model.model.esm.encoder.layer.0.attention.self.query.lora_A.default.weight Parameter containing:
tensor([[ 0.1699, -0.8279,  0.0260,  ...,  0.0469, -0.0843,  0.0254],
        [-0.3562,  0.0308,  0.0865,  ...,  0.0591,  0.2071, -0.2667],
        [ 0.0750,  0.1241,  0.0656,  ..., -0.0567,  0.0863,  0.0597],
        ...,
        [-0.0034, -0.1483, -0.1708,  ..., -0.0852, -0.1563, -0.0351],
        [ 0.0784, -0.0335,  0.0649,  ...,  0.0123, -0.1528, -0.1552],
        [ 0.0145,  0.0106, -0.0876,  ...,  0.0228,  0.0360, -0.0452]],
       device='cuda:0', requires_grad=True)
base_model.model.esm.encoder.layer.0.attention.self.query.lora_B.default.weight Parameter containing:
tensor([[ 0.0273, -0.0062, -0.0049,  ...,  0.0532, -0.0117,  0.0456],
        [ 0.1398,  0.0231, -0.0385,  ...,  0.0212,  0.0046,  0.0588],
        [ 0.1207,  0.0045,  0.0326,  ...,  0.0863,  0.0458, -0.0689],
        ...,
        [ 0.0235, -0.0163,  0.0695,  ...,  0.0183,  0.0525, -0.0219],
        [-0.0018,  0.0108

In [None]:
# Names only: every parameter whose name contains "lora" or "classifier".
for name, params in light_mod.model.named_parameters():
    if "lora" in name or "classifier" in name:
        print(name)

base_model.model.esm.encoder.layer.0.attention.self.query.lora_A.default.weight
base_model.model.esm.encoder.layer.0.attention.self.query.lora_B.default.weight
base_model.model.esm.encoder.layer.0.attention.self.query.lora_magnitude_vector.default
base_model.model.esm.encoder.layer.0.attention.self.key.lora_A.default.weight
base_model.model.esm.encoder.layer.0.attention.self.key.lora_B.default.weight
base_model.model.esm.encoder.layer.0.attention.self.key.lora_magnitude_vector.default
base_model.model.esm.encoder.layer.0.attention.self.value.lora_A.default.weight
base_model.model.esm.encoder.layer.0.attention.self.value.lora_B.default.weight
base_model.model.esm.encoder.layer.0.attention.self.value.lora_magnitude_vector.default
base_model.model.esm.encoder.layer.0.attention.output.dense.lora_A.default.weight
base_model.model.esm.encoder.layer.0.attention.output.dense.lora_B.default.weight
base_model.model.esm.encoder.layer.0.attention.output.dense.lora_magnitude_vector.default
base_mod

In [7]:
# NOTE(review): max_epochs is hard-coded to 10 here although train_config was created with
# max_epochs=1 — confirm which value is intended.
trainer = ft.Trainer(callbacks=[checkpoint_callback, early_callback], default_root_dir=train_config.model_checkpoint_dir,
                          fast_dev_run=bool(train_config.debug_mode_sample), max_epochs=10, 
                          max_time=train_config.max_time, precision=train_config.precision,
                          accumulate_grad_batches=train_config.accumulate_grad_batches)

#tuner = Tuner(trainer)
#lr_finder = tuner.lr_find(light_mod, data_module, min_lr=1e-6, max_lr=1, num_training=1000, mode="exponential", early_stop_threshold=4)

trainer.fit(model=light_mod, datamodule=data_module)
# Path of the best checkpoint recorded by the ModelCheckpoint callback during fit.
best_model_path = checkpoint_callback.best_model_path

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/phastos/Programs/mambaforge/envs/bioml/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enab

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/phastos/Programs/mambaforge/envs/bioml/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved. New best score: -0.170
Epoch 0, global step 47: 'Val_MCC' reached -0.17045 (best -0.17045), saving model to 'model_checkpoint/lightning_logs/version_1/checkpoints/epoch=0-Val_MCC=-0.17.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved by 0.439 >= min_delta = 0.005. New best score: 0.269
Epoch 1, global step 94: 'Val_MCC' reached 0.26857 (best 0.26857), saving model to 'model_checkpoint/lightning_logs/version_1/checkpoints/epoch=1-Val_MCC=0.27.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 141: 'Val_MCC' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved by 0.037 >= min_delta = 0.005. New best score: 0.306
Epoch 3, global step 188: 'Val_MCC' reached 0.30551 (best 0.30551), saving model to 'model_checkpoint/lightning_logs/version_1/checkpoints/epoch=3-Val_MCC=0.31.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved by 0.033 >= min_delta = 0.005. New best score: 0.338
Epoch 4, global step 235: 'Val_MCC' reached 0.33806 (best 0.33806), saving model to 'model_checkpoint/lightning_logs/version_1/checkpoints/epoch=4-Val_MCC=0.34.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved by 0.100 >= min_delta = 0.005. New best score: 0.438
Epoch 5, global step 282: 'Val_MCC' reached 0.43819 (best 0.43819), saving model to 'model_checkpoint/lightning_logs/version_1/checkpoints/epoch=5-Val_MCC=0.44.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 329: 'Val_MCC' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved by 0.069 >= min_delta = 0.005. New best score: 0.507
Epoch 7, global step 376: 'Val_MCC' reached 0.50709 (best 0.50709), saving model to 'model_checkpoint/lightning_logs/version_1/checkpoints/epoch=7-Val_MCC=0.51.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 423: 'Val_MCC' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 470: 'Val_MCC' was not in top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


## Use adapters

In [9]:
def load_adapter(peft_model: str, llm_config,
                 use_adapter: str="initial", adapters: dict[str, str] | None=None):
    device = "auto" if llm_config.device == "cuda" else llm_config.device
    model = AutoPeftModel.from_pretrained(peft_model, adapter_name="initial", 
                                                                   low_cpu_mem_usage=True, device_map=device,
                                                                   torch_dtype=llm_config.dtype)                                                                
    if adapters:
        for key, value in adapters.items():
            model.load_adapter(value, adapter_name=key)
    model.set_adapter(use_adapter)
    model.merge_adapter()
    return model

In [10]:
# Restore the module from the best checkpoint path recorded after training,
# passing the in-memory `model` through to load_from_checkpoint.
mod = ft.TransformerModule.load_from_checkpoint(best_model_path, model=model)

In [11]:
# Prepare the data module and take its test dataloader.
data_module.prepare_data()
# NOTE(review): stage "fit" is set up here although the test dataloader is requested
# on the next line -- confirm the data module builds the test split during "fit".
data_module.setup("fit")
inputs = data_module.test_dataloader()

In [12]:
# Show only the first batch; `batch` stays defined after the loop and is the
# input used for the forward passes in the following cells.
for batch in inputs:
    print(batch)
    break

{'input_ids': tensor([[20, 15, 15,  ...,  1,  1,  1],
        [20, 15, 15,  ...,  1,  1,  1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), 'labels': tensor([0, 1], device='cuda:0')}


In [13]:
# Save the model to the local "model" directory; the next cell reloads it from there.
model.save_pretrained("model")



In [14]:
# Reload from the "model" directory with a default LLMConfig and merge the "initial" adapter.
# NOTE(review): loading warns that the classifier.* weights are newly initialized, yet the
# logits below match the in-memory model -- presumably the classifier head is restored from
# the saved adapter files; confirm.
a = load_adapter("model", ft.LLMConfig(), use_adapter="initial")

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Logits of the reloaded, adapter-merged model on the sample batch.
a(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits

tensor([[ 0.6370, -0.5562],
        [ 0.3029, -0.2428]], device='cuda:0', grad_fn=<ToCopyBackward0>)

In [17]:
# Switch the in-memory model to eval mode; as the cell's last expression this also
# echoes the module structure.
model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): EsmForSequenceClassification(
      (esm): EsmModel(
        (embeddings): EsmEmbeddings(
          (word_embeddings): Embedding(33, 320, padding_idx=1)
          (dropout): Dropout(p=0.0, inplace=False)
          (position_embeddings): Embedding(1026, 320, padding_idx=1)
        )
        (encoder): EsmEncoder(
          (layer): ModuleList(
            (0-5): 6 x EsmLayer(
              (attention): EsmAttention(
                (self): EsmSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=320, out_features=320, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=320, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Li

In [18]:
model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits # dropout makes this stochastic unless model.eval() was called

tensor([[ 0.6370, -0.5562],
        [ 0.3029, -0.2428]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [20]:
# Logits of the checkpoint-restored module on the same batch, for comparison with the two results above.
mod(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits

tensor([[ 0.6370, -0.5562],
        [ 0.3029, -0.2428]], device='cuda:0', grad_fn=<AddmmBackward0>)