# How to finetune HuggingFace models on text data of any size and format with custom splitting (not random)

A way to handle text data of any size and format with custom split because random splitting is not recommended for protein sequences.

In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
import torch
import numpy as np
from Bio import SeqIO
from datasets import Dataset, DatasetDict
from BioML.utilities import split_methods
from BioML.deep.embeddings import LLMConfig, TokenizeFasta
from BioML.deep.utils import set_seed
from peft import get_peft_model, LoraConfig
## https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb
## https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb

## Load data

You need to label the target values as labels so Trainer can recognize it.
Dataset can actually be used for any usecases with large   files it doesn't depend on transformers  
Although you would need to use PyTorch Dataloader to transform it into batches (but it only returns inputs ids and attention masks will it also return labels?)

In [9]:
def fasta_generator(fasta_file: str="../data/whole_sequence.fasta"):
    with open(fasta_file, 'r') as f:
        seqs = SeqIO.parse(f, 'fasta')
        for seq in seqs:
            yield {"id":seq.id, "seq":str(seq.seq)}

b = Dataset.from_generator(fasta_generator, gen_kwargs={"fasta_file":"../data/whole_sequence.fasta"})
y = np.random.randint(0, 2, size=len(b))
dataset = b.add_column("labels", y)

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
tok = TokenizeFasta()
tokens = tok.tokenize("../data/whole_sequence.fasta", (["labels", y],))

## Custom spliting with indices

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
cluster = split_methods.ClusterSpliter("../data/resultsDB_clu.tsv")
train, test = cluster.train_test_split(range(len(dataset)), groups=dataset["id"])

In [13]:
train, test = train_test_split(range(len(dataset)), stratify=dataset["labels"], test_size=0.2) # random splitting

In [14]:
new = DatasetDict({"train":dataset.select(train), "test":dataset.select(test)})

In [15]:
train_, validation = cluster.train_test_split(range(len(new["train"])), groups=new["train"]["id"])

In [16]:
new_2 = DatasetDict({"train":new["train"].select(train_), "test":dataset.select(test), "validation": new["train"].select(validation)})

## Load the protein language models model

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device="cpu"

In [18]:
def model_init(): # 0 or 1 parameters ( the trial hyperparameters)
    return AutoModelForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels=2) # torch_dtype=torch.bfloat16 to load in bfloat16 which is accepted by CPUs unlike float16

def model_init2(): # 0 or 1 parameters ( the trial hyperparameters)
	return AutoModelForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", low_cpu_mem_usage=True)

model = AutoModelForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels=2, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
new["train"] = new["train"].map(lambda examples: tokenizer(examples["seq"], return_tensors="np",padding=True, truncation=True), batched=True)
new["test"] = new["test"].map(lambda examples: tokenizer(examples["seq"], return_tensors="np",padding=True, truncation=True), batched=True)

Map:   0%|          | 0/117 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

### Create the training arguments

In [20]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

In [21]:
lr = 8e-5
bs = 1
epochs = 4

Se use cpu to False whe you wan to use GPUs (it will automatically use GPUs), when f16 is True it will only use GPUs.

In [22]:
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.2, lr_scheduler_type='cosine', fp16=False if device=="cpu" else True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to=['mlflow'],
    load_best_model_at_end=True, metric_for_best_model="matthews_correlation", 
    save_total_limit=2, save_strategy="epoch", seed=3242342, gradient_accumulation_steps=4, use_cpu=True if device=="cpu" else False) 

## The warmup step together with cosine learning rate scheduler turns to onecycle learning rate scheduler
## weight decay for the Adam (AdamW) -> this is fast.Ai does
## fp16 is half precision -> mixed training (using fp32 and fp16)
## save_total_limit to 3 -> so only 3 models will be saved
## each 500 steps will be saved a model
## Save the report to mlflow
# How to evaluate mlflow?
# LR finder does not give reliable results for Transformers models https://github.com/huggingface/transformers/issues/16013



## Train the model using several evaluation metrics

In [23]:
import evaluate
import mlflow

ModuleNotFoundError: No module named 'evaluate'

You can use your own function as an evaluation metric -> then you have to retun as an dict  
Or you can use the evaluate library from hugging face to load different functions: [evaluate](https://huggingface.co/docs/evaluate/a_quick_tour)


In [None]:
def compute_classification_metrics(eval_pred):
    metrics = ["accuracy", "f1", "matthews_correlation", "precision", "recall"]
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    loaded = {metric:evaluate.load(metric) for metric in metrics}
    results = {metric: loaded[metric].compute(predictions=predictions, references=labels)[metric] 
               for metric in metrics}

    # the predictions from the models are logits (it also returns the labels, 
    # it also returns loss, attentions and hidden state but that is the classification model, for evalaution Trainer will only 
    # return logits and labels)
    return results

def compute_regression_metrics(eval_pred):
	metrics = ["mse", "mae"]
	logits, labels = eval_pred
	predictions = logits
	loaded = {metric:evaluate.load(metric) for metric in metrics}
	results = {metric: loaded[metric].compute(predictions=predictions, references=labels)[metric] 
			   for metric in metrics}
	results["r2"] = evaluate.load("r_squared").compute(predictions=predictions, references=labels)
	results["rmse"] = loaded["mse"].compute(predictions=predictions, references=labels, squared=False)["mse"]
	return results

In [None]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        print(inputs)
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.compute_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
trainer = Trainer(model, args, train_dataset=new['train'], eval_dataset=new['test'], # we need to pass tokenized datasets
                  tokenizer=tokenizer, compute_metrics=compute_classification_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=2)])

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Tuning the hyperparameters learning rate and batch_size

In [None]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-2, log=True),
        "gradient_accumulation_steps": trial.suggest_categorical("gradient_accumulation_steps", [2, 4, 8, 16]),
    }


def compute_classification_objective(metrics: dict[str, float]) -> tuple[float, float]:
	return metrics["eval_loss"], metrics["eval_matthews_correlation"]

def compute_regression_objective(metrics: dict[str, float]) -> tuple[float, float]:
	return metrics["eval_loss"], metrics["eval_r2"]

In [None]:
trainer = Trainer(None, args, model_init=model_init2, train_dataset=new['train'], eval_dataset=new['test'], # we need to pass tokenized datasets
                  tokenizer=tokenizer, 
                  compute_metrics=compute_classification_metrics, 
                  callbacks=[EarlyStoppingCallback(early_stopping_patience=2)])

RuntimeError: model_init should have 0 or 1 argument.

In [None]:
mlflow.end_run()

best_trials = trainer.hyperparameter_search(
	direction=["minimize", "maximize"],
	backend="optuna",
	hp_space=optuna_hp_space,
	n_trials=1,
	compute_objective=compute_classification_objective,
    storage='sqlite:///my_optuna_studies.db',
    load_if_exists=True
)


[I 2024-05-05 18:27:53,211] A new study created in RDB with name: no-name-305c8b7d-b552-4cd6-b1d9-2e327a424486
Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.6917587518692017, 'eval_accuracy': 0.5666666666666667, 'eval_f1': 0.0, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 18.5898, 'eval_samples_per_second': 1.614, 'eval_steps_per_second': 0.807, 'epoch': 0.99}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7241240739822388, 'eval_accuracy': 0.5, 'eval_f1': 0.4444444444444444, 'eval_matthews_correlation': -0.008988968316207744, 'eval_precision': 0.42857142857142855, 'eval_recall': 0.46153846153846156, 'eval_runtime': 17.8967, 'eval_samples_per_second': 1.676, 'eval_steps_per_second': 0.838, 'epoch': 1.98}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7805582880973816, 'eval_accuracy': 0.5, 'eval_f1': 0.4444444444444444, 'eval_matthews_correlation': -0.008988968316207744, 'eval_precision': 0.42857142857142855, 'eval_recall': 0.46153846153846156, 'eval_runtime': 18.3333, 'eval_samples_per_second': 1.636, 'eval_steps_per_second': 0.818, 'epoch': 2.97}


[I 2024-05-05 18:36:15,996] Trial 0 finished with values: [0.7805582880973816, -0.008988968316207744] and parameters: {'learning_rate': 7.813286994811102e-05, 'gradient_accumulation_steps': 4}. 


{'train_runtime': 501.8248, 'train_samples_per_second': 0.933, 'train_steps_per_second': 0.231, 'train_loss': 0.6711522244859016, 'epoch': 2.97}


In [None]:
trainer.args.learning_rate = best_trials[0].hyperparameters["learning_rate"]
trainer.args.gradient_accumulation_steps = best_trials[0].hyperparameters["gradient_accumulation_steps"]

In [None]:
trainer.train()

  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7221236824989319, 'eval_accuracy': 0.5333333333333333, 'eval_f1': 0.6956521739130436, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.5333333333333333, 'eval_recall': 1.0, 'eval_runtime': 31.508, 'eval_samples_per_second': 0.952, 'eval_steps_per_second': 0.476, 'epoch': 0.99}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7303746938705444, 'eval_accuracy': 0.5333333333333333, 'eval_f1': 0.6956521739130436, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.5333333333333333, 'eval_recall': 1.0, 'eval_runtime': 31.2142, 'eval_samples_per_second': 0.961, 'eval_steps_per_second': 0.481, 'epoch': 1.98}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7445932030677795, 'eval_accuracy': 0.5333333333333333, 'eval_f1': 0.6956521739130436, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.5333333333333333, 'eval_recall': 1.0, 'eval_runtime': 31.2265, 'eval_samples_per_second': 0.961, 'eval_steps_per_second': 0.48, 'epoch': 2.97}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.753971517086029, 'eval_accuracy': 0.6, 'eval_f1': 0.7272727272727273, 'eval_matthews_correlation': 0.2857142857142857, 'eval_precision': 0.5714285714285714, 'eval_recall': 1.0, 'eval_runtime': 31.5365, 'eval_samples_per_second': 0.951, 'eval_steps_per_second': 0.476, 'epoch': 3.97}
{'train_runtime': 618.7594, 'train_samples_per_second': 0.756, 'train_steps_per_second': 0.187, 'train_loss': 0.6006371070598734, 'epoch': 3.97}


TrainOutput(global_step=116, training_loss=0.6006371070598734, metrics={'train_runtime': 618.7594, 'train_samples_per_second': 0.756, 'train_steps_per_second': 0.187, 'train_loss': 0.6006371070598734, 'epoch': 3.97})

We make sure that we get the same results by evaluating the results once more

In [None]:
metrics = trainer.evaluate()
print(metrics)

## Search for hyperparameters like the learning rate which is the most important

Well it is actually batch size and learning rate -> smaller batch sizes tend to work better than large batch sizes -> but learning rate is affected by batch as well -> higher abtch need higher learning rate.

Fix everything else and tune the learning rate -> learning rate finder doesn'0t seem to work very well for transformers?  
But teh idea of learning rate finder is just test different learning rates -> so I cannot test them?

Ktrains: A wrapper to do many tasks and has a learning rate finder: [ktrains](https://github.com/amaiya/ktrain)

Use pytorch lightning perhaps: [pytorch_lighningt_huggingface](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb)

In [None]:
#model = AutoModel.from_pretrained("bigscience/T0pp", device_map="auto")

config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/44.5G [00:00<?, ?B/s]

KeyboardInterrupt: 

## Parameter efficient fine tuning

In [None]:
from dataclasses import dataclass
from torch.optim import AdamW, Optimizer
from torch.optim.lr_scheduler import OneCycleLR
from lightning import LightningModule, LightningDataModule
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.loggers import MLFlowLogger
from lightning import Trainer
from torchmetrics.functional.classification import (
    accuracy,
    f1_score,
    precision,
    recall,
    auroc,
    average_precision,
    cohen_kappa,
    confusion_matrix,
    matthews_corrcoef
) 

from torchmetrics.functional.regression import (
    mean_absolute_error,
    mean_squared_error,
    pearson_corrcoef,
    kendall_rank_corrcoef,
    r2_score,
    mean_absolute_percentage_error,
    mean_squared_log_error)

from torch.utils.data import DataLoader
from datasets import Dataset
from BioML.deep.train_config import LLMConfig
from BioML.utilities import split_methods as split

In [None]:
def calculate_classification_metrics(split: str, loss: torch.tensor, preds: torch.tensor, 
                                     target: torch.tensor, num_classes: int=2, threshold: float=0.5):
    task = "binary" if num_classes == 2 else "multiclass"
    metrics = {
                f"{split}_Loss": loss,
                f"{split}_Acc": accuracy(
                    preds=preds,
                    target=target,
                    num_classes=num_classes,
                    task=task,
                    threshold=threshold,
                    average="weighted",
                ),
                f"{split}_F1":f1_score(
                    preds=preds,
                    target=target,
                    task=task,
                    num_classes=num_classes,
                    average="weighted",
                ),
                f"{split}_Precision": precision(
                    preds=preds,
                    target=target,
                    task=task,
                    num_classes=num_classes,
                    average="weighted",
                ),
                f"{split}_Recall": recall(
                    preds=preds,
                    target=target,
                    task=task,
                    num_classes=num_classes,
                    average="weighted"
                ),
                f"{split}_MCC": matthews_corrcoef(
                    preds=preds,
                    target=target,
                    num_classes=num_classes,
                    threshold=threshold,
                    task=task,
                ),
                f"{split}_Confusion_Matrix": confusion_matrix(
                    preds=preds,
                    target=target,
                    num_classes=num_classes,
                    normalize="true",
                    task=task,
                    threshold=threshold,
                ),
                f"{split}_AUROC": auroc(
                    preds=preds,
                    target=target,
                    num_classes=num_classes,
                    task=task,
                    thresholds=None,
                    average="weighted",
                ),
                f"{split}_Average_Precision": average_precision(
                    preds=preds,
                    target=target,
                    num_classes=num_classes,
                    task=task,
                    average="weighted",
                ),
                f"{split}_Cohen_Kappa": cohen_kappa(
                    preds=preds,
                    target=target,
                    num_classes=num_classes,
                    task=task,
                    threshold=threshold,
                )}
    return metrics

def calculate_regression_metrics(split: str, loss: torch.tensor, preds: torch.tensor, 
                                 target: torch.tensor):
    metrics = {f"{split}_Loss": loss,
                f"{split}_MAE": mean_absolute_error(preds, target),
                f"{split}_MSE": mean_squared_error(preds, target),
                f"{split}_RMSE": mean_squared_error(preds, target, squared=False),
                f"{split}_R2": r2_score(preds, target),
                f"{split}_Pearson": pearson_corrcoef(preds, target),
                f"{split}_Kendall": kendall_rank_corrcoef(preds, target),
                f"{split}_MAPE": mean_absolute_percentage_error(preds, target),
                f"{split}_MSLE": mean_squared_log_error(preds, target)}
    return metrics


In [None]:
r_values = (8, 16, 32, 64, 128, 256)

In [None]:
from functools import partial

def test(x, y):
    return x + y
class Test:
    def __init__(self):
        self.x = test
        self.tes_ = partial(self.x, y=2)
    def __call__(self, x):
        return self.tes_(x)

t = Test()
t(3)

5

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels=2, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def get_target_module_names_for_peft(model, filter_="key"):
    if isinstance(filter_, str):
        filter_ = [filter_] # if it is a string, convert it to a list
    module_names = []
    for num, (name, module) in enumerate(model.named_modules()):
        n = name.split(".")
        if filter_ and set(n).intersection(filter_):
            module_names.append(name)
        elif not filter_:
            module_names.append(name)
    return module_names

names = get_target_module_names_for_peft(model, filter_="output")
names

['base_model.model.esm.encoder.layer.0.attention.output',
 'base_model.model.esm.encoder.layer.0.attention.output.dense',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.base_layer',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_dropout',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_dropout.default',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_A',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_A.default',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_B',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_B.default',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_embedding_A',
 'base_model.model.esm.encoder.layer.0.attention.output.dense.lora_embedding_B',
 'base_model.model.esm.encoder.layer.0.attention.output.dropout',
 'base_model.model.esm.encoder.layer.0.output',
 'base_model.model.esm.encoder.layer.0.output.dense',
 'base_mode

In [None]:
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training, replace_lora_weights_loftq

In [None]:
peft_config = LoraConfig(inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1, 
                         target_modules="all-linear")

In [None]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 285,144 || all params: 8,125,907 || trainable%: 3.509072894878073


## Train model

In [1]:
import pandas as pd
from BioML.deep import finetuning as ft
from BioML.deep.utils import loftq_initialization
from datasets import Dataset
from Bio import SeqIO
from safetensors import SafetensorError
import torch

In [2]:
def fasta_generator(fasta_file: str="whole_sequence.fasta"):
    with open(fasta_file, 'r') as f:
        seqs = SeqIO.parse(f, 'fasta')
        for seq in seqs:
            yield {"id":seq.id, "seq":str(seq.seq)}


In [3]:
label = "../data/esterase_labels.csv"
lab = pd.read_csv(label, index_col=0)
split_config = ft.SplitConfig()
llm_config = ft.LLMConfig()
train_config = ft.TrainConfig(batch_size=1, max_epochs=1, lora_rank=8)
fasta_file = "../data/whole_sequence.fasta"

In [4]:
b = Dataset.from_generator(fasta_generator, gen_kwargs={"fasta_file":fasta_file})
dataset = b.add_column("labels", lab.to_numpy().flatten())

In [5]:
tokenizer = ft.TokenizeFasta(llm_config)
tokenizer.tokenize(fasta_file, add_columns=[("labels",lab.to_numpy().flatten())])

Dataset({
    features: ['id', 'seq', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 147
})

In [4]:
splitter = ft.PrepareSplit(split_config.cluster_file, split_config.shuffle, split_config.random_seed, 
                            split_config.splitting_strategy, 
                            split_config.num_split, split_config.stratify)
data_module = ft.DataModule(splitter, fasta_file, lab.values.flatten(), llm_config, train_config.batch_size)
peft = ft.PreparePEFT(True)
model = peft.get_model(llm_config)
peft_config = peft.get_lora_config(rank=train_config.lora_rank, target_modules=train_config.target_modules, 
                                       lora_alpha=train_config.lora_alpha, lora_dropout=train_config.lora_dropout)
model = ft.get_peft_model(model, peft_config)
#try:
    #ft.replace_lora_weights_loftq(model) # https://github.com/huggingface/peft/blob/main/examples/loftq_finetuning/LoftQ_weight_replacement.ipynb
#except SafetensorError as e:
#    print(e)
light_mod = ft.TransformerModule(model, train_config, lr=1e-3)

filename = f"{{epoch}}-{{{train_config.optimize}:.2f}}"
checkpoint_callback = ft.ModelCheckpoint(filename=filename, monitor=train_config.optimize, 
                                              mode=train_config.optimize_mode, verbose=True, save_top_k=1)
early_callback = ft.EarlyStopping(monitor=train_config.optimize, min_delta=train_config.min_delta, 
                                       patience=train_config.patience, verbose=True, mode=train_config.optimize_mode)


Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
peft = ft.PreparePEFT(True)
model2 = peft.get_model(llm_config)
peft_config = peft.get_lora_config(rank=train_config.lora_rank, target_modules=train_config.target_modules, 
                                       lora_alpha=train_config.lora_alpha, lora_dropout=train_config.lora_dropout)
model2 = ft.get_peft_model(model2, peft_config)

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
for name, params in model.named_parameters():
    if "lora" in name:
        print(name, params)

base_model.model.esm.encoder.layer.0.attention.self.query.lora_A.default.weight Parameter containing:
tensor([[-0.0235,  0.0263, -0.0274,  ...,  0.0383,  0.0155,  0.0455],
        [ 0.0005, -0.0314,  0.0201,  ...,  0.0262,  0.0409, -0.0128],
        [ 0.0557,  0.0065,  0.0155,  ..., -0.0222, -0.0319, -0.0253],
        ...,
        [-0.0398, -0.0048, -0.0126,  ..., -0.0211, -0.0285, -0.0191],
        [ 0.0077,  0.0053,  0.0307,  ...,  0.0496,  0.0328,  0.0542],
        [ 0.0221,  0.0179,  0.0110,  ...,  0.0061, -0.0308,  0.0303]],
       device='cuda:0', requires_grad=True)
base_model.model.esm.encoder.layer.0.attention.self.query.lora_B.default.weight Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', requires_grad=True)
base_model.model.esm.encode

In [14]:
for name, params in mod.model.named_parameters():
    if "lora" in name:
        print(name, params)

base_model.model.esm.encoder.layer.0.attention.self.query.lora_A.default.weight Parameter containing:
tensor([[-0.0166,  0.0334, -0.0216,  ...,  0.0451,  0.0140,  0.0470],
        [-0.0050, -0.0374,  0.0145,  ...,  0.0178,  0.0414, -0.0164],
        [ 0.0582,  0.0066,  0.0150,  ..., -0.0125, -0.0309, -0.0224],
        ...,
        [-0.0425, -0.0114, -0.0110,  ..., -0.0163, -0.0303, -0.0151],
        [ 0.0105,  0.0111,  0.0324,  ...,  0.0573,  0.0321,  0.0549],
        [ 0.0162,  0.0118,  0.0065,  ..., -0.0023, -0.0302,  0.0265]],
       device='cuda:0', requires_grad=True)
base_model.model.esm.encoder.layer.0.attention.self.query.lora_B.default.weight Parameter containing:
tensor([[ 0.0055, -0.0038, -0.0023,  ..., -0.0030,  0.0067, -0.0031],
        [ 0.0037, -0.0042,  0.0023,  ..., -0.0014,  0.0024, -0.0053],
        [ 0.0018, -0.0032,  0.0016,  ...,  0.0030,  0.0027, -0.0032],
        ...,
        [ 0.0068, -0.0050,  0.0014,  ...,  0.0007,  0.0056, -0.0056],
        [ 0.0009,  0.0008

In [15]:
for name, params in light_mod.model.named_parameters():
    if "lora" in name:
        print(name, params)

base_model.model.esm.encoder.layer.0.attention.self.query.lora_A.default.weight Parameter containing:
tensor([[-0.0165,  0.0334, -0.0204,  ...,  0.0451,  0.0146,  0.0468],
        [-0.0049, -0.0374,  0.0132,  ...,  0.0175,  0.0412, -0.0162],
        [ 0.0576,  0.0062,  0.0150,  ..., -0.0121, -0.0313, -0.0231],
        ...,
        [-0.0432, -0.0117, -0.0119,  ..., -0.0162, -0.0306, -0.0155],
        [ 0.0100,  0.0110,  0.0332,  ...,  0.0574,  0.0319,  0.0545],
        [ 0.0162,  0.0118,  0.0055,  ..., -0.0026, -0.0303,  0.0268]],
       requires_grad=True)
base_model.model.esm.encoder.layer.0.attention.self.query.lora_B.default.weight Parameter containing:
tensor([[ 0.0053, -0.0035, -0.0028,  ..., -0.0029,  0.0065, -0.0030],
        [ 0.0045, -0.0048,  0.0025,  ..., -0.0022,  0.0029, -0.0058],
        [ 0.0019, -0.0031,  0.0016,  ...,  0.0032,  0.0028, -0.0031],
        ...,
        [ 0.0069, -0.0051,  0.0014,  ...,  0.0009,  0.0055, -0.0057],
        [ 0.0010,  0.0009, -0.0001,  ..., 

In [10]:
for name, params in model.named_parameters():
    if "lora" in name:
        print(name, params)

base_model.model.esm.encoder.layer.0.attention.self.query.lora_A.default.weight Parameter containing:
tensor([[-0.0165,  0.0334, -0.0204,  ...,  0.0451,  0.0146,  0.0468],
        [-0.0049, -0.0374,  0.0132,  ...,  0.0175,  0.0412, -0.0162],
        [ 0.0576,  0.0062,  0.0150,  ..., -0.0121, -0.0313, -0.0231],
        ...,
        [-0.0432, -0.0117, -0.0119,  ..., -0.0162, -0.0306, -0.0155],
        [ 0.0100,  0.0110,  0.0332,  ...,  0.0574,  0.0319,  0.0545],
        [ 0.0162,  0.0118,  0.0055,  ..., -0.0026, -0.0303,  0.0268]],
       requires_grad=True)
base_model.model.esm.encoder.layer.0.attention.self.query.lora_B.default.weight Parameter containing:
tensor([[ 0.0053, -0.0035, -0.0028,  ..., -0.0029,  0.0065, -0.0030],
        [ 0.0045, -0.0048,  0.0025,  ..., -0.0022,  0.0029, -0.0058],
        [ 0.0019, -0.0031,  0.0016,  ...,  0.0032,  0.0028, -0.0031],
        ...,
        [ 0.0069, -0.0051,  0.0014,  ...,  0.0009,  0.0055, -0.0057],
        [ 0.0010,  0.0009, -0.0001,  ..., 

In [8]:
trainer = ft.Trainer(callbacks=[checkpoint_callback, early_callback], default_root_dir=train_config.model_checkpoint_dir,
                          fast_dev_run=bool(train_config.debug_mode_sample), max_epochs=10, 
                          max_time=train_config.max_time, precision=train_config.precision,
                          accumulate_grad_batches=train_config.accumulate_grad_batches)
        
trainer.fit(model=light_mod, datamodule=data_module)
best_model_path = checkpoint_callback.best_model_path

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/phastos/Programs/mambaforge/envs/bioml/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enab

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/phastos/Programs/mambaforge/envs/bioml/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved. New best score: -0.083
Epoch 0, global step 93: 'Val_MCC' reached -0.08333 (best -0.08333), saving model to 'model_checkpoint/lightning_logs/version_3/checkpoints/epoch=0-Val_MCC=-0.08.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved by 0.333 >= min_delta = 0.005. New best score: 0.250
Epoch 1, global step 186: 'Val_MCC' reached 0.25000 (best 0.25000), saving model to 'model_checkpoint/lightning_logs/version_3/checkpoints/epoch=1-Val_MCC=0.25.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 279: 'Val_MCC' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 372: 'Val_MCC' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved by 0.167 >= min_delta = 0.005. New best score: 0.417
Epoch 4, global step 465: 'Val_MCC' reached 0.41667 (best 0.41667), saving model to 'model_checkpoint/lightning_logs/version_3/checkpoints/epoch=4-Val_MCC=0.42.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 558: 'Val_MCC' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved by 0.083 >= min_delta = 0.005. New best score: 0.500
Epoch 6, global step 651: 'Val_MCC' reached 0.50000 (best 0.50000), saving model to 'model_checkpoint/lightning_logs/version_3/checkpoints/epoch=6-Val_MCC=0.50.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 744: 'Val_MCC' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric Val_MCC improved by 0.083 >= min_delta = 0.005. New best score: 0.583
Epoch 8, global step 837: 'Val_MCC' reached 0.58333 (best 0.58333), saving model to 'model_checkpoint/lightning_logs/version_3/checkpoints/epoch=8-Val_MCC=0.58.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 930: 'Val_MCC' was not in top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


In [9]:
best_model_path

'model_checkpoint/lightning_logs/version_3/checkpoints/epoch=8-Val_MCC=0.58.ckpt'

In [12]:
checkpoint = torch.load(best_model_path, map_location=lambda storage, loc: storage)
checkpoint["state_dict"]["model.base_model.model.esm.contact_head.regression.lora_A.default.weight"].shape

torch.Size([8, 120])

In [13]:
mod = ft.TransformerModule.load_from_checkpoint(best_model_path, model=model2)

In [16]:
mod3 = light_mod.load_from_checkpoint(best_model_path, model=model2)

TypeError: The classmethod `TransformerModule.load_from_checkpoint` cannot be called on an instance. Please call it on the class type and make sure the return value is used.

In [None]:
data_module.prepare_data()
data_module.setup("fit")
inputs = data_module.train_dataloader

In [16]:
model.save_pretrained("model")



In [None]:
logits_base = model(input_ids=batch["input_ids"], 
                    attention_mask=batch["attention_mask"]).logits