# How to fine-tune Hugging Face models on text data of any size and format with custom (non-random) splitting

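This notebook shows a way to handle text data of any size and format with a custom, cluster-based split. Random splitting is not recommended for protein sequences, because highly similar sequences can end up in both the training and the test set and inflate the measured performance.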

In [110]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np
from Bio import SeqIO
from datasets import Dataset, DatasetDict
from sklearn.model_selection import GroupKFold
import pandas as pd
from typing import Sequence
from sklearn.utils import shuffle
## https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb
## https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb

## Load data

The target column must be named `labels` so that `Trainer` can recognize it (the labels used below are random placeholders, only there to make the example runnable).  
`Dataset` can be used for any use case involving large files; it does not depend on transformers.  
Outside of `Trainer`, however, you would need a PyTorch `DataLoader` to turn it into batches.

In [111]:
def fasta_generator(fasta_file: str="whole_sequence.fasta"):
    """
    Lazily yield one record per sequence found in a FASTA file.

    Args:
        fasta_file: path of the FASTA file to read.

    Yields:
        A dict with the record identifier under "id" and the sequence,
        converted to a plain string, under "seq".
    """
    # The file stays open for as long as the generator is being consumed.
    with open(fasta_file, "r") as handle:
        yield from (
            {"id": record.id, "seq": str(record.seq)}
            for record in SeqIO.parse(handle, "fasta")
        )

# Build the dataset from the FASTA file; gen_kwargs is forwarded to fasta_generator.
b = Dataset.from_generator(fasta_generator, gen_kwargs={"fasta_file":"whole_sequence.fasta"})
# Placeholder binary labels (0 or 1) for the demo. A seeded generator keeps them identical
# after a kernel restart, so the folds and the metrics further down are reproducible.
y = np.random.default_rng(42).integers(0, 2, size=len(b))
# The target column is named "labels" so that Trainer can recognize it.
dataset = b.add_column("labels", y)

## Custom splitting with indices

In [112]:
def read_cluster_info(file_path: str) -> dict[str | int, list[str|int]]:
	"""
	Read the cluster information from a tsv file generated by MMSeqs2 function.
	The sequences are clusterized at 30% sequence identity. So sequences in different clusters are at most 30% identical.
	"""
	cluster_info = {}
	with open(file_path, "r") as f:
		lines = [x.strip() for x in f.readlines()]
	for x in lines:
		X = x.split("\t")
		if X[0] not in cluster_info:
			cluster_info[X[0]] = []
		cluster_info[X[0]].append(X[1])
	return cluster_info

def get_group_index(cluster_info, index: Sequence[str | int]) -> np.ndarray:
	"""
	Now for each sample in the data, assign a group using the cluster info dictionary.
	The groups will be the keys of the cluster info.

	index should be the same as the cluster_info values.
	"""
	group = []
	for x in index:
		for key, value in cluster_info.items():
			if x in value:
				group.append(key)
				break
	group = np.array(group)
	return group

def split(X: Sequence[int | str] | pd.DataFrame, y: Sequence[int] | None=None, 
            groups: Sequence[str|int] | None = None) -> list[tuple[np.ndarray, np.ndarray]]:
    """
    Split the data into train and test sets.
    """
    group_kfold = GroupKFold(n_splits=5)
    fold = list(group_kfold.split(X, y, groups=groups))
    return fold

def get_fold(fold: list[tuple[np.ndarray, np.ndarray]], dataset: Dataset, fold_num: int, seed: int = 43) -> DatasetDict:
    """
    Get the train and test sets for the given fold.

    Args:
        fold: list of (train_indices, test_indices) tuples, one per fold.
        dataset: the dataset the indices refer to.
        fold_num: position in fold of the fold to materialise.
        seed: seed used to shuffle both subsets (default 43, the value that was
            previously hard-coded).

    Returns:
        A DatasetDict with the shuffled subsets under the keys "train" and "test".
    """
    train_idx, test_idx = fold[fold_num]
    train = dataset.select(train_idx).shuffle(seed=seed)
    test = dataset.select(test_idx).shuffle(seed=seed)
    return DatasetDict({"train": train, "test": test})

In [113]:
cluster = read_cluster_info("resultsDB_clu.tsv")
group = get_group_index(cluster, dataset["id"])
# The split needs one group label per sample: fail fast if some ids were not matched
# to a cluster, otherwise the groups would not line up with the dataset rows.
assert len(group) == len(dataset), "some sequence ids were not found in the cluster file"
fold = split(dataset, y, group)
# Materialise the first fold as a DatasetDict and display it.
fold_0 = get_fold(fold, dataset, 0)
fold_0

DatasetDict({
    train: Dataset({
        features: ['id', 'seq', 'labels'],
        num_rows: 117
    })
    test: Dataset({
        features: ['id', 'seq', 'labels'],
        num_rows: 30
    })
})

## Load the large language models

In [114]:
# Set to True to force training on the CPU even when a CUDA device is available.
disable_gpu = True
disable_pgu = disable_gpu  # backward-compatible alias for the original, misspelled flag name
device = "cuda:0" if torch.cuda.is_available() and not disable_gpu else "cpu"

In [116]:
# Both the classification model and its tokenizer come from the same checkpoint.
checkpoint = "facebook/esm2_t6_8M_UR50D"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [117]:
def tokenize_sequences(examples):
    """Tokenize the "seq" column of a batch of examples, with padding and truncation enabled."""
    return tokenizer(examples["seq"], return_tensors="np", padding=True, truncation=True)


# Replace each split by its tokenized version, using the same function for both.
for split_name in ("train", "test"):
    fold_0[split_name] = fold_0[split_name].map(tokenize_sequences, batched=True)

Map:   0%|          | 0/117 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [71]:
# Display the DatasetDict again to check the columns added by the tokenization step.
fold_0

DatasetDict({
    train: Dataset({
        features: ['id', 'seq', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 117
    })
    test: Dataset({
        features: ['id', 'seq', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 30
    })
})

### Create the training arguments

In [81]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

You can play with these values to see which combination gives the best performance (see [hyperparameter search from Hugging Face](https://huggingface.co/docs/setfit/how_to/hyperparameter_optimization)).

In [82]:
lr = 8e-5  # learning rate passed to TrainingArguments
bs = 1  # per-device training batch size; the evaluation batch size is bs*2
epochs = 4  # number of training epochs

Set `use_cpu` to `False` when you want to use GPUs (they are then used automatically). When `fp16` is `True`, training can only run on GPUs.

In [118]:
# Half precision is requested on any non-CPU device; use_cpu is set only on CPU.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.2,
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    gradient_accumulation_steps=4,
    fp16=device != "cpu",
    use_cpu=device == "cpu",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="matthews_correlation",
    report_to=['mlflow'],
    seed=3242342,
)

## lr_scheduler_type='cosine' selects a cosine learning-rate schedule, starting from the learning rate given above
## weight_decay is the weight decay for the Adam optimizer -> this is what fast.ai does
## fp16 is half precision -> mixed-precision training (using fp32 and fp16)
## save_total_limit=2 -> only 2 checkpoints will be kept
## report_to=['mlflow'] saves the report to MLflow
# An LR finder does not give reliable results for Transformers models: https://github.com/huggingface/transformers/issues/16013

## Train the model and evaluate the results

In [146]:
import evaluate
import mlflow

You can use your own function as an evaluation metric; it then has to return a dictionary `{metric_name: metric_score}`.  
Alternatively, you can use the `evaluate` library from Hugging Face to load ready-made metrics: [evaluate](https://huggingface.co/docs/evaluate/a_quick_tour)


In [147]:
# Names of the metrics reported at each evaluation, and a cache of the loaded metric
# objects: evaluate.load() is called once per metric instead of at every evaluation.
_CLASSIFICATION_METRICS = ("accuracy", "f1", "matthews_correlation", "precision", "recall")
_LOADED_METRICS = {}


def compute_classification_metrics(eval_pred):
    """
    Compute the classification metrics for one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair. For evaluation the Trainer only
            provides the logits and the labels (not the loss, attentions or
            hidden states that the classification model itself can return).

    Returns:
        A dict {metric_name: metric_score} with one entry per metric in
        _CLASSIFICATION_METRICS.
    """
    logits, labels = eval_pred
    # The predicted class is the one with the highest logit.
    predictions = np.argmax(logits, axis=-1)

    results = {}
    for metric in _CLASSIFICATION_METRICS:
        if metric not in _LOADED_METRICS:
            _LOADED_METRICS[metric] = evaluate.load(metric)
        scores = _LOADED_METRICS[metric].compute(predictions=predictions, references=labels)
        # Each metric returns a dict keyed by its own name.
        results[metric] = scores[metric]
    return results


In [148]:
# The datasets handed to the Trainer must already be tokenized.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=fold_0['train'],
    eval_dataset=fold_0['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_classification_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [149]:
# End any MLflow run that is still active in this kernel before training; presumably
# so that the MLflow reporting enabled via report_to=['mlflow'] starts a fresh run
# (TODO confirm).
mlflow.end_run()
# Run the fine-tuning; the returned TrainOutput is displayed as the cell output.
trainer.train()

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/232 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.6930113434791565, 'eval_accuracy': 0.5, 'eval_f1': 0.6511627906976744, 'eval_matthews_correlation': -0.0890870806374748, 'eval_precision': 0.5185185185185185, 'eval_recall': 0.875, 'eval_runtime': 15.6809, 'eval_samples_per_second': 1.913, 'eval_steps_per_second': 0.957, 'epoch': 0.99}


  0%|          | 0/15 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6964237093925476, 'eval_accuracy': 0.4666666666666667, 'eval_f1': 0.0, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 16.4017, 'eval_samples_per_second': 1.829, 'eval_steps_per_second': 0.915, 'epoch': 2.0}


  0%|          | 0/15 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6956890225410461, 'eval_accuracy': 0.4666666666666667, 'eval_f1': 0.0, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 15.4504, 'eval_samples_per_second': 1.942, 'eval_steps_per_second': 0.971, 'epoch': 2.99}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.6947107315063477, 'eval_accuracy': 0.4666666666666667, 'eval_f1': 0.0, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 15.7289, 'eval_samples_per_second': 1.907, 'eval_steps_per_second': 0.954, 'epoch': 3.97}
{'train_runtime': 583.9581, 'train_samples_per_second': 0.801, 'train_steps_per_second': 0.397, 'train_loss': 0.7043541875378839, 'epoch': 3.97}


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=232, training_loss=0.7043541875378839, metrics={'train_runtime': 583.9581, 'train_samples_per_second': 0.801, 'train_steps_per_second': 0.397, 'train_loss': 0.7043541875378839, 'epoch': 3.97})

In [129]:
# Evaluate the trained model with the Trainer (its eval_dataset is fold_0['test'])
# and print the resulting metrics dict.
metrics = trainer.evaluate()
print(metrics)

  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.6928602457046509, 'eval_accuracy': 0.5333333333333333, 'eval_f1': 0.6956521739130436, 'eval_matthews_correlation': 0.0, 'eval_precision': 0.5333333333333333, 'eval_recall': 1.0, 'eval_runtime': 14.71, 'eval_samples_per_second': 2.039, 'eval_steps_per_second': 1.02, 'epoch': 3.83}
