In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np
from Bio import SeqIO
from datasets import Dataset, DatasetDict
from transformers import get_scheduler, get_cosine_schedule_with_warmup
from BioML.utilities import split_methods

## https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb
## https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb

## Load data

You need to label the target values as labels so Trainer can recognize it.
Dataset can actually be used for any usecases with large files it doesn't depend on transformers
Although you would need to use PyTorch Dataloader to transform it into batches (but it only returns inputs ids and attention masks will it also return labels?)

In [25]:
def fasta_generator(fasta_file: str="../data/whole_sequence.fasta"):
    with open(fasta_file, 'r') as f:
        seqs = SeqIO.parse(f, 'fasta')
        for seq in seqs:
            yield {"id":seq.id, "seq":str(seq.seq)}

b = Dataset.from_generator(fasta_generator, gen_kwargs={"fasta_file":"../data/whole_sequence.fasta"})
y = np.random.randint(0, 2, size=len(b))
b = b.add_column("labels", y)

## Load model

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device="cpu"

In [23]:
model = AutoModelForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
dataset = b.map(lambda examples: tokenizer(examples["seq"], return_tensors="np",padding=True, truncation=True), batched=True)

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

### Custom spliting with indices

In [30]:
cluster = split_methods.ClusterSpliter("../data/resultsDB_clu.tsv")
train, test = cluster.train_test_split(range(len(dataset)), index=dataset["id"])

In [33]:
new = DatasetDict({"train":dataset.select(train), "test":dataset.select(test)})

### I create the training arguments

In [15]:
from transformers import TrainingArguments, Trainer

In [16]:
lr = 8e-5
bs = 1
epochs = 4

Se use cpu to False whe you wan to use GPUs (it will automatically use GPUs), when f16 is True it will only use GPUs.

In [17]:
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.2, lr_scheduler_type='cosine', fp16=False,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to=['mlflow'],
    load_best_model_at_end=True, metric_for_best_model="matthews_correlation", 
    save_total_limit=5, save_strategy="epoch", seed=3242342, gradient_accumulation_steps=4, use_cpu=True) 

## cosine will set it to cosine and then we have a learning rate
## weight decay for the Adam -> this is fast.Ai does
## fp16 is half precision -> mixed training (using fp32 and fp16)
## save_total_limit to 5 -> so only 5 models will be saved
## each 500 steps will be saved a model
## Save the report to mlflow
# How to evaluate mlflow?
# LR finder does not give reliable results for Transformers models

## I train the model

In [18]:
import evaluate

You can use your own function as an evaluation metric -> then you have to retun as an dict  
Or you can use the evaluate library from hugging face to load different functions: [evaluate](https://huggingface.co/docs/evaluate/a_quick_tour)


In [39]:
def compute_metrics(eval_pred):
    accuray = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    matthews = evaluate.load("matthews_correlation")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")
    
    logits, labels = eval_pred # the predictions from the models are logits (it also returns the labels, 
    # it also returns loss, attentions and hidden state but that is the classification model, for evalaution Trainer will only 
    # return logits and labels)
    predictions = np.argmax(logits, axis=-1)
    metrics = {"accuracy": accuray.compute(predictions=predictions, references=labels), 
               "f1": f1.compute(predictions=predictions, references=labels), 
               "matthews": matthews.compute(predictions=predictions, references=labels),
               "precision": precision.compute(predictions=predictions, references=labels),
               "recall": recall.compute(predictions=predictions, references=labels)}

    return metrics

In [26]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        print(inputs)
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.compute_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [40]:
trainer = Trainer(model, args, train_dataset=new['train'], eval_dataset=new['test'], # we need to pass tokenized datasets
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

In [41]:
trainer.train()

MlflowException: Changing param values is not allowed. Param with key='problem_type' was already logged with value='None' for run ID='5bbf70dc125542818ecc8ad10aeff6e1'. Attempted logging new value 'single_label_classification'.

## Search for hyperparameters like the learning rate which is the most important

Well it is actually batch size and learning rate -> smaller batch sizes tend to work better than large batch sizes -> but learning rate is affected by batch as well -> higher abtch need higher learning rate.

Fix everything else and tune the learning rate -> learning rate finder doesn'0t seem to work very well for transformers?  
But teh idea of learning rate finder is just test different learning rates -> so I cannot test them?

Ktrains: A wrapper to do many tasks and has a learning rate finder: [ktrains](https://github.com/amaiya/ktrain)

Use pytorch lightning perhaps: [pytorch_lighningt_huggingface](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb)