In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np
from Bio import SeqIO
from datasets import Dataset
from transformers import get_scheduler, get_cosine_schedule_with_warmup

## https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb
## https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb

## Load data

In [3]:
def fasta_generator(fasta_file: str="../data/whole_sequence.fasta"):
    """Yield one ``{"id": ..., "seq": ...}`` dict per record of a FASTA file.

    Parameters
    ----------
    fasta_file : str
        Path of the FASTA file to read.

    Yields
    ------
    dict
        ``"id"`` is the record's ``id`` attribute and ``"seq"`` is the
        record's sequence converted with ``str``.
    """
    with open(fasta_file, 'r') as handle:
        # Delegate from inside the `with` so the handle stays open for as
        # long as the caller keeps pulling records.
        yield from (
            {"id": record.id, "seq": str(record.seq)}
            for record in SeqIO.parse(handle, 'fasta')
        )

# Build a Hugging Face `Dataset` from the records produced by `fasta_generator`.
b = Dataset.from_generator(
    fasta_generator,
    gen_kwargs={"fasta_file": "../data/whole_sequence.fasta"},
)
b  # last expression of the cell: shows the dataset summary

Dataset({
    features: ['id', 'seq'],
    num_rows: 147
})

## Load model

In [7]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
# This is the single place where `device` is chosen; it is always a
# `torch.device` object (never a bare string), so it is not reassigned to
# "cpu" afterwards, which would silently discard the detection result.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
# Tokenizer and classifier are both loaded from the same ESM-2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
# num_labels=2 asks for a two-output classification head; the head weights are
# not part of the checkpoint, hence the "newly initialized" warning shown below.
model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/esm2_t6_8M_UR50D",
    num_labels=2,
)

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### I create the training arguments

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Hyperparameters consumed by TrainingArguments:
# learning rate, per-device train batch size, number of training epochs.
lr, bs, epochs = 8e-5, 32, 4

In [None]:
# Training configuration handed to the Trainer.
args = TrainingArguments(
    'outputs',                          # output directory for checkpoints
    learning_rate=lr,
    warmup_ratio=0.2,
    lr_scheduler_type='cosine',
    # Mixed precision is only valid on a CUDA device; requesting fp16 on a
    # CPU-only machine makes TrainingArguments raise, so enable it conditionally.
    fp16=torch.cuda.is_available(),
    # Evaluation and checkpointing both happen once per epoch; the two
    # strategies have to match for load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to=['mlflow'],
    load_best_model_at_end=True,
    # Must name a key returned by the Trainer's compute_metrics function.
    # The Trainer in this notebook is given `corr_d`, which returns {'pearson': ...}.
    metric_for_best_model="pearson",
    save_total_limit=5,
    seed=3242342,
)

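## lr_scheduler_type='cosine' selects a cosine learning-rate schedule (after the warmup given by warmup_ratio)
## weight decay for the Adam optimizer -> this is what fast.ai does
## fp16 is half precision -> mixed-precision training (using both fp32 and fp16)
## save_total_limit=5 -> at most 5 checkpoints are kept on disk
## save_strategy="epoch" -> a checkpoint is saved at the end of every epoch (not every 500 steps)
## report_to=['mlflow'] -> the training report is logged to MLflow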
# How to evaluate mlflow?
# LR finder does not give reliable results for Transformers models

## I train the model

In [1]:
import evaluate

You can use your own function as an evaluation metric -> it then has to return its results as a dict.  
Or you can use the evaluate library from Hugging Face to load different metric functions: [evaluate](https://huggingface.co/docs/evaluate/a_quick_tour)


In [None]:
def compute_metrics(eval_pred):
    """Score evaluation predictions with the `accuracy` metric from `evaluate`.

    Parameters
    ----------
    eval_pred
        Unpacked as ``(logits, labels)``. During evaluation the Trainer hands
        over only these two items (not the loss, attentions or hidden states
        that the classification model itself can return).

    Returns
    -------
    The result of the accuracy metric's ``compute`` call.
    """
    accuracy = evaluate.load("accuracy")
    logits, labels = eval_pred
    # The model outputs logits; the predicted class is the index of the largest one.
    predicted_classes = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predicted_classes, references=labels)

In [None]:
def _as_vector(values):
    """Return `values` as an ndarray, flattening an (N, 1) column into shape (N,)."""
    arr = np.asarray(values)
    if arr.ndim == 2 and arr.shape[1] == 1:
        return arr.reshape(-1)
    return arr


def corr(x, y):
    """Return the Pearson correlation coefficient between `x` and `y`.

    Inputs shaped (N, 1) are flattened to (N,) first. Without this,
    `np.corrcoef` cannot stack a column of predictions with a flat label
    vector and raises. Inputs that are already 1-D are passed through
    unchanged.
    """
    return np.corrcoef(_as_vector(x), _as_vector(y))[0, 1]


def corr_d(eval_pred):
    """Metric function for the Trainer: unpack `eval_pred` into
    (predictions, labels) and return ``{'pearson': <correlation>}``."""
    return {'pearson': corr(*eval_pred)}

In [None]:
# NOTE(review): `dds` is not defined in the cells visible here; it is presumably
# a tokenized dataset dict with 'train' and 'test' splits -- confirm it is built
# before this cell runs.
# NOTE(review): `corr_d` reports a Pearson correlation on the raw predictions,
# while the model was loaded with num_labels=2 -- confirm this metric fits the task.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],   # we need to pass tokenized datasets
    eval_dataset=dds['test'],
    tokenizer=tokenizer,
    compute_metrics=corr_d,
)

In [None]:
# Start training with the model, arguments and datasets configured on `trainer`.
trainer.train()

## Search for hyperparameters like the learning rate which is the most important

Well, it is actually both batch size and learning rate -> smaller batch sizes tend to work better than large ones -> but the learning rate is affected by the batch size as well -> a larger batch size needs a higher learning rate.

Fix everything else and tune the learning rate -> the learning rate finder doesn't seem to work very well for transformers?  
But the idea of a learning rate finder is just to test different learning rates -> so can't I simply test them myself?

ktrain: a wrapper that handles many tasks and includes a learning rate finder: [ktrain](https://github.com/amaiya/ktrain)

Perhaps use PyTorch Lightning: [pytorch_lightning_huggingface](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb)