# ProtTrans Finetuning with LoRA for Signal Peptide Prediction

## Links
### Papers/ Knowledge
- https://www.sciencedirect.com/science/article/pii/S2001037021000945
- https://huggingface.co/blog/peft
- https://ieeexplore.ieee.org/ielx7/34/9893033/9477085/supp1-3095381.pdf?arnumber=9477085
### Architecture
- https://www.philschmid.de/fine-tune-flan-t5-peft
- https://huggingface.co/spaces/evaluate-metric/seqeval
- https://huggingface.co/docs/transformers/v4.33.3/en/model_doc/esm#transformers.EsmForTokenClassification
- https://huggingface.co/docs/datasets/v2.14.5/en/package_reference/builder_classes#datasets.SplitGenerator
- https://huggingface.co/docs/datasets/v2.14.5/en/package_reference/main_classes#datasets.Dataset.add_column
- https://huggingface.co/docs/transformers/main_classes/data_collator
- https://huggingface.co/docs/transformers/main/en/main_classes/trainer#checkpoints
### Code
- https://github.com/ziegler-ingo/cleavage_extended/blob/master/models/final/c_bilstm_t5_coteaching.ipynb
- https://www.kaggle.com/code/henriupton/proteinet-pytorch-ems2-t5-protbert-embeddings/notebook#7.-Train-the-Model
- https://www.kaggle.com/code/prithvijaunjale/t5-multi-label-classification
### Optimization
- https://huggingface.co/blog/accelerate-large-models
- https://huggingface.co/docs/transformers/hpo_train

## ToDo
- Implement BitsAndBytes (QLoRA)
- Implement DeepSpeed
- Fix weird extra char on inference

## Notebook Setup
___

In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules
# (e.g. the project's src package) are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

## Packages
___

In [2]:
import re
import os
import math
import copy
import types
import yaml

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn import (
    CrossEntropyLoss,
    MSELoss
)

import evaluate

from transformers import (
    AutoModelForTokenClassification,
    AutoConfig,
    T5EncoderModel,
    T5Tokenizer,
    T5PreTrainedModel,
    T5ForConditionalGeneration,
    pipeline,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    set_seed,
    )
from transformers.modeling_outputs import TokenClassifierOutput

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    get_peft_config,
    PeftModel,
    PeftConfig,
    prepare_model_for_kbit_training
    )

from datasets import Dataset

import src.config as config

from src.model import (
    get_prottrans_tokenizer_model,
    df_to_dataset,
    inject_linear_layer,
    compute_metrics_full,
    compute_metrics_fast
    )
from src.utils import get_project_root_path
from torch.utils.data import DataLoader

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


---
## Setup and Variables

In [3]:
# Report the configured checkpoint and whether Apple's MPS backend is usable.
base_model_name = config.base_model_name
print("Base Model:\t", base_model_name)
print("MPS:\t\t", torch.backends.mps.is_available())

# Project root, used below to build data and model paths.
ROOT = get_project_root_path()
print("Path:\t\t", ROOT)

# Pick the compute device by preference: CUDA first, then MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device:\t {device}")

Base Model:	 Rostlab/prot_t5_xl_uniref50
MPS:		 True
Path:		 /Users/finnlueth/Developer/gits/prottrans-t5-signalpeptide-prediction
Using device:	 mps


In [None]:
# Training hyperparameters, all read from the project's config module.
lr, batch_size = config.lr, config.batch_size
num_epochs, dropout_rate = config.num_epochs, config.dropout_rate

# Label maps: `label_encoding` comes from config.label_encoding and
# `label_list` from config.label_decoding.
label_encoding, label_list = config.label_encoding, config.label_decoding

# Metric callback later handed to the Trainer; the "fast" variant is selected.
compute_metrics = compute_metrics_fast

---
## Create Tokenizer and Load Model

In [None]:
# Architecture class passed to the project loader. T5EncoderModel is used;
# the alternative that was tried is kept for reference:
# model_architecture = T5ForConditionalGeneration
model_architecture = T5EncoderModel

# Load the tokenizer and base model for the configured checkpoint.
t5_tokenizer, t5_base_model = get_prottrans_tokenizer_model(
    base_model_name,
    model_architecture,
)

---
## Load Data, Split into Dataset, and Tokenize Sequences

In [None]:
# Read the processed dataset (parquet) from the project's data directory.
df_data = pd.read_parquet(f"{ROOT}/data/processed/5.0_train.parquet.gzip")

In [None]:
# Rows whose Split column marks them as training data.
train_mask = df_data["Split"] == 'train'
train_frame = df_data.loc[train_mask]

# Build the dataset from only the first 11 sequences/labels (small debug subset).
ds_train = df_to_dataset(
    t5_tokenizer,
    train_frame["Sequence"].to_list()[:11],
    train_frame["Label"].to_list()[:11],
)

In [None]:
# ToDo: Use entire test set
# Rows whose Split column marks them as test data.
test_mask = df_data["Split"] == 'test'
test_frame = df_data.loc[test_mask]

# Only the first 3 sequences/labels are converted for now.
ds_test = df_to_dataset(
    t5_tokenizer,
    test_frame["Sequence"].to_list()[:3],
    test_frame["Label"].to_list()[:3],
)

In [None]:
# Display the test dataset object as the cell output.
ds_test

In [None]:
# Print each encoded field of the first test example, one field per line.
first_test_example = ds_test[0]
for field_name in ('input_ids', 'attention_mask', 'labels'):
    print(first_test_example[field_name])

In [None]:
# Decode the first test example's token ids back to text to sanity-check the tokenization.
t5_tokenizer.decode(ds_test[0]['input_ids'])

---
## Apply LoRA

In [None]:
# LoRA adapter settings: rank 8, alpha 16, 5% adapter dropout, no bias
# training, applied to the modules named q, k, v and o.
lora_config = LoraConfig(
    # task_type=TaskType.TOKEN_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=['q', 'k', 'v', 'o'],
    # target_modules=['o'],
    inference_mode=False,
)

In [None]:
# Wrap the base model with the LoRA adapters described by `lora_config`.
t5_lora_model = get_peft_model(t5_base_model, lora_config)
# Drop the separate global name; the wrapped model remains reachable via
# t5_lora_model.get_base_model() in later cells.
del t5_base_model
# t5_lora_model = prepare_model_for_kbit_training(t5_lora_model) # add quantization

In [None]:
# Report how many parameters are trainable after applying LoRA.
t5_lora_model.print_trainable_parameters()

---
## Model

In [None]:
# Project helper from src.model; presumably adapts the model for per-token
# classification (head/forward logic) — TODO confirm against src/model.py.
t5_lora_model = inject_linear_layer(t5_lora_model)

In [None]:
# One output unit per entry in the label list.
num_labels = len(label_list)

# Record the dropout module and label count on the PEFT wrapper itself.
t5_lora_model.dropout = nn.Dropout(dropout_rate)
t5_lora_model.num_labels = num_labels

# Give the wrapped base model its own dropout and a linear classification
# head whose input width is the model's configured hidden size.
wrapped_base = t5_lora_model.get_base_model()
wrapped_base.dropout = nn.Dropout(dropout_rate)
wrapped_base.classifier = nn.Linear(
    in_features=wrapped_base.config.hidden_size,
    out_features=num_labels,
)

---
## DeepSpeed

In [None]:
# Single-process distributed environment (rank 0 of world size 1) on localhost.
os.environ.update(
    {
        "MASTER_ADDR": "127.0.0.1",
        "MASTER_PORT": "9994",  # modify if RuntimeError: Address already in use
        "RANK": "0",
        "LOCAL_RANK": "0",
        "WORLD_SIZE": "1",
    }
)

---
## Training Loop
https://huggingface.co/docs/peft/task_guides/token-classification-lora

In [None]:
# Batch collator for token classification, built around the ProtTrans tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=t5_tokenizer)

In [None]:
# Parse the DeepSpeed settings stored as YAML in the project root.
with open(f"{ROOT}/deepspeed_config.yaml") as config_stream:
    deepspeed_config = yaml.safe_load(config_stream)

In [None]:
# Trainer settings for the current short run on the small debug subset.
training_args = TrainingArguments(
    output_dir='./',
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # NOTE(review): fixed at one epoch; `num_epochs` read from the config is
    # not used here — switch to it for real training runs.
    num_train_epochs=1,
    # save_strategy="steps",
    # save_steps=100,
    # Checkpointing is disabled, so no "best" checkpoint is ever recorded.
    save_strategy='no',
    # Was True, which contradicts save_strategy='no' and the absence of an
    # evaluation set: there is never a best checkpoint to reload. Re-enable
    # only together with matching save and evaluation strategies.
    load_best_model_at_end=False,
    save_total_limit=3,
    seed=42,
    # deepspeed=deepspeed_config
)

In [None]:
# Wire the LoRA model, training arguments, training data and collator into a Trainer.
trainer = Trainer(
    model=t5_lora_model,
    args=training_args,
    train_dataset=ds_train,
    # No evaluation set is passed yet; the line below is the placeholder to re-enable.
    # eval_dataset=ds_test, #make sure to change to actual eval later
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Sanity check: move the first training example's input ids to `device` and show the tensor size.
torch.tensor(ds_train[0]['input_ids']).to(device).size()

In [None]:
# Move the LoRA-wrapped model to the CPU.
# NOTE(review): `device` above resolves to CUDA/MPS when available and the
# Trainer was already constructed with this model — confirm that forcing CPU
# here is intentional and does not cause a device mismatch during training.
t5_lora_model = t5_lora_model.to('cpu')

In [None]:
# Display the label list (config.label_decoding) as the cell output.
label_list

In [None]:
# Same head setup as the earlier "Model" cell, run again just before training:
# it replaces the dropout modules and installs a freshly initialised classifier.
num_labels = len(label_list)
peft_base = t5_lora_model.get_base_model()

# Attributes kept on the PEFT wrapper.
t5_lora_model.dropout = nn.Dropout(dropout_rate)
t5_lora_model.num_labels = num_labels

# Attributes kept on the wrapped base model; the head's input width is the
# model's configured hidden size and it emits one value per label.
peft_base.dropout = nn.Dropout(dropout_rate)
peft_base.classifier = nn.Linear(
    in_features=peft_base.config.hidden_size,
    out_features=num_labels,
)

In [None]:
# Run the training loop configured by `training_args`.
trainer.train()

---
## Save Model

In [None]:
# Directory that receives the fine-tuned artefacts.
adapter_dir = ROOT + '/models/linear_model'

# PeftModel.save_pretrained writes the adapter config and adapter (LoRA) weights.
t5_lora_model.save_pretrained(adapter_dir)

# The classification head was attached to the base model after
# `get_peft_model` and is not listed in `modules_to_save`, so the adapter
# checkpoint does not contain it. Persist its weights next to the adapter so
# the trained head can be restored for inference.
torch.save(
    t5_lora_model.get_base_model().classifier.state_dict(),
    adapter_dir + '/classifier_head.pt',
)

---
---
---