---
## Setup and Variables

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import gc
import copy
import random
import os

import torch
import torch.nn as nn
import pandas as pd
import numpy as np

import seaborn as sns

import evaluate

from transformers import (
    T5Tokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback
)

from src.model_new import (
    T5EncoderModelForTokenClassification
)

import src.config
import src.data
import src.model_new
import src.utils


import peft
from peft import (
    LoraConfig,
)


import sklearn.metrics

In [3]:
print("Base Model:\t", src.config.base_model_name)
print("MPS:\t\t", torch.backends.mps.is_available())
ROOT = src.utils.get_project_root_path()
print("Path:\t\t", ROOT)
device = torch.device('cuda:0' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))
print(f"Using device:\t {device}")

# os.environ['CUDA_LAUNCH_BLOCKING'] = "0"
use_crf = False

SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

Base Model:	 Rostlab/prot_t5_xl_uniref50
MPS:		 False
Path:		 /home/ec2-user/developer/prottrans-t5-signalpeptide-prediction
Using device:	 cuda:0


---
## Create Tokenizer and Load Model

In [4]:
t5_tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path=src.config.base_model_name,
    do_lower_case=False,
    use_fast=True,
    legacy=False
)

In [5]:
t5_base_model = T5EncoderModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=src.config.base_model_name,
    device_map='auto',
    load_in_8bit=False,
    custom_num_labels=len(src.config.label_decoding),
    custom_dropout_rate=0.1,
    use_crf=use_crf
    )

Some weights of T5EncoderModelForTokenClassification were not initialized from the model checkpoint at Rostlab/prot_t5_xl_uniref50 and are newly initialized: ['custom_classifier.bias', 'custom_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
t5_base_model.custom_classifier.weight = nn.Linear(
    in_features=t5_base_model.config.hidden_size,
    out_features=t5_base_model.custom_num_labels
).weight
if use_crf:
    t5_base_model.crf.reset_parameters()

---
## Apply LoRA

In [7]:
if use_crf:
    modules_to_save = ['custom_classifier', 'crf']
else:
    modules_to_save = ['custom_classifier']

lora_config = LoraConfig(
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=['q', 'k', 'v', 'o'],
    bias="none",
    # modules_to_save=modules_to_save,
    modules_to_save=['custom_classifier'],
)

t5_lora_model = peft.get_peft_model(t5_base_model, lora_config)
t5_lora_model.print_trainable_parameters()

trainable params: 3,944,460 || all params: 1,212,086,284 || trainable%: 0.32542732741623864


In [8]:
t5_lora_model.base_model.custom_classifier.modules_to_save.default.weight.min(), t5_lora_model.base_model.custom_classifier.modules_to_save.default.weight.max()

(tensor(-0.0312, grad_fn=<MinBackward1>),
 tensor(0.0312, grad_fn=<MaxBackward1>))

In [9]:
# [x[0] for x in t5_base_model.crf.named_parameters()]
# t5_lora_model.crf.transitions

In [10]:
# [x for x in t5_lora_model.named_parameters() if 'crf' in x[0]]

---
## Load Data, Split into Dataset, and Tokenize Sequences

In [11]:
FASTA_FILENAME = '5_SignalP_5.0_Training_set.fasta'
# FASTA_FILENAME = '5_SignalP_5.0_Training_set_testing.fasta'
annotations_name = 'Label' # Choose Type or Label

df_data = src.data.process(src.data.parse_file(ROOT + '/data/raw/' + FASTA_FILENAME))

dataset_signalp = src.model_new.create_datasets(
    splits=src.config.splits,
    tokenizer=t5_tokenizer,
    data=df_data,
    annotations_name=annotations_name,
    # dataset_size=src.config.dataset_size,
    dataset_size=3,
    encoder=src.config.select_encodings[annotations_name],
    )

del df_data

In [12]:
display(dataset_signalp)

ds_index = 3
print(len(dataset_signalp['train'][ds_index]['input_ids']), dataset_signalp['train'][ds_index]['input_ids'])
print(len(dataset_signalp['train'][ds_index]['labels']), dataset_signalp['train'][ds_index]['labels'])
print(len(dataset_signalp['train'][ds_index]['attention_mask']), dataset_signalp['train'][ds_index]['attention_mask'])

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
})

71 [19, 14, 13, 13, 7, 11, 16, 7, 7, 13, 3, 3, 6, 3, 3, 3, 3, 13, 3, 19, 10, 7, 3, 3, 3, 3, 10, 4, 11, 10, 6, 4, 22, 9, 15, 10, 3, 6, 4, 3, 10, 15, 3, 7, 13, 15, 20, 9, 8, 20, 15, 20, 18, 9, 9, 20, 4, 9, 8, 19, 14, 8, 8, 7, 7, 3, 7, 6, 7, 10, 1]
70 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
71 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


---

In [13]:
# import torch
# from torchcrf import CRF
# num_tags = 5
# model = CRF(num_tags=num_tags, batch_first=True)

In [14]:
# batch_size = 2
# seq_length = 4
# emissions = torch.randn(batch_size, seq_length, num_tags)
# tags = torch.tensor([
#     [1, 2, 3, 3],
#     [2, 2, 2, 3]
#     ], dtype=torch.long)  # (seq_length, batch_size)

# display(emissions, emissions.shape)
# display(tags, tags.shape)
# display(model(emissions, tags))
# display(torch.Tensor(model.decode(emissions)).shape)
# display(model.decode(emissions))

---
## Training Loop
https://huggingface.co/docs/peft/task_guides/token-classification-lora

In [15]:
data_collator = DataCollatorForTokenClassification(tokenizer=t5_tokenizer)

training_args = TrainingArguments(
    output_dir='./checkpoints',
    learning_rate=src.config.lr,
    per_device_train_batch_size=src.config.batch_size,
    per_device_eval_batch_size=src.config.batch_size,
    num_train_epochs=src.config.num_epochs,
    logging_steps=src.config.logging_steps,
    # save_strategy="steps",
    # save_steps=src.config.save_steps,
    # evaluation_strategy="steps",
    # eval_steps=src.config.eval_steps,
    # gradient_accumulation_steps=accum,
    # load_best_model_at_end=True,
    # save_total_limit=5,
    seed=SEED,
    # fp16=True,
    # deepspeed=deepspeed_config,
    remove_unused_columns=False,
    label_names=['labels'],
    # debug="underflow_overflow",
)

trainer = Trainer(
    model=t5_lora_model,
    args=training_args,
    train_dataset=dataset_signalp['train'],
    eval_dataset=dataset_signalp['valid'],
    data_collator=data_collator,
    compute_metrics=src.model_new.compute_metrics,
)

# class EvaluateFirstStepCallback(TrainerCallback):
#     def on_step_begin(self, args, state, control, **kwargs):
#         if state.global_step == 0:
#             control.should_evaluate = True
# trainer.add_callback(EvaluateFirstStepCallback())

In [17]:
gc.collect()
torch.cuda.empty_cache()
# torch.mps.empty_cache()

In [31]:
t5_lora_model.custom_classifier.original_module.weight.min()

tensor(-0.0312, device='cuda:0', grad_fn=<MinBackward1>)

In [22]:
trainer.train()

tensor(-3.4237e+27, device='cuda:0', grad_fn=<MinBackward1>) tensor(0.4764, device='cuda:0', grad_fn=<MaxBackward1>)
tensor(False, device='cuda:0')
logits tensor([[-4.8972e-02, -1.8654e-02,  5.1627e-02, -3.4237e+27,  2.9514e-01,
          5.7991e-02],
        [-7.1986e-02, -1.7042e-02, -6.7814e-02, -3.4237e+27,  2.5285e-01,
         -2.4287e-02],
        [ 6.9416e-02, -1.9611e-02,  1.1845e-01, -3.4237e+27,  4.4818e-02,
          1.4938e-01],
        ...,
        [ 1.7503e-01, -2.8016e-01, -2.9745e-02, -3.4237e+27,  1.6332e-01,
          2.2564e-02],
        [ 9.0682e-02, -3.5049e-02, -6.4780e-02, -3.4237e+27,  7.8264e-02,
          2.6416e-03],
        [-2.7950e-02,  3.7203e-02,  5.7097e-02, -3.4237e+27, -7.7143e-02,
         -5.1237e-02]], device='cuda:0', grad_fn=<ViewBackward0>)
labels tensor([   4,    4,    4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
           4,    4,    4,    4,    4,    4,    4,    4,    4,    4,    4,    3,
           3,    3,    3,    3,    3,   

Step,Training Loss
1,7.553949814751452e+26


TrainOutput(global_step=1, training_loss=7.553949814751454e+26, metrics={'train_runtime': 0.8156, 'train_samples_per_second': 11.034, 'train_steps_per_second': 1.226, 'total_flos': 4646632356792.0, 'train_loss': 7.553949814751454e+26, 'epoch': 1.0})

In [21]:
metrics=trainer.evaluate()
print(metrics)

tensor(-1.2319, device='cuda:0') tensor(1.2816, device='cuda:0')


tensor(-1.2217, device='cuda:0') tensor(1.3702, device='cuda:0')
tensor(-1.2800, device='cuda:0') tensor(1.4304, device='cuda:0')
tensor(-1.3456, device='cuda:0') tensor(1.3881, device='cuda:0')
tensor(-1.1950, device='cuda:0') tensor(1.1463, device='cuda:0')
tensor(-1.3821, device='cuda:0') tensor(1.3203, device='cuda:0')
tensor(-1.3316, device='cuda:0') tensor(1.2572, device='cuda:0')
tensor(-1.3085, device='cuda:0') tensor(1.2676, device='cuda:0')
tensor(-1.3280, device='cuda:0') tensor(1.2849, device='cuda:0')
tensor(-1.3682, device='cuda:0') tensor(1.2365, device='cuda:0')
tensor(-1.4654, device='cuda:0') tensor(1.2441, device='cuda:0')
tensor(-1.1798, device='cuda:0') tensor(1.2467, device='cuda:0')
tensor(-1.1554, device='cuda:0') tensor(1.2987, device='cuda:0')
tensor(-1.4104, device='cuda:0') tensor(1.5904, device='cuda:0')
tensor(-1.2398, device='cuda:0') tensor(1.2105, device='cuda:0')
tensor(-1.4310, device='cuda:0') tensor(1.4350, device='cuda:0')
tensor(-1.2382, device='c

In [22]:
# t5_lora_model.crf.modules_to_save.default.decode

---

In [23]:
training_log = pd.DataFrame(trainer.state.log_history)
display(training_log)

Unnamed: 0,loss,learning_rate,epoch,step,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss,eval_loss,eval_accuracy_metric,eval_precision_metric,eval_recall_metric,eval_f1_metric,eval_matthews_correlation,eval_confusion_matrix,eval_runtime,eval_samples_per_second,eval_steps_per_second
0,1.567752e+19,9.8e-05,0.02,1,,,,,,,,,,,,,,,
1,1.567752e+19,9.6e-05,0.04,2,,,,,,,,,,,,,,,
2,1.44597e+19,9.5e-05,0.05,3,,,,,,,,,,,,,,,
3,1.538356e+19,9.3e-05,0.07,4,,,,,,,,,,,,,,,
4,1.538356e+19,9.1e-05,0.09,5,,,,,,,,,,,,,,,
5,1.540914e+19,8.9e-05,0.11,6,,,,,,,,,,,,,,,
6,1.486564e+19,8.8e-05,0.12,7,,,,,,,,,,,,,,,
7,1.555153e+19,8.6e-05,0.14,8,,,,,,,,,,,,,,,
8,1.569241e+19,8.4e-05,0.16,9,,,,,,,,,,,,,,,
9,1.542556e+19,8.2e-05,0.18,10,,,,,,,,,,,,,,,


In [24]:
# adapter_location = '/models/testing_1'
# training_log['eval_confusion_matrix'] = training_log['eval_confusion_matrix'].apply(lambda x: x.tolist() if type(x)==np.ndarray else None)
# t5_lora_model.save_pretrained(ROOT + adapter_location)
# training_log.to_csv(ROOT + adapter_location + '/training_log.csv', index=False)
# training_log.to_parquet(ROOT + adapter_location + '/training_log.parquet')

---

In [25]:
_ds_index = 2
_ds_type = 'test'

_input_ids_test = t5_tokenizer.decode(dataset_signalp[_ds_type][_ds_index]['input_ids'][:-1])
_labels_test = torch.tensor([dataset_signalp[_ds_type][_ds_index]['labels'] + [-100]]).to(device)
_attention_mask_test = torch.tensor([dataset_signalp[_ds_type][_ds_index]['attention_mask']]).to(device)

_labels_test_decoded = [src.config.label_decoding[x] for x in _labels_test.tolist()[0][:-1]]

print('Iput IDs:\t', _input_ids_test)
print('Labels:\t\t', *_labels_test.tolist()[0])
print('Labels Decoded:\t', *_labels_test_decoded)
print('Attention Mask:\t', *_attention_mask_test.tolist()[0])
print('----')

preds = src.model_new.predict_model(
    sequence=_input_ids_test,
    tokenizer=t5_tokenizer,
    model=t5_lora_model,
    labels=_labels_test,
    attention_mask=_attention_mask_test,
    device=device,
    viterbi_decoding=use_crf,
    )

_result = src.model_new.translate_logits(
    logits=preds.logits,
    viterbi_decoding=use_crf,
    )

print('Result: \t',* _result)

Iput IDs:	 M A A V I L E R L G A L W V Q N L R G K L A L G I L P Q S H I H T S A S L E I S R K W E K K N K I V Y P P Q L P G E P R R P A E I Y H C R R
Labels:		 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -100
Labels Decoded:	 I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I
Attention Mask:	 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
----
tensor(-0.9441, device='cuda:0') tensor(1.1183, device='cuda:0')
Result: 	 M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M M


In [26]:
# torch.set_printoptions(threshold=10_000)
# t5_lora_model.custom_classifier.modules_to_save.default.weight