---
## Setup and Variables

In [1]:
# Enable IPython's autoreload extension so edits to imported modules
# (e.g. the local `src` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import gc
import copy
import random
import os

import torch
import torch.nn as nn
import pandas as pd
import numpy as np

import seaborn as sns

import evaluate

from transformers import (
    T5Tokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback
)

from src.model_new import (
    T5EncoderModelForTokenClassification
)

import src.config
import src.data
import src.model_new
import src.utils


import peft
from peft import (
    LoraConfig,
)


import sklearn.metrics

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
# Report which base checkpoint and hardware backend this run will use.
print("Base Model:\t", src.config.base_model_name)
mps_available = torch.backends.mps.is_available()
print("MPS:\t\t", mps_available)
ROOT = src.utils.get_project_root_path()
print("Path:\t\t", ROOT)

# Device preference: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif mps_available:
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device:\t {device}")

# os.environ['CUDA_LAUNCH_BLOCKING'] = "0"

# Passed to the model loader, the LoRA `modules_to_save` choice and the
# prediction helpers further down (as `viterbi_decoding`).
use_crf = False

# Seed torch, Python's `random` and NumPy with the same value.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

Base Model:	 Rostlab/prot_t5_xl_uniref50
MPS:		 True
Path:		 /Users/finnlueth/Developer/gits/prottrans-t5-signalpeptide-prediction
Using device:	 mps


---
## Create Tokenizer and Load Model

In [4]:
# Load the tokenizer published with the base checkpoint named in src.config.
tokenizer_options = {
    'do_lower_case': False,
    'use_fast': True,
    'legacy': False,
}
t5_tokenizer = T5Tokenizer.from_pretrained(
    src.config.base_model_name,
    **tokenizer_options,
)

In [5]:
# Load the pretrained checkpoint into the project's token-classification wrapper.
# The classifier parameters are not contained in the checkpoint, so they start
# out freshly initialised (see the warning printed below this cell).
t5_base_model = T5EncoderModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=src.config.base_model_name,
    device_map='auto',
    load_in_8bit=False,
    # One output class per entry in the project's label-decoding table.
    custom_num_labels=len(src.config.label_decoding),
    custom_dropout_rate=0.1,
    # Same flag that later selects the LoRA `modules_to_save` list and Viterbi decoding.
    use_crf=use_crf
    )

Some weights of T5EncoderModelForTokenClassification were not initialized from the model checkpoint at Rostlab/prot_t5_xl_uniref50 and are newly initialized: ['custom_classifier.weight', 'custom_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Re-draw the classification head's weight *in place* rather than swapping in
# the weight of a throw-away nn.Linear. The existing Parameter keeps its
# device, dtype and shape (the model was loaded with device_map='auto', so it
# is not guaranteed to live on the CPU in float32), and any object that already
# holds a reference to the Parameter stays valid.
#
# uniform(-1/sqrt(fan_in), 1/sqrt(fan_in)) is the distribution nn.Linear uses
# for its weight by default, so the initialisation scheme itself is unchanged.
_classifier_init_bound = t5_base_model.config.hidden_size ** -0.5
nn.init.uniform_(
    t5_base_model.custom_classifier.weight,
    -_classifier_init_bound,
    _classifier_init_bound,
)
if use_crf:
    # The CRF layer is only present/used when the flag is set.
    t5_base_model.crf.reset_parameters()

---
## Apply LoRA

In [7]:
# Modules listed here are handed to PEFT via `modules_to_save`:
# always the classification head, plus the CRF layer when it is enabled.
modules_to_save = ['custom_classifier']
if use_crf:
    modules_to_save.append('crf')

# LoRA adapters are attached to the modules named q, k, v and o.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=['q', 'k', 'v', 'o'],
    modules_to_save=modules_to_save,
    inference_mode=False,
)

t5_lora_model = peft.get_peft_model(t5_base_model, lora_config)
t5_lora_model.print_trainable_parameters()

trainable params: 3,944,460 || all params: 1,212,086,284 || trainable%: 0.32542732741623864


In [8]:
# Sanity check: (min, max) of the classifier weight stored under
# `modules_to_save.default` in the PEFT-wrapped model.
_classifier_weight = t5_lora_model.base_model.custom_classifier.modules_to_save.default.weight
_classifier_weight.min(), _classifier_weight.max()

(tensor(-0.0312, grad_fn=<MinBackward1>),
 tensor(0.0312, grad_fn=<MaxBackward1>))

In [9]:
# [x[0] for x in t5_base_model.crf.named_parameters()]
# t5_lora_model.crf.transitions

In [10]:
# [x for x in t5_lora_model.named_parameters() if 'crf' in x[0]]

---
## Load Data, Split into Dataset, and Tokenize Sequences

In [11]:
# Which raw file to read and which annotation column to train on.
FASTA_FILENAME = '5_SignalP_5.0_Training_set.fasta'
# FASTA_FILENAME = '5_SignalP_5.0_Training_set_testing.fasta'
annotations_name = 'Label' # Choose Type or Label

# Parse the raw FASTA file, then run the project's processing step on it.
fasta_path = f"{ROOT}/data/raw/{FASTA_FILENAME}"
df_data = src.data.process(src.data.parse_file(fasta_path))

# Build the tokenized dataset splits; the label encoder is chosen to match
# the selected annotation column.
dataset_signalp = src.model_new.create_datasets(
    data=df_data,
    tokenizer=t5_tokenizer,
    splits=src.config.splits,
    annotations_name=annotations_name,
    encoder=src.config.select_encodings[annotations_name],
    # dataset_size=src.config.dataset_size,
    dataset_size=3,
)

# The intermediate object is not needed once the datasets exist.
del df_data

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [12]:
# Overview of all splits.
display(dataset_signalp)

# Print the fields of a single validation example, one per line.
ds_index = 0
valid_example = dataset_signalp['valid'][ds_index]
print(valid_example['input_ids'])
print(valid_example['labels'])
print(valid_example['attention_mask'])

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
})

[19, 9, 13, 12, 10, 10, 12, 4, 15, 9, 6, 11, 10, 3, 15, 14, 11, 16, 14, 9, 10, 4, 4, 9, 4, 6, 11, 4, 12, 10, 12, 18, 5, 9, 16, 6, 17, 16, 9, 5, 7, 18, 9, 9, 14, 11, 8, 15, 12, 9, 11, 4, 17, 11, 4, 4, 9, 10, 17, 13, 7, 11, 11, 5, 9, 12, 5, 21, 10, 4, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


---

In [13]:
# import torch
# from torchcrf import CRF
# num_tags = 5
# model = CRF(num_tags=num_tags, batch_first=True)

In [14]:
# batch_size = 2
# seq_length = 4
# emissions = torch.randn(batch_size, seq_length, num_tags)
# tags = torch.tensor([
#     [1, 2, 3, 3],
#     [2, 2, 2, 3]
#     ], dtype=torch.long)  # (seq_length, batch_size)

# display(emissions, emissions.shape)
# display(tags, tags.shape)
# display(model(emissions, tags))
# display(torch.Tensor(model.decode(emissions)).shape)
# display(model.decode(emissions))

---
## Training Loop
Reference (PEFT token-classification LoRA guide): https://huggingface.co/docs/peft/task_guides/token-classification-lora

In [15]:
# Collator that batches the tokenized examples for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer=t5_tokenizer)

training_args = TrainingArguments(
    output_dir='./checkpoints',
    seed=SEED,
    # Hyper-parameters taken from src.config
    learning_rate=src.config.lr,
    num_train_epochs=src.config.num_epochs,
    per_device_train_batch_size=src.config.batch_size,
    per_device_eval_batch_size=src.config.batch_size,
    logging_steps=src.config.logging_steps,
    # Keep every dataset column and declare 'labels' as the label column.
    remove_unused_columns=False,
    label_names=['labels'],
    # Currently disabled options:
    # save_strategy="steps",
    # save_steps=src.config.save_steps,
    # evaluation_strategy="steps",
    # eval_steps=src.config.eval_steps,
    # gradient_accumulation_steps=accum,
    # load_best_model_at_end=True,
    # save_total_limit=5,
    # fp16=True,
    # deepspeed=deepspeed_config,
    # debug="underflow_overflow",
)

# Train on the 'train' split, evaluate on 'valid' with the project's metric function.
trainer = Trainer(
    model=t5_lora_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_signalp['train'],
    eval_dataset=dataset_signalp['valid'],
    compute_metrics=src.model_new.compute_metrics,
)

# Disabled: callback that requests an evaluation when global_step is still 0.
# class EvaluateFirstStepCallback(TrainerCallback):
#     def on_step_begin(self, args, state, control, **kwargs):
#         if state.global_step == 0:
#             control.should_evaluate = True
# trainer.add_callback(EvaluateFirstStepCallback())

In [16]:
# t5_lora_model.crf

In [17]:
# Free unreferenced Python objects first so their tensors can be released,
# then return cached accelerator memory for whichever backend is present.
gc.collect()
torch.cuda.empty_cache()
# The notebook also targets Apple MPS; clear that cache as well when available.
# `hasattr` guards against torch builds that predate the `torch.mps` module.
if torch.backends.mps.is_available() and hasattr(torch, 'mps'):
    torch.mps.empty_cache()

In [18]:
# Run fine-tuning with the Trainer built above; the returned TrainOutput is
# shown as the cell result.
trainer.train()

Python(33060) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


  0%|          | 0/1 [00:00<?, ?it/s]

{'loss': 1.7725, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 56.2057, 'train_samples_per_second': 0.16, 'train_steps_per_second': 0.018, 'train_loss': 1.7725309133529663, 'epoch': 1.0}


TrainOutput(global_step=1, training_loss=1.7725309133529663, metrics={'train_runtime': 56.2057, 'train_samples_per_second': 0.16, 'train_steps_per_second': 0.018, 'train_loss': 1.7725309133529663, 'epoch': 1.0})

In [None]:
# Evaluate on the Trainer's eval dataset (the 'valid' split) and print the
# resulting metrics.
metrics=trainer.evaluate()
print(metrics)

In [None]:
# t5_lora_model.crf.modules_to_save.default.decode

---

In [None]:
# Collect everything the Trainer logged so far into a DataFrame for inspection.
training_log = pd.DataFrame(trainer.state.log_history)
display(training_log)

In [None]:
# adapter_location = '/models/testing_1'
# training_log['eval_confusion_matrix'] = training_log['eval_confusion_matrix'].apply(lambda x: x.tolist() if type(x)==np.ndarray else None)
# t5_lora_model.save_pretrained(ROOT + adapter_location)
# training_log.to_csv(ROOT + adapter_location + '/training_log.csv', index=False)
# training_log.to_parquet(ROOT + adapter_location + '/training_log.parquet')

---

In [None]:
# Run the model on one example and print the prediction next to the gold labels.
_ds_index = 2
_ds_type = 'test'
_example = dataset_signalp[_ds_type][_ds_index]

# NOTE: despite its name, `_input_ids_test` holds the *decoded* sequence text.
# The last token id is dropped before decoding (presumably the end-of-sequence
# token appended by the tokenizer — confirm against the tokenizer config).
_input_ids_test = t5_tokenizer.decode(_example['input_ids'][:-1])
# One trailing -100 is appended to the labels, presumably so they line up with
# the token ids including that final token, with -100 acting as the
# "ignore" label — TODO confirm against the model's loss.
_labels_test = torch.tensor([_example['labels'] + [-100]]).to(device)
_attention_mask_test = torch.tensor([_example['attention_mask']]).to(device)

# Human-readable labels for the real positions only (without the -100 padding).
_labels_test_decoded = [src.config.label_decoding[x] for x in _example['labels']]

print('Input IDs:\t', _input_ids_test)
print('Labels:\t\t', *_labels_test.tolist()[0])
print('Labels Decoded:\t', *_labels_test_decoded)
print('Attention Mask:\t', *_attention_mask_test.tolist()[0])
print('----')

preds = src.model_new.predict_model(
    sequence=_input_ids_test,
    tokenizer=t5_tokenizer,
    model=t5_lora_model,
    labels=_labels_test,
    attention_mask=_attention_mask_test,
    device=device,
    viterbi_decoding=use_crf,
    )

# Convert the raw model output into label symbols using the same decoding mode.
_result = src.model_new.translate_logits(
    logits=preds.logits,
    viterbi_decoding=use_crf,
    )

print('Result: \t', *_result)

In [None]:
# torch.set_printoptions(threshold=10_000)
# t5_lora_model.custom_classifier.modules_to_save.default.weight