---
## Setup and Variables

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import random
import numpy as np

import torch
import torch.nn as nn

import gc
import time

import pandas as pd

from transformers import (
    T5EncoderModel,
    T5Tokenizer,
    T5Config,
    modeling_outputs,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

import src.config as config

from peft import (
    LoraConfig,
    TaskType
)

import peft

from datasets import Dataset, DatasetDict

from src.model_new import (
    T5EncoderModelForSequenceClassification,
)
import src.config
import src.data
import src.model_new

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [3]:
print("Base Model:\t", src.config.base_model_name)
print("MPS:\t\t", torch.backends.mps.is_available())
ROOT = src.utils.get_project_root_path()
print("Path:\t\t", ROOT)
device = torch.device('cuda:0' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))
print(f"Using device:\t {device}")

torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

Base Model:	 Rostlab/prot_t5_xl_uniref50
MPS:		 True
Path:		 /Users/finnlueth/Developer/gits/prottrans-t5-signalpeptide-prediction
Using device:	 mps


---
## Create Tokenizer and Load Model

In [4]:
t5_tokenizer = T5Tokenizer.from_pretrained(
        pretrained_model_name_or_path=src.config.base_model_name,
        do_lower_case=False,
        use_fast=True,
        legacy=False
    )

In [5]:
t5_base_model = T5EncoderModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=src.config.base_model_name,
    device_map='auto',
    load_in_8bit=False,
    custom_num_labels=len(src.config.type_decoding),
    custom_dropout_rate=0.1,
    )

Some weights of T5EncoderModelForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_t5_xl_uniref50 and are newly initialized: ['custom_classifier.weight', 'custom_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
t5_base_model.custom_classifier.weight = nn.Linear(
        in_features=t5_base_model.config.hidden_size,
        out_features=t5_base_model.custom_num_labels
    ).weight

---
## Apply LoRA

In [7]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=['q', 'k', 'v', 'o'],
    bias="none",
)
t5_lora_model = peft.get_peft_model(t5_base_model, lora_config)
t5_lora_model.print_trainable_parameters()

trainable params: 3,940,360 || all params: 1,212,082,184 || trainable%: 0.3250901673182253


---
## Load Data, Split into Dataset, and Tokenize Sequences

In [8]:
FASTA_FILENAME = '5_SignalP_5.0_Training_set.fasta'
# FASTA_FILENAME = '5_SignalP_5.0_Training_set_testing.fasta'
annotations_name = 'Type' # Choose Type or Label

df_data = src.data.process(src.data.parse_file(ROOT + '/data/raw/' + FASTA_FILENAME))

dataset_signalp = src.model_new.create_datasets(
    splits=src.config.splits,
    tokenizer=t5_tokenizer,
    data=df_data,
    annotations_name=annotations_name,
    dataset_size=src.config.dataset_size,
    encoder=src.config.select_encodings[annotations_name],
    )

del df_data

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [9]:
display(dataset_signalp)
print(dataset_signalp['valid'][0]['input_ids'])
print(dataset_signalp['valid'][0]['labels'])
print(dataset_signalp['valid'][0]['attention_mask'])

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12462
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4147
    })
})

[19, 19, 10, 17, 9, 6, 4, 10, 15, 10, 12, 5, 6, 5, 6, 7, 7, 5, 5, 10, 6, 10, 10, 10, 3, 12, 10, 12, 9, 20, 20, 3, 4, 10, 10, 10, 10, 19, 4, 10, 7, 13, 12, 19, 13, 22, 5, 17, 5, 4, 6, 5, 17, 7, 5, 17, 18, 15, 13, 17, 16, 9, 9, 9, 3, 22, 10, 4, 4, 10, 1]
0
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


---

In [10]:
t5_lora_model.to(device)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): T5EncoderModelForSequenceClassification(
      (shared): Embedding(128, 1024)
      (encoder): T5Stack(
        (embed_tokens): Embedding(128, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear(
                    in_features=1024, out_features=4096, bias=False
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=4096, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
             

In [18]:
    with torch.no_grad():
    embds_2 = t5_lora_model(
        input_ids=torch.tensor(
            [[7, 4, 7, 5, 7, 7, 7, 7, 7],[3, 4, 7, 5, 7, 5, 7, 7, 7]]).to('mps'),
        attention_mask=torch.tensor(
            [[1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1]]).to('mps'),
        labels=torch.tensor([[1],[2]]).to('mps'),
    )

torch.Size([2, 1024])
tensor([[ 0.0594,  0.1111,  0.0456,  ..., -0.1198,  0.1975, -0.0226],
        [ 0.1154,  0.1476,  0.1941,  ..., -0.1042,  0.1343,  0.0356]],
       device='mps:0')
torch.Size([2, 4])
tensor([[ 0.0721, -0.0130,  0.0858,  0.0663],
        [ 0.0996,  0.1204,  0.0068,  0.0828]], device='mps:0')
loss tensor(1.4553, device='mps:0')


In [22]:
embds_2.logits.argmax(dim=-1)

tensor([2, 1], device='mps:0')

In [None]:
dataset_signalp['train'][0]

In [None]:
type(t5_lora_model)

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=t5_tokenizer)

training_args = TrainingArguments(
    output_dir='./checkpoints',
    learning_rate=config.lr,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    num_train_epochs=config.num_epochs,
    logging_steps=config.logging_steps,
    # save_strategy="steps",
    # save_steps=config.save_steps,
    # evaluation_strategy="steps",
    # eval_steps=1,
    # load_best_model_at_end=True,
    # save_total_limit=5,
    seed=42,
    # fp16=True,
    # deepspeed=deepspeed_config,
    remove_unused_columns=False
)

trainer = Trainer(
    model=t5_lora_model,
    args=training_args,
    train_dataset=dataset_signalp['train'],
    eval_dataset=dataset_signalp['valid'],
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

In [None]:
gc.collect()
torch.cuda.empty_cache()
torch.mps.empty_cache()

In [None]:
trainer.train()