In [1]:
# Enable IPython's autoreload extension in mode 2 so that edited project
# modules are re-imported before each cell runs (no kernel restart needed).
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import pickletools

import re
import gc
import os
import math
import copy
import types
import yaml
import sys

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn import (
    CrossEntropyLoss,
    MSELoss
)
from torch.utils.data import DataLoader

import evaluate

import transformers
from transformers import (
    AutoModelForTokenClassification,
    AutoConfig,
    T5EncoderModel,
    T5Tokenizer,
    T5PreTrainedModel,
    T5ForConditionalGeneration,
    pipeline,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    set_seed,
    EvalPrediction,
    )
from transformers.modeling_outputs import TokenClassifierOutput

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    get_peft_config,
    PeftModel,
    PeftConfig,
    prepare_model_for_kbit_training
    )

from datasets import Dataset

import src.config as config
import src.config
import src.data
import src.model_new

from src.model_working import (
    get_prottrans_tokenizer_model,
    df_to_dataset,
    inject_linear_layer,
    )
from src.utils import get_project_root_path

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
# Report the checkpoint name, MPS availability and the project root, then pick
# the compute device with the priority CUDA > Apple MPS > CPU.
base_model_name = config.base_model_name
print("Base Model:\t", base_model_name)

print("MPS:\t\t", torch.backends.mps.is_available())

ROOT = get_project_root_path()
print("Path:\t\t", ROOT)

if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device:\t {device}")

Base Model:	 Rostlab/prot_t5_xl_uniref50
MPS:		 True
Path:		 /Users/finnlueth/Developer/gits/prottrans-t5-signalpeptide-prediction
Using device:	 mps


In [4]:
# Load the tokenizer that belongs to the configured base checkpoint.
# Options are collected first so they can be read (and tuned) in one place.
_tokenizer_options = dict(
    do_lower_case=False,
    use_fast=True,
    legacy=False,
)
t5_tokenizer = T5Tokenizer.from_pretrained(
    src.config.base_model_name,
    **_tokenizer_options,
)

In [5]:
# Load the encoder-only T5 checkpoint in full precision (8-bit loading is off)
# and let `device_map='auto'` decide where the weights are placed.
_base_model_options = dict(
    device_map='auto',
    load_in_8bit=False,
)
t5_base_model = T5EncoderModel.from_pretrained(
    src.config.base_model_name,
    **_base_model_options,
)

In [6]:
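# Disabled alternative: load the adapter straight into the base model.
# The adapter is attached with PeftModel.from_pretrained further down instead.
# t5_base_model.load_adapter(ROOT+adapter_location)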

In [7]:
# Attach the classification head pieces as attributes of the encoder itself.
# The dropout is registered before the classifier, one output unit per label.
t5_base_model.custom_dropout = nn.Dropout(p=src.config.dropout_rate)

t5_base_model.num_labels = len(src.config.label_decoding)
t5_base_model.custom_classifier = nn.Linear(
    t5_base_model.config.hidden_size,
    t5_base_model.num_labels,
)

In [8]:
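# Disabled: helper-based head injection. The dropout and classifier are
# attached manually in the previous cell instead.
# t5_lora_model = inject_linear_layer(
#     t5_lora_model=t5_base_model,
#     num_labels=len(src.config.label_decoding),
#     dropout_rate=src.config.dropout_rate
#     )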

In [9]:
# Location of the saved adapter, expressed relative to the project root
# (it is concatenated onto ROOT where the adapter is loaded).
adapter_location = '/models/testing_4'
# model_lora_config = PeftConfig.from_pretrained(ROOT + adapter_location)

In [10]:
# Wrap the base encoder with the saved adapter for inference only
# (is_trainable=False).
t5_lora_model = PeftModel.from_pretrained(
    t5_base_model,
    ROOT + adapter_location,
    is_trainable=False,
)

In [11]:
# Move the manually attached head modules onto the selected device.
# The second call is the cell's last expression, so its repr (the Dropout
# module) is what the notebook displays as output.
t5_lora_model.custom_classifier.to(device)
t5_lora_model.custom_dropout.to(device)

Dropout(p=0.1, inplace=False)

In [12]:
# Replace this instance's forward with src.model_working.injected_forward,
# bound as a method so it receives the model as `self`.
# NOTE(review): presumably this forward applies custom_dropout/custom_classifier
# on top of the encoder output - confirm against src/model_working.py.
t5_lora_model.forward = types.MethodType(src.model_working.injected_forward, t5_lora_model)

In [13]:
# Show every (name, parameter) pair of the classification head.
# NOTE(review): in the recorded output `modules_to_save.default.weight` is
# identical to `original_module.weight` - verify that the adapter checkpoint
# really restored the trained classifier rather than leaving the fresh init.
list(t5_lora_model.custom_classifier.named_parameters())

[('original_module.weight',
  Parameter containing:
  tensor([[ 0.0078, -0.0201,  0.0082,  ..., -0.0304,  0.0279, -0.0118],
          [ 0.0308, -0.0067, -0.0065,  ..., -0.0041, -0.0081, -0.0104],
          [ 0.0042,  0.0259, -0.0007,  ...,  0.0117,  0.0005,  0.0153],
          [ 0.0108,  0.0231,  0.0167,  ..., -0.0299,  0.0020, -0.0265],
          [-0.0034,  0.0022, -0.0172,  ..., -0.0180, -0.0132,  0.0005],
          [ 0.0190, -0.0062,  0.0291,  ...,  0.0054, -0.0081, -0.0085]],
         device='mps:0', requires_grad=True)),
 ('original_module.bias',
  Parameter containing:
  tensor([-0.0253,  0.0306, -0.0126,  0.0020, -0.0304, -0.0175], device='mps:0',
         requires_grad=True)),
 ('modules_to_save.default.weight',
  Parameter containing:
  tensor([[ 0.0078, -0.0201,  0.0082,  ..., -0.0304,  0.0279, -0.0118],
          [ 0.0308, -0.0067, -0.0065,  ..., -0.0041, -0.0081, -0.0104],
          [ 0.0042,  0.0259, -0.0007,  ...,  0.0117,  0.0005,  0.0153],
          [ 0.0108,  0.0231,  

---

In [14]:
FASTA_FILENAME = '5_SignalP_5.0_Training_set.fasta'
annotations_name = 'Label' # Choose Type or Label

# Parse and preprocess the raw FASTA file found under <ROOT>/data/raw/.
_fasta_path = ROOT + '/data/raw/' + FASTA_FILENAME
df_data = src.data.process(src.data.parse_file(_fasta_path))

# Build the dataset splits, encoding the chosen annotation column with the
# encoder registered for it in the project config.
dataset_signalp = src.model_new.create_datasets(
    data=df_data,
    tokenizer=t5_tokenizer,
    splits=src.config.splits,
    dataset_size=src.config.dataset_size,
    annotations_name=annotations_name,
    encoder=src.config.select_encodings[annotations_name],
)

# The intermediate table is not used again once the datasets exist.
del df_data

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


---

In [15]:
def predict_model(sequence: str, tokenizer: "T5Tokenizer", model):
    """Tokenize one sequence and run it through ``model`` in inference mode.

    The input tensor is moved to the device of the model's own parameters
    instead of relying on a notebook-global ``device`` variable. When ``model``
    is an ``nn.Module`` it is switched to eval mode for the call, so dropout
    layers cannot randomise the prediction. The previous train/eval flag of
    every submodule is restored afterwards.

    Args:
        sequence: Text handed unchanged to ``tokenizer.encode``.
        tokenizer: Object exposing ``encode(text, padding=..., truncation=...,
            return_tensors=..., max_length=...)`` that returns a tensor.
        model: Callable taking the encoded tensor; usually an ``nn.Module``.

    Returns:
        Whatever ``model`` returns for the encoded sequence, computed with
        gradient tracking disabled.
    """
    token_ids = tokenizer.encode(
        sequence,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=1024,
    )

    is_module = isinstance(model, torch.nn.Module)

    # Follow the model's own placement. Parameters on the "meta" device carry
    # no real storage (offloaded weights), so they are ignored here.
    target_device = None
    if is_module:
        target_device = next(
            (p.device for p in model.parameters() if p.device.type != "meta"),
            None,
        )
    if target_device is not None:
        token_ids = token_ids.to(target_device)

    # Remember each submodule's mode so a partially-frozen setup is restored
    # exactly, not just the top-level flag.
    previous_modes = [(m, m.training) for m in model.modules()] if is_module else []
    if is_module:
        model.eval()
    try:
        with torch.no_grad():
            output = model(token_ids)
    finally:
        for module, was_training in previous_modes:
            module.training = was_training
    return output

def translate_logits(logits):
    """Decode the first sequence of a batch of logits into label symbols.

    Takes the argmax over the last axis, keeps only batch element 0 and maps
    each class index through ``src.config.label_decoding``.
    """
    first_sequence_ids = logits.argmax(-1).tolist()[0]
    decoded = []
    for class_id in first_sequence_ids:
        decoded.append(src.config.label_decoding[class_id])
    return decoded

In [16]:
_ds_type = 'test'
_ds_index = 2

# Fetch one example and show its decoded input, its raw label ids and the
# same labels mapped through the project's label decoding table.
_example = dataset_signalp[_ds_type][_ds_index]
_inids_test = t5_tokenizer.decode(_example['input_ids'])
_labels_test = _example['labels']
_labels_test_decoded = [src.config.label_decoding[label_id] for label_id in _labels_test]

print(_inids_test, _labels_test, _labels_test_decoded, sep="\n")

M A A V I L E R L G A L W V Q N L R G K L A L G I L P Q S H I H T S A S L E I S R K W E K K N K I V Y P P Q L P G E P R R P A E I Y H C R R</s>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I']


In [17]:
# Re-display the classification head's (name, parameter) pairs before predicting.
# NOTE(review): the recorded values of `modules_to_save.default.weight` match
# `original_module.weight` - check whether trained head weights were loaded.
list(t5_lora_model.custom_classifier.named_parameters())

[('original_module.weight',
  Parameter containing:
  tensor([[ 0.0078, -0.0201,  0.0082,  ..., -0.0304,  0.0279, -0.0118],
          [ 0.0308, -0.0067, -0.0065,  ..., -0.0041, -0.0081, -0.0104],
          [ 0.0042,  0.0259, -0.0007,  ...,  0.0117,  0.0005,  0.0153],
          [ 0.0108,  0.0231,  0.0167,  ..., -0.0299,  0.0020, -0.0265],
          [-0.0034,  0.0022, -0.0172,  ..., -0.0180, -0.0132,  0.0005],
          [ 0.0190, -0.0062,  0.0291,  ...,  0.0054, -0.0081, -0.0085]],
         device='mps:0', requires_grad=True)),
 ('original_module.bias',
  Parameter containing:
  tensor([-0.0253,  0.0306, -0.0126,  0.0020, -0.0304, -0.0175], device='mps:0',
         requires_grad=True)),
 ('modules_to_save.default.weight',
  Parameter containing:
  tensor([[ 0.0078, -0.0201,  0.0082,  ..., -0.0304,  0.0279, -0.0118],
          [ 0.0308, -0.0067, -0.0065,  ..., -0.0041, -0.0081, -0.0104],
          [ 0.0042,  0.0259, -0.0007,  ...,  0.0117,  0.0005,  0.0153],
          [ 0.0108,  0.0231,  

In [18]:
# Run the decoded example text (which still ends in the `</s>` marker, as
# printed earlier) back through the tokenizer and the adapted model.
preds = predict_model(_inids_test, t5_tokenizer, t5_lora_model)



In [19]:
# Bring the logits back to the CPU as a NumPy array and decode them to labels.
# NOTE(review): the recorded predictions do not match the reference labels
# printed for this example - confirm the classifier weights were restored.
_res = translate_logits(preds.logits.cpu().numpy())
print(_res)

['I', 'L', 'L', 'O', 'O', 'O', 'T', 'T', 'M', 'M', 'M', 'I', 'O', 'O', 'T', 'L', 'M', 'T', 'M', 'M', 'M', 'O', 'M', 'M', 'O', 'O', 'M', 'M', 'O', 'M', 'O', 'T', 'T', 'I', 'L', 'S', 'I', 'T', 'S', 'I', 'T', 'T', 'M', 'S', 'T', 'T', 'O', 'T', 'L', 'O', 'L', 'O', 'O', 'S', 'L', 'O', 'O', 'S', 'O', 'T', 'T', 'O', 'O', 'T', 'O', 'L', 'L', 'L', 'T', 'O', 'L']
