In [1]:
# Reload edited modules automatically before each cell runs, so changes to the
# project's `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import pickletools

import re
import gc
import os
import math
import copy
import types
import yaml
import sys

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn import (
    CrossEntropyLoss,
    MSELoss
)
from torch.utils.data import DataLoader

import evaluate

import transformers
from transformers import (
    AutoModelForTokenClassification,
    AutoConfig,
    T5EncoderModel,
    T5Tokenizer,
    T5PreTrainedModel,
    T5ForConditionalGeneration,
    pipeline,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    set_seed,
    EvalPrediction,
    )
from transformers.modeling_outputs import TokenClassifierOutput

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    get_peft_config,
    PeftModel,
    PeftConfig,
    prepare_model_for_kbit_training
    )

from datasets import Dataset

import src.config as config
import src.config
import src.data
import src.model_new

from src.model_working import (
    get_prottrans_tokenizer_model,
    df_to_dataset,
    inject_linear_layer,
    )
from src.utils import get_project_root_path

In [3]:
# Report the run configuration and choose the compute device:
# CUDA first, then Apple MPS, and CPU as the last resort.
base_model_name = config.base_model_name
print("Base Model:\t", base_model_name)
print("MPS:\t\t", torch.backends.mps.is_available())

ROOT = get_project_root_path()
print("Path:\t\t", ROOT)

if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device:\t {device}")

Base Model:	 Rostlab/prot_t5_xl_uniref50
MPS:		 False
Path:		 /home/ec2-user/developer/prottrans-t5-signalpeptide-prediction
Using device:	 cuda:0


In [4]:
# Load the tokenizer for the base checkpoint named in src.config.
# NOTE(review): `use_fast` is normally an AutoTokenizer switch; T5Tokenizer is
# already a concrete tokenizer class, so confirm the flag has any effect here.
t5_tokenizer = T5Tokenizer.from_pretrained(
    src.config.base_model_name,
    do_lower_case=False,
    use_fast=True,
    legacy=False,
)

In [5]:
# Load the encoder-only T5 from the same checkpoint as the tokenizer.
# device_map='auto' leaves weight placement to the loader; 8-bit loading is off.
t5_base_model = T5EncoderModel.from_pretrained(
    src.config.base_model_name,
    load_in_8bit=False,
    device_map='auto',
)

In [6]:
# Disabled alternative: attach the adapter directly to the base model.
# `adapter_location` is only assigned in the following cell, so this line
# cannot simply be uncommented in place.
# t5_base_model.load_adapter(ROOT+adapter_location)

In [7]:
# Adapter checkpoint directory, appended to the project root path.
adapter_location = '/models/linear_model_v5'

# Read the adapter's stored PEFT configuration from that directory.
model_lora_config = PeftConfig.from_pretrained(
    pretrained_model_name_or_path=ROOT + adapter_location,
)

In [8]:
# Wrap the base encoder with the saved adapter; is_trainable=False requests an
# inference-only load.
t5_lora_model = PeftModel.from_pretrained(
    model=t5_base_model,
    model_id=ROOT + adapter_location,
    is_trainable=False,
)

In [9]:
# Attach the project's linear head, sized to the number of entries in
# src.config.label_decoding.
# Note: the result is assigned back to the same name, so re-running this cell
# passes the already-modified model into inject_linear_layer again.
t5_lora_model = inject_linear_layer(
    t5_lora_model=t5_lora_model,
    dropout_rate=src.config.dropout_rate,
    num_labels=len(src.config.label_decoding),
)

In [10]:
# Show the freshly injected head's parameters (name, tensor) for later comparison.
list(t5_lora_model.custom_classifier.named_parameters())

[('weight',
  Parameter containing:
  tensor([[-0.0238, -0.0091, -0.0253,  ..., -0.0252,  0.0124,  0.0081],
          [ 0.0102, -0.0186,  0.0241,  ...,  0.0279,  0.0276,  0.0046],
          [ 0.0310,  0.0180, -0.0218,  ...,  0.0090,  0.0036,  0.0001],
          [ 0.0272,  0.0238,  0.0047,  ...,  0.0258, -0.0070,  0.0241],
          [-0.0284, -0.0256, -0.0073,  ..., -0.0220, -0.0227,  0.0252],
          [-0.0286,  0.0217,  0.0073,  ...,  0.0307,  0.0256, -0.0114]],
         requires_grad=True)),
 ('bias',
  Parameter containing:
  tensor([ 0.0103,  0.0268, -0.0011,  0.0031, -0.0128,  0.0100],
         requires_grad=True))]

In [11]:
# Load the adapter a second time now that `custom_classifier` exists, presumably
# so the head stored with the checkpoint can be restored into it.
# NOTE(review): `t5_lora_model` is already a PeftModel here, so this nests one
# PeftModel inside another. Confirm the checkpoint's parameter names still line
# up with the nested module paths.
t5_lora_model = PeftModel.from_pretrained(
    model=t5_lora_model,
    model_id=ROOT + adapter_location,
    is_trainable=False,
)

# Sanity check: after the load, the wrapped head exposes both the original
# (randomly initialised) weights and the copy that should hold the checkpoint's
# weights. If the two are still equal, nothing was restored into the head.
_head_params = dict(t5_lora_model.custom_classifier.named_parameters())
_head_initial = _head_params.get("original_module.weight")
_head_restored = _head_params.get("modules_to_save.default.weight")
if (
    _head_initial is not None
    and _head_restored is not None
    and torch.equal(_head_initial, _head_restored)
):
    print(
        "WARNING: custom_classifier weights are unchanged after loading the adapter; "
        "the trained head may not have been restored from the checkpoint."
    )

In [14]:
# Show the head's parameters again, after the second adapter load.
list(t5_lora_model.custom_classifier.named_parameters())

[('original_module.weight',
  Parameter containing:
  tensor([[-0.0238, -0.0091, -0.0253,  ..., -0.0252,  0.0124,  0.0081],
          [ 0.0102, -0.0186,  0.0241,  ...,  0.0279,  0.0276,  0.0046],
          [ 0.0310,  0.0180, -0.0218,  ...,  0.0090,  0.0036,  0.0001],
          [ 0.0272,  0.0238,  0.0047,  ...,  0.0258, -0.0070,  0.0241],
          [-0.0284, -0.0256, -0.0073,  ..., -0.0220, -0.0227,  0.0252],
          [-0.0286,  0.0217,  0.0073,  ...,  0.0307,  0.0256, -0.0114]],
         requires_grad=True)),
 ('original_module.bias',
  Parameter containing:
  tensor([ 0.0103,  0.0268, -0.0011,  0.0031, -0.0128,  0.0100],
         requires_grad=True)),
 ('modules_to_save.default.weight',
  Parameter containing:
  tensor([[-0.0238, -0.0091, -0.0253,  ..., -0.0252,  0.0124,  0.0081],
          [ 0.0102, -0.0186,  0.0241,  ...,  0.0279,  0.0276,  0.0046],
          [ 0.0310,  0.0180, -0.0218,  ...,  0.0090,  0.0036,  0.0001],
          [ 0.0272,  0.0238,  0.0047,  ...,  0.0258, -0.0070, 

In [None]:
# Build the SignalP dataset splits from the raw FASTA file.
annotations_name = 'Label' # Choose Type or Label
FASTA_FILENAME = '5_SignalP_5.0_Training_set.fasta'

# Parse the file, then run the project's processing step on the parsed records.
df_data = src.data.process(
    src.data.parse_file(ROOT + '/data/raw/' + FASTA_FILENAME)
)

# The encoder is selected by the chosen annotation column.
dataset_signalp = src.model_new.create_datasets(
    data=df_data,
    tokenizer=t5_tokenizer,
    annotations_name=annotations_name,
    encoder=src.config.select_encodings[annotations_name],
    splits=src.config.splits,
    dataset_size=src.config.dataset_size,
)

# The frame is only an intermediate; drop the name once the datasets exist.
del df_data

---

In [None]:
def predict_model(
    sequence: str,
    tokenizer: "T5Tokenizer",
    model,
    max_length: int = 1024,
    target_device=None,
):
    """Run one gradient-free forward pass of ``model`` on a single sequence.

    Args:
        sequence: Text to encode.
            NOTE(review): ProtT5-style tokenizers usually expect residues
            separated by spaces; confirm callers pass that format.
        tokenizer: Object whose ``encode`` turns ``sequence`` into a tensor of
            input ids (called with ``return_tensors="pt"``).
        model: Callable invoked with the encoded ids. Its train/eval mode is
            left untouched, so call ``model.eval()`` beforehand if dropout
            must be disabled.
        max_length: Truncation limit handed to the tokenizer. Defaults to the
            previously hard-coded value of 1024.
        target_device: Device the ids are moved to before the call. ``None``
            falls back to the notebook-level ``device`` variable.

    Returns:
        Whatever ``model`` returns for the encoded ids.
    """
    if target_device is None:
        # Keep the original behaviour: use the device chosen at notebook start.
        target_device = device

    input_ids = tokenizer.encode(
        sequence,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=max_length,
    )
    with torch.no_grad():
        return model(input_ids.to(target_device))

def translate_logits(logits):
    """Map the first sequence in a batch of logits to its label names.

    The highest-scoring class along the last axis is taken for every position,
    and only the first element of the batch is translated through
    ``src.config.label_decoding``.
    """
    first_sequence_ids = logits.argmax(-1).tolist()[0]
    decoding = src.config.label_decoding
    return [decoding[class_id] for class_id in first_sequence_ids]

In [None]:
# Pick one example from the chosen split and show its sequence next to its labels.
_ds_type = 'test'
_ds_index = 3290
_sample = dataset_signalp[_ds_type][_ds_index]

# Decode the ids back to text because predict_model takes a string.
# NOTE(review): decode() is called without skip_special_tokens, so the text may
# contain special tokens that get re-encoded later; confirm that is intended.
_inids_test = t5_tokenizer.decode(_sample['input_ids'])
_labels_test = _sample['labels']
_labels_test_decoded = [src.config.label_decoding[label_id] for label_id in _labels_test]

print(_inids_test, _labels_test, _labels_test_decoded, sep="\n")

In [None]:
# Show the head's parameters once more, right before running a prediction.
list(t5_lora_model.custom_classifier.named_parameters())

In [None]:
# Run the adapter-wrapped model on the decoded sample sequence.
preds = predict_model(
    sequence=_inids_test,
    tokenizer=t5_tokenizer,
    model=t5_lora_model,
)

In [None]:
# Bring the logits to the CPU as a NumPy array, then translate them to label names.
_logits_np = preds.logits.cpu().numpy()
_res = translate_logits(_logits_np)
print(_res)