In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import gc
import copy

import torch
import torch.nn as nn
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import evaluate

from transformers import (
    T5Tokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

from src.model_new import (
    T5EncoderModelForTokenClassification,
    T5EncoderModelForSequenceClassification,
    create_datasets,
)
import src.config
import src.data
import src.model_new


import peft
from peft import (
    LoraConfig,
    PeftModel
)

import random


  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
# --- Notebook configuration --------------------------------------------------
# NOTE(review): `src.utils` is not imported explicitly in the import cell; it
# presumably resolves because another `src` submodule imports it — confirm.
ROOT = src.utils.get_project_root_path()

# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Forwarded to the expert model (`use_crf`) and to prediction/decoding
# (`viterbi_decoding`) further down in this notebook.
USE_CRF = True

# Key used to pick one entry out of the per-sequence-type dataset dictionary.
EXPERT = 'ALL'
# (sic) The misspelled name is kept because later cells reference it.
MODEL_VERRSION = src.config.model_version

# Seed every RNG from the single SEED constant so that changing SEED actually
# changes the seeds (the literals were previously hard-coded three times).
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

print("Base Model:\t", src.config.base_model_name)
print("MPS:\t\t", torch.backends.mps.is_available())
print("Path:\t\t", ROOT)
print(f"Using device:\t {device}")

# torch.set_printoptions(threshold=10_000)

Base Model:	 Rostlab/prot_t5_xl_uniref50
MPS:		 True
Path:		 /Users/finnlueth/Developer/gits/prottrans-t5-signalpeptide-prediction
Using device:	 mps


In [4]:
# Load the tokenizer that belongs to the base checkpoint named in src.config.
t5_tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path=src.config.base_model_name,
    do_lower_case=False,
    use_fast=True,
    legacy=False,
)

In [5]:
# Source FASTA file, resolved below against the project root.
FASTA_FILENAME = '5_SignalP_5.0_Training_set.fasta'
# FASTA_FILENAME = '5_SignalP_5.0_Training_set_testing.fasta'
annotations_name = ['Label', 'Type']  # Choose Type or Label

# Parse the raw file and run the project's preprocessing on the result.
df_data = src.data.process(
    src.data.parse_file(f'{ROOT}/data/raw/{FASTA_FILENAME}')
)

# Build one dataset (with its splits) per sequence type listed in the config,
# keyed by that sequence type.
dataset_signalp_type_splits = {}
for sequence_type in src.config.select_encoding_type:
    dataset_signalp_type_splits[sequence_type] = src.model_new.create_datasets(
        splits=src.config.splits,
        tokenizer=t5_tokenizer,
        data=df_data,
        annotations_name=annotations_name,
        dataset_size=src.config.dataset_size,
        sequence_type=sequence_type,
    )

# The parsed frame is no longer needed once the datasets exist.
del df_data

# Working dataset for the rest of the notebook: the one matching EXPERT.
dataset_signalp = dataset_signalp_type_splits[EXPERT]
display(dataset_signalp_type_splits)

{'ALL': DatasetDict({
     train: Dataset({
         features: ['input_ids', 'attention_mask', 'labels', 'type'],
         num_rows: 12462
     })
     valid: Dataset({
         features: ['input_ids', 'attention_mask', 'labels', 'type'],
         num_rows: 4149
     })
     test: Dataset({
         features: ['input_ids', 'attention_mask', 'labels', 'type'],
         num_rows: 4147
     })
 }),
 'NO_SP': DatasetDict({
     train: Dataset({
         features: ['input_ids', 'attention_mask', 'labels', 'type'],
         num_rows: 9233
     })
     valid: Dataset({
         features: ['input_ids', 'attention_mask', 'labels', 'type'],
         num_rows: 3075
     })
     test: Dataset({
         features: ['input_ids', 'attention_mask', 'labels', 'type'],
         num_rows: 3082
     })
 }),
 'SP': DatasetDict({
     train: Dataset({
         features: ['input_ids', 'attention_mask', 'labels', 'type'],
         num_rows: 2017
     })
     valid: Dataset({
         features: ['input_ids', '

In [6]:
# Gate model: sequence-level classifier on the base checkpoint, with one
# output per entry in src.config.type_encoding.
t5_base_model_gate = T5EncoderModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=src.config.base_model_name,
    device_map='auto',
    load_in_8bit=False,
    custom_num_labels=len(src.config.type_encoding),
    custom_dropout_rate=0.1,
)

Some weights of T5EncoderModelForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_t5_xl_uniref50 and are newly initialized: ['custom_classifier_out.weight', 'custom_classifier_in.weight', 'custom_classifier_out.bias', 'custom_classifier_in.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Expert model: token-level classifier on the base checkpoint, with one output
# per entry in src.config.label_decoding and an optional CRF (USE_CRF).
# NOTE(review): the LIPO adapter loaded later in this notebook fails with a
# size mismatch (checkpoint head has 4 labels, this model has 6) — the label
# count probably has to be chosen per expert; confirm against training code.
t5_base_model_expert = T5EncoderModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=src.config.base_model_name,
    device_map='auto',
    load_in_8bit=False,
    custom_num_labels=len(src.config.label_decoding),
    custom_dropout_rate=0.1,
    use_crf=USE_CRF,
)

Some weights of T5EncoderModelForTokenClassification were not initialized from the model checkpoint at Rostlab/prot_t5_xl_uniref50 and are newly initialized: ['crf.start_transitions', 'crf.end_transitions', 'crf.transitions', 'custom_classifier.bias', 'custom_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Common path prefix (relative to ROOT) for the saved adapters of this model
# version; other cells append 'gate' or 'expert_<EXPERT>' to it.
adapter_location = f'/models/moe_v{MODEL_VERRSION}_'

In [9]:
# Attach the saved gate adapter to the gate model.
gate_adapter_location = f'{adapter_location}gate'
t5_base_model_gate.load_adapter(f'{ROOT}{gate_adapter_location}')

In [None]:
# expert_adapter_location = adapter_location + f'expert_{EXPERT}'
# t5_base_model_expert.load_adapter(ROOT+expert_adapter_location)

# FASTA_FILENAME = '5_SignalP_5.0_Training_set_testing.fasta'
# df_data = src.data.process(src.data.parse_file(ROOT + '/data/raw/' + FASTA_FILENAME))
# df_data = df_data[df_data['Partition_No'] == 4].reset_index(drop=True)
# df_data['Sequence'] = df_data['Sequence'].apply(lambda x: x.replace(' ', ''))

In [None]:
# # df_data['Type_Prediction'] = 
# df_data['Label'].iloc[17:18].apply(lambda x: src.model_new.moe_inference(
#     sequence=x,
#     tokenizer=t5_tokenizer,
#     model_gate=t5_base_model_gate,
#     model_expert=t5_base_model_expert,
#     device=device,
#     # result_type='SP',
#     )[0])

In [22]:
# Switch the active expert and derive the absolute path of its saved adapter.
EXPERT = 'LIPO'
expert_adapter_location = f'{ROOT}{adapter_location}expert_{EXPERT}'
print(expert_adapter_location)

/Users/finnlueth/Developer/gits/prottrans-t5-signalpeptide-prediction/models/moe_v1_expert_LIPO


In [23]:
# Attach the saved expert adapter under the name "<EXPERT>_1".
# NOTE(review): in the recorded run this raised a RuntimeError (size mismatch):
# the checkpoint's classifier/CRF parameters have 4 labels while the model was
# built with 6. TODO: build the expert model with the label count the adapter
# was trained with before loading it.
t5_base_model_expert.load_adapter(expert_adapter_location, adapter_name=f"{EXPERT}_1")

RuntimeError: Error(s) in loading state_dict for T5EncoderModelForTokenClassification:
	size mismatch for custom_classifier.weight: copying a param with shape torch.Size([4, 1024]) from checkpoint, the shape in current model is torch.Size([6, 1024]).
	size mismatch for custom_classifier.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([6]).
	size mismatch for crf.start_transitions: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([6]).
	size mismatch for crf.end_transitions: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([6]).
	size mismatch for crf.transitions: copying a param with shape torch.Size([4, 4]) from checkpoint, the shape in current model is torch.Size([6, 6]).

In [21]:
# NOTE(review): this call raised AttributeError in the recorded run — the
# model class does not expose `unload`. It also passes EXPERT, whereas the
# load cell registers the adapter under f"{EXPERT}_1".
# TODO: replace with the adapter-removal API the model class actually provides.
t5_base_model_expert.unload(EXPERT)

AttributeError: 'T5EncoderModelForTokenClassification' object has no attribute 'unload'

In [None]:
# Run the gate + expert pipeline (`moe_inference`) on one dataset example.
_ds_index = 4
# Defined here so the cell runs on a fresh kernel; it used to rely on a later
# cell having assigned `_ds_type` first.
_ds_type = 'test'

# Fetch the example once instead of re-indexing the dataset for every field.
_example = dataset_signalp[_ds_type][_ds_index]

# _input_ids_test = df_data['Sequence'].iloc[_ds_index]
# _labels_test = df_data['Label'].iloc[_ds_index]
# _type_test = df_data['Type'].iloc[_ds_index]
# The last token id is dropped before decoding (presumably the EOS token —
# TODO confirm).
_input_ids_test = t5_tokenizer.decode(_example['input_ids'][:-1])
# One extra -100 label is appended (presumably the ignore index for the final
# token — TODO confirm).
_labels_test = torch.tensor([_example['labels'] + [-100]]).to(device)
_attention_mask_test = torch.tensor([_example['attention_mask']]).to(device)
# Previously undefined (only assigned in a commented-out line), which made the
# print below raise NameError; taken from the dataset's 'type' feature.
_type_test = _example['type']


print('Iput IDs:\t', _input_ids_test)
print('Labels:\t\t', _labels_test)
print('Type:\t\t', _type_test)

result = src.model_new.moe_inference(
    sequence=_input_ids_test,
    # NOTE(review): the keyword is spelled `attentino_mask`; left unchanged
    # because the signature of `moe_inference` is not visible here — confirm
    # whether it should be `attention_mask`.
    attentino_mask=_attention_mask_test,
    tokenizer=t5_tokenizer,
    model_gate=t5_base_model_gate,
    model_expert=t5_base_model_expert,
    device=device,
    result_type='LIPO',
    use_crf=True,
)

print(result)

In [None]:
# NOTE(review): the equivalent call on the expert model raised AttributeError
# ('... has no attribute unload') in the recorded run; this one likely fails
# the same way — confirm and switch to the API the model class provides.
t5_base_model_gate.unload()

---

In [None]:
# _ds_index = 220
# _ds_type = 'test'
# USE_CRF = True

# _input_ids_test = t5_tokenizer.decode(dataset_signalp[_ds_type][_ds_index]['input_ids'][:-1])
# _labels_test = torch.tensor([dataset_signalp[_ds_type][_ds_index]['labels'] + [-100]]).to(device)
# _attention_mask_test = torch.tensor([dataset_signalp[_ds_type][_ds_index]['attention_mask']]).to(device)

# _labels_test_decoded = [src.config.label_decoding[x] for x in _labels_test.tolist()[0][:-1]]

# print('Iput IDs:\t', _input_ids_test)
# print('Labels:\t\t', *_labels_test.tolist()[0])
# print('Labels Decoded:\t', *_labels_test_decoded)
# print('Attention Mask:\t', *_attention_mask_test.tolist()[0])
# print('----')

# _ds_index = 3250
# Predict token labels for a single example with the expert model and print
# the decoded result next to the reference labels.
_ds_index = 3250
_ds_type = 'test'
USE_CRF = True

# Pull the example once and derive all inputs from it.
_example = dataset_signalp[_ds_type][_ds_index]
# The last token id is dropped before decoding (presumably EOS — TODO confirm).
_input_ids_test = t5_tokenizer.decode(_example['input_ids'][:-1])
# One extra -100 label is appended (presumably the ignore index for the final
# token — TODO confirm).
_labels_test = torch.tensor([_example['labels'] + [-100]]).to(device)
_attention_mask_test = torch.tensor([_example['attention_mask']]).to(device)

# Map every label id except the appended last one back to its symbol.
_labels_as_list = _labels_test.tolist()[0]
_labels_test_decoded = [
    src.config.label_decoding[label_id] for label_id in _labels_as_list[:-1]
]

print('Iput IDs:\t', _input_ids_test)
print('Labels:\t\t', *_labels_as_list)
print('Labels Decoded:\t', *_labels_test_decoded)
print('Attention Mask:\t', *_attention_mask_test.tolist()[0])
print('----')

preds = src.model_new.predict_model(
    sequence=_input_ids_test,
    tokenizer=t5_tokenizer,
    model=t5_base_model_expert,
    labels=_labels_test,
    attention_mask=_attention_mask_test,
    device=device,
    viterbi_decoding=USE_CRF,
)

# Turn the model output into label symbols using the configured decoding table.
_result = src.model_new.translate_logits(
    logits=preds.logits,
    viterbi_decoding=USE_CRF,
    decoding=src.config.label_decoding,
)

print('Result: \t', *_result)