../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator()

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import src.config
import src.data
import src.model_new
import src.utils
from src.model_new import (
    T5EncoderModelForSequenceClassification,
)

import gc
import copy
import random

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import seaborn as sns
import src.utils

from transformers import (
    T5Tokenizer,
    DataCollatorForTokenClassification,
    T5ForSequenceClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback
)

import peft
from peft import (
    LoraConfig,
)

In [3]:
ROOT = src.utils.get_project_root_path()
device = torch.device('cuda:0' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))

EXPERT = 'ALL'

SEED = 42
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

print("Base Model:\t", src.config.base_model_name)
print("MPS:\t\t", torch.backends.mps.is_available())
print("Path:\t\t", ROOT)
print(f"Using device:\t {device}")

Base Model:	 Rostlab/prot_t5_xl_uniref50
MPS:		 False
Path:		 /home/ec2-user/developer/prottrans-t5-signalpeptide-prediction
Using device:	 cuda:0


In [4]:
t5_tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path=src.config.base_model_name,
    do_lower_case=False,
    use_fast=True,
    legacy=False
)

In [5]:
FASTA_FILENAME = '5_SignalP_5.0_Training_set.fasta'
# FASTA_FILENAME = '5_SignalP_5.0_Training_set_testing.fasta'
annotations_name = 'Type' # Choose Type or Label

df_data = src.data.process(src.data.parse_file(ROOT + '/data/raw/' + FASTA_FILENAME))

dataset_signalp = src.model_new.create_datasets(
        splits=src.config.splits,
        tokenizer=t5_tokenizer,
        data=df_data,
        annotations_name=annotations_name,
        dataset_size=src.config.dataset_size,
        encoder=src.config.type_encoding,
    )

del df_data

In [6]:
display(dataset_signalp)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12462
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4147
    })
})

In [7]:
# pd.Series(dataset_signalp['train']['labels']).value_counts()

In [8]:
t5_base_model = T5EncoderModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=src.config.base_model_name,
    device_map='auto',
    load_in_8bit=False,
    custom_num_labels=len(src.config.type_encoding),
    custom_dropout_rate=0.1,
)

tmp_lin_in = nn.Linear(
    in_features=t5_base_model.config.hidden_size,
    out_features=t5_base_model.config.hidden_size
)
tmp_lin_out = nn.Linear(
    in_features=t5_base_model.config.hidden_size,
    out_features=t5_base_model.custom_num_labels
)

t5_base_model.custom_classifier_in.weight = tmp_lin_in.weight
t5_base_model.custom_classifier_in.bias = tmp_lin_in.bias
t5_base_model.custom_classifier_out.weight = tmp_lin_out.weight
t5_base_model.custom_classifier_out.bias = tmp_lin_out.bias

modules_to_save = ['custom_classifier_in', 'custom_classifier_out']

lora_config = LoraConfig(
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=['q', 'k', 'v', 'o'],
    bias="none",
    modules_to_save=modules_to_save,
)

t5_lora_model = peft.get_peft_model(t5_base_model, lora_config)
t5_lora_model.print_trainable_parameters()

Some weights of T5EncoderModelForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_t5_xl_uniref50 and are newly initialized: ['custom_classifier_in.weight', 'custom_classifier_out.weight', 'custom_classifier_in.bias', 'custom_classifier_out.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,039,560 || all params: 1,214,181,384 || trainable%: 0.49741826712111736


In [10]:
# data_collator = DataCollatorForTokenClassification(tokenizer=t5_tokenizer)

training_args = TrainingArguments(
    output_dir='./checkpoints',
    learning_rate=src.config.lr,
    per_device_train_batch_size=src.config.batch_size,
    per_device_eval_batch_size=src.config.batch_size,
    num_train_epochs=src.config.num_epochs,
    logging_steps=src.config.logging_steps,
    # save_strategy="steps",
    # save_steps=src.config.save_steps,
    # evaluation_strategy="steps",
    # eval_steps=src.config.eval_steps,
    # gradient_accumulation_steps=accum,
    # load_best_model_at_end=True,
    # save_total_limit=5,
    seed=42,
    # fp16=True,
    # deepspeed=deepspeed_config,
    remove_unused_columns=False,
    label_names=['labels'],
    # debug="underflow_overflow",
)

trainer = Trainer(
    model=t5_lora_model,
    args=training_args,
    train_dataset=dataset_signalp['train'],
    eval_dataset=dataset_signalp['valid'],
    # data_collator=data_collator,
    compute_metrics=src.model_new.compute_metrics,
)

In [12]:
gc.collect()
torch.cuda.empty_cache()
# torch.mps.empty_cache()

trainer.train()

Step,Training Loss
1,1.3796
2,1.3563
3,1.3448
4,1.3218
5,1.3329
6,1.3064
7,1.2794
8,1.2819
9,1.2013
10,1.3179


TrainOutput(global_step=779, training_loss=0.15104150041644623, metrics={'train_runtime': 1148.2494, 'train_samples_per_second': 10.853, 'train_steps_per_second': 0.678, 'total_flos': 6445159428725856.0, 'train_loss': 0.15104150041644623, 'epoch': 1.0})

In [13]:
metrics=trainer.evaluate()
print(metrics)

{'eval_loss': 0.07023723423480988, 'eval_accuracy_metric': 0.9860207278862376, 'eval_precision_metric': 0.9860207278862376, 'eval_recall_metric': 0.9860207278862376, 'eval_f1_metric': 0.9860207278862376, 'eval_matthews_correlation': 0.0, 'eval_confusion_matrix': array([[3059,   14,    3,    1],
       [  14,  654,   11,    2],
       [   2,    8,  291,    0],
       [   0,    3,    0,   87]]), 'eval_runtime': 247.6431, 'eval_samples_per_second': 16.754, 'eval_steps_per_second': 1.05, 'epoch': 1.0}


In [14]:
training_log = pd.DataFrame(trainer.state.log_history)
display(training_log)

Unnamed: 0,loss,learning_rate,epoch,step,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss,eval_loss,eval_accuracy_metric,eval_precision_metric,eval_recall_metric,eval_f1_metric,eval_matthews_correlation,eval_confusion_matrix,eval_runtime,eval_samples_per_second,eval_steps_per_second
0,1.3796,9.987163e-05,0.00,1,,,,,,,,,,,,,,,
1,1.3563,9.974326e-05,0.00,2,,,,,,,,,,,,,,,
2,1.3448,9.961489e-05,0.00,3,,,,,,,,,,,,,,,
3,1.3218,9.948652e-05,0.01,4,,,,,,,,,,,,,,,
4,1.3329,9.935815e-05,0.01,5,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776,0.0053,2.567394e-07,1.00,777,,,,,,,,,,,,,,,
777,0.0042,1.283697e-07,1.00,778,,,,,,,,,,,,,,,
778,0.0024,0.000000e+00,1.00,779,,,,,,,,,,,,,,,
779,,,1.00,779,1148.2494,10.853,0.678,6.445159e+15,0.151042,,,,,,,,,,


In [15]:
adapter_location = '/models/moe_gate_1'
training_log['eval_confusion_matrix'] = training_log['eval_confusion_matrix'].apply(lambda x: x.tolist() if type(x)==np.ndarray else None)
t5_lora_model.save_pretrained(ROOT + adapter_location)
training_log.to_csv(ROOT + adapter_location + '/training_log.csv', index=False)
training_log.to_parquet(ROOT + adapter_location + '/training_log.parquet')

In [20]:
_ds_index = 2
_ds_type = 'test'

_input_ids_test = t5_tokenizer.decode(dataset_signalp[_ds_type][_ds_index]['input_ids'][:-1])
_labels_test = torch.tensor([dataset_signalp[_ds_type][_ds_index]['labels']]).to(device)
_attention_mask_test = torch.tensor([dataset_signalp[_ds_type][_ds_index]['attention_mask']]).to(device)

_labels_test_decoded = [src.config.type_decoding[x] for x in _labels_test.tolist()]

print('Iput IDs:\t', _input_ids_test)
print('Labels:\t\t', _labels_test.tolist()[0])
print('Labels Decoded:\t', *_labels_test_decoded)
print('Attention Mask:\t', *_attention_mask_test.tolist()[0])
print('----')

print('Result: \t',* _result)

Iput IDs:	 M E I S T P D F G F G T E D S S A Q Q S A N R A I P Q P V P A P A F P L K E T A S D T G G T A P T F G T L Q D N I N E L C L R Y Q T V C S E
Labels:		 0
Labels Decoded:	 NO_SP
Attention Mask:	 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
----


NameError: name '_result' is not defined

In [21]:
preds = src.model_new.predict_model(
    sequence=_input_ids_test,
    tokenizer=t5_tokenizer,
    model=t5_lora_model,
    labels=_labels_test,
    attention_mask=_attention_mask_test,
    device=device,
    )

In [22]:
preds

SequenceClassifierOutput(loss=tensor(0.0010, device='cuda:0'), logits=tensor([[ 6.5109, -0.5789, -2.1166, -4.9055]], device='cuda:0'), hidden_states=None, attentions=None)

In [23]:
_result = src.model_new.translate_logits(
    logits=preds.logits,
    )

TypeError: 'int' object is not iterable