## Notebook Setup
___

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Packages
___

In [None]:
import re
import os
import math
import copy
import types
import yaml

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn import (
    CrossEntropyLoss,
    MSELoss
)

import evaluate

from transformers import (
    AutoModelForTokenClassification,
    AutoConfig,
    T5EncoderModel,
    T5Tokenizer,
    T5PreTrainedModel,
    T5ForConditionalGeneration,
    pipeline,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    set_seed,
    )
from transformers.modeling_outputs import TokenClassifierOutput

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    get_peft_config,
    PeftModel,
    PeftConfig,
    prepare_model_for_kbit_training
    )

from datasets import Dataset

import src.config as config

from src.model import (
    get_prottrans_tokenizer_model,
    df_to_dataset,
    inject_linear_layer,
    compute_metrics_full,
    compute_metrics_fast
    )
from src.utils import get_project_root_path

---
## Setup and Variables

In [None]:
# Report the base model and accelerator availability taken from the project config.
base_model_name = config.base_model_name
print("Base Model:\t", base_model_name)
print("MPS:\t\t", torch.backends.mps.is_available())

# Project root; later cells build data and adapter paths by appending to it.
ROOT = get_project_root_path()
print("Path:\t\t", ROOT)

# Device preference order: first CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device:\t {device}")

In [None]:
# Hyperparameters are read unchanged from the project config module.
lr, batch_size = config.lr, config.batch_size
num_epochs, dropout_rate = config.num_epochs, config.dropout_rate

# Label mappings from the config; `label_decoding` is kept under the name `label_list`.
label_encoding, label_list = config.label_encoding, config.label_decoding

# Metric callback in use; `compute_metrics_full` is the other imported option.
compute_metrics = compute_metrics_fast

---
## Create Tokenizer and Load Model

In [None]:
# Encoder-only T5 class handed to the project loader together with the model name.
model_architecture = T5EncoderModel

# The helper returns the tokenizer and the base model as a pair.
t5_tokenizer, t5_base_model = get_prottrans_tokenizer_model(
    base_model_name,
    model_architecture,
)

---
## Load Adapter

In [None]:
# t5_lora_model_load_adapter = PeftConfig.from_pretrained(ROOT + '/models/linear_model')

In [None]:
# t5_lora_model_load_adapter

In [None]:
# Adapter directory; it is appended to ROOT when the adapter config and weights are loaded.
adapter_location = '/models/linear_model'

In [None]:
# Load the saved PEFT configuration from the adapter directory.
# NOTE(review): this object is not referenced again in the cells visible here.
t5_lora_model_config = PeftConfig.from_pretrained(ROOT + adapter_location)

In [None]:
# Wrap the base model with the adapter stored under ROOT + adapter_location.
# NOTE: `t5_base_model` is rebound to the PEFT-wrapped model, so re-running this
# cell without reloading the base model passes an already wrapped model back in.
t5_base_model = PeftModel.from_pretrained(
    model=t5_base_model,
    model_id=ROOT + adapter_location,
)

In [None]:
# Pass the PEFT-wrapped model through the project helper from `src.model`
# (presumably attaches the token-classification head — confirm in src/model).
t5_lora_model = inject_linear_layer(t5_base_model)

In [None]:
# Size the dropout and classifier layers for the label set.
# NOTE(review): the classifier below is a freshly initialised nn.Linear created
# after the adapter was loaded — confirm that trained head weights are restored
# elsewhere, otherwise predictions come from untrained classifier weights.
num_labels = len(label_list)
t5_lora_model.dropout = nn.Dropout(dropout_rate)
t5_lora_model.num_labels = num_labels

# Resolve the wrapped model once instead of on every attribute access.
wrapped_model = t5_lora_model.get_base_model()
wrapped_model.dropout = nn.Dropout(dropout_rate)
wrapped_model.classifier = nn.Linear(
    in_features=wrapped_model.config.hidden_size,
    out_features=num_labels,
)

In [None]:
# Display the model repr to inspect the module structure after the head was configured.
t5_lora_model

---
## Make Inference

In [None]:
# NOTE: this import lives outside the Packages cell; DataLoader is used by the
# batching code in the performance section further down.
from torch.utils.data import DataLoader

In [None]:
# Load the processed dataset; its `Split`, `Sequence` and `Label` columns are used below.
df_data = pd.read_parquet(ROOT + '/data/processed/5.0_train.parquet.gzip')

In [None]:
# ToDo: Use entire test set
ds_test = df_data[df_data.Split == 'test']

ds_test = df_to_dataset(
    t5_tokenizer,
    ds_test.Sequence.to_list()[:3],
    ds_test.Label.to_list()[:3]
)

In [None]:
# Display the dataset summary produced by `df_to_dataset`.
ds_test

In [None]:
# Print the first example's token ids, attention mask and labels, one line per column.
for column_name in ('input_ids', 'attention_mask', 'labels'):
    print(*ds_test[column_name][0])

In [None]:
# Decode the first example back to text, leaving out its final token id.
first_example_ids = ds_test['input_ids'][0]
input_str = t5_tokenizer.decode(first_example_ids[:-1])
print(input_str)

In [None]:
# Tokenize the decoded string again and print the resulting encoding for inspection.
inputs = t5_tokenizer(input_str)
print(inputs)

In [None]:
# with torch.no_grad():
#     logits = t5_lora_model(inputs['input_ids']).logits

In [None]:
# device = 'cpu'
# t5_lora_model.to(device)

# test_set = ds_test.with_format("torch", device=device)

# # For token classification we need a data collator here to pad correctly
# data_collator = DataCollatorForTokenClassification(t5_tokenizer) 

# # Create a dataloader for the test dataset
# test_dataloader = DataLoader(test_set, batch_size=16, shuffle = False, collate_fn = data_collator)

# # Put the model in evaluation mode
# t5_lora_model.eval()

# # Make predictions on the test dataset
# predictions = []
# # We need to collect the batch["labels"] as well, this allows us to filter out all positions with a -100 afterwards
# padded_labels = []

# counter = 0

# with torch.no_grad():
#     for batch in test_dataloader:
#         print(counter)
#         counter += 1
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         # Padded labels from the data collator
#         padded_labels += batch['labels'].tolist()
#         # Add batch results(logits) to predictions, we take the argmax here to get the predicted class
#         prediction = t5_lora_model(input_ids=input_ids)
#         print(prediction)
#         predictions += prediction.logits.argmax(dim=-1).tolist()

In [None]:
# print(*predictions)

In [None]:
# print(test_set[0]['labels'])
# print(*[config.label_decoding[x] for x in test_set[0]['labels'].tolist()])

In [None]:
# print(*[config.label_decoding[x] for x in predictions[0]])

In [None]:
# t5_lora_model(ds_test['input_ids'][0])

In [None]:
# t5_tokenizer.decode(padded_labels[0])
# print(*[config.label_decoding[x] for x in padded_labels[0]])

In [None]:
# t5_lora_model()

In [None]:
# type(predictions)

In [None]:
# print(*[[config.label_decoding[y] for y in x] for x in predictions][0])

---
## Measure Performance

In [None]:
# base_model_test = T5ForConditionalGeneration.from_pretrained(
#     base_model_name,
#     device_map='auto',
#     offload_folder='./offload',
#     load_in_8bit=False
# )
# tsss_ids = t5_tokenizer('M A P T L F Q K L F S K R T G L G A P G R D A', return_tensors="pt").input_ids.to(device)
# tsss_mask = t5_tokenizer('M A P T L F Q K L F S K R T G L G A P G R D A', return_tensors="pt").attention_mask.to(device)
# base_model_test(input_ids=tsss_ids, decoder_input_ids=tsss_ids, attention_mask=tsss_mask)

---

In [None]:
# Display the model repr once more before running inference.
t5_lora_model

In [None]:
# Inference is pinned to the CPU here, overriding the device chosen during setup.
device = 'cpu'
t5_lora_model.to(device)

test_set = ds_test.with_format("torch", device=device)

# Token classification needs a collator that pads inputs and labels per batch.
data_collator = DataCollatorForTokenClassification(t5_tokenizer)

# No shuffling, so predictions can be indexed in the same order as the dataset.
test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False, collate_fn=data_collator)

# Put the model in evaluation mode.
t5_lora_model.eval()

# Predicted class ids per sequence, at the padded length of its batch.
predictions = []
# Collator-padded labels, collected so positions marked -100 can be filtered out afterwards.
padded_labels = []

counter = 0

with torch.no_grad():
    for batch in test_dataloader:
        print(counter)
        counter += 1
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        padded_labels += batch['labels'].tolist()
        # Pass the attention mask so padded positions are not attended to; without it,
        # sequences shorter than the longest one in the batch are encoded together
        # with their padding tokens.
        # NOTE(review): assumes the injected forward accepts `attention_mask` — confirm in src/model.
        prediction = (
            t5_lora_model(input_ids=input_ids, attention_mask=attention_mask)
            .logits
            .argmax(dim=-1)
            .tolist()
        )
        print(prediction)
        predictions += prediction

In [None]:
# Index of the test sequence to inspect.
index_item = 0

# Use the collator-padded labels as the reference: positions marked -100 are
# padding/ignored, so they are dropped from both the labels and the predictions.
# This keeps the two printed sequences aligned and the same length.
item_labels = padded_labels[index_item]
item_predictions = predictions[index_item]
valid_positions = [pos for pos, label_id in enumerate(item_labels) if label_id != -100]

actual = [config.label_decoding[item_labels[pos]] for pos in valid_positions]
print(len(actual))
print(*actual)
pred = [config.label_decoding[item_predictions[pos]] for pos in valid_positions]
print(len(pred))
print(*pred)