## Notebook Setup
___

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules (e.g. src.config, src.model) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## Packages
___

In [2]:
import re
import os
import math
import copy
import types
import yaml
import gc

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn import (
    CrossEntropyLoss,
    MSELoss
)

import evaluate

from transformers import (
    AutoModelForTokenClassification,
    AutoConfig,
    T5EncoderModel,
    T5Tokenizer,
    T5PreTrainedModel,
    T5ForConditionalGeneration,
    pipeline,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    set_seed,
    )
from transformers.modeling_outputs import TokenClassifierOutput

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    get_peft_config,
    PeftModel,
    PeftConfig,
    prepare_model_for_kbit_training
    )

from datasets import Dataset

import src.config as config

from src.model import (
    get_prottrans_tokenizer_model,
    df_to_dataset,
    inject_linear_layer,
    compute_metrics_full,
    compute_metrics_fast
    )
from src.utils import get_project_root_path
from tqdm import tqdm

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


---
## Setup and Variables

In [3]:
# Report the configured checkpoint and the local environment before any heavy work starts.
base_model_name = config.base_model_name
print("Base Model:\t", base_model_name)
print("MPS:\t\t", torch.backends.mps.is_available())

ROOT = get_project_root_path()
print("Path:\t\t", ROOT)

# Device preference order: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device:\t {device}")

Base Model:	 Rostlab/prot_t5_xl_uniref50
MPS:		 True
Path:		 /Users/finnlueth/Developer/gits/prottrans-t5-signalpeptide-prediction
Using device:	 mps


In [4]:
# Training hyperparameters, all read from the project config module.
lr, batch_size, num_epochs, dropout_rate = (
    config.lr,
    config.batch_size,
    config.num_epochs,
    config.dropout_rate,
)

# Label mappings from the project config; note that `label_list` is bound to
# config's `label_decoding` attribute.
label_encoding, label_list = config.label_encoding, config.label_decoding

# Metric callback used for this run: the "fast" variant imported from src.model.
compute_metrics = compute_metrics_fast

In [5]:
# Register tqdm's pandas integration; the cells below rely on the
# `progress_apply` method this adds to DataFrame/Series objects.
tqdm.pandas()

---
## Create Tokenizer and Load Model

In [6]:
# Architecture class handed to the project helper below.
model_architecture = T5EncoderModel
# Project helper (src.model) returning the tokenizer and model for `base_model_name`.
# NOTE(review): presumably loads pretrained weights for the checkpoint — confirm in src.model.
tokenizer, model = get_prottrans_tokenizer_model(base_model_name, model_architecture)

---
## Load Data, Split into Dataset, and Tokenize Sequences

In [7]:
# Read the processed training table from <project root>/data/processed.
df_data = pd.read_parquet(ROOT + '/data/processed/5.0_train.parquet.gzip')

In [8]:
# Preview the first three rows of the freshly loaded table.
df_data.head(3)

Unnamed: 0,Sequence,Label,Split
0,M A P T L F Q K L F S K R T G L G A P G R D A ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train
1,M D F T S L E T T T F E E V V I A L G S N V G ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train
2,M D D I S G R Q T L P R I N R L L E H V G N P ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train


In [9]:
# Tokenize every sequence in a single call, adding the tokenizer's special tokens.
# NOTE(review): per the Hugging Face tokenizer docs, padding=True pads to the longest
# sequence in the call, so each row presumably becomes as long as the longest sequence
# in the whole dataset. Rows are later passed to the model one at a time, so confirm
# this dataset-wide padding is intended.
ids = tokenizer.batch_encode_plus(df_data['Sequence'], add_special_tokens=True, padding=True)

In [10]:
# Attach the tokenizer outputs to the table, one column per encoding field.
for column_name in ('input_ids', 'attention_mask'):
    df_data[column_name] = ids[column_name]

In [11]:
# Convert each row's token-id list and mask list into a tensor and prepend a
# dimension of size 1 (unsqueeze(0)), so every row can be fed to the model on its own.
for column_name in ('input_ids', 'attention_mask'):
    df_data[column_name] = df_data.progress_apply(
        lambda row, key=column_name: torch.tensor(row[key]).unsqueeze(0),
        axis=1,
    )

100%|██████████| 20758/20758 [00:00<00:00, 54337.61it/s]
100%|██████████| 20758/20758 [00:00<00:00, 73905.34it/s]


In [12]:
df_data.head(3)

Unnamed: 0,Sequence,Label,Split,input_ids,attention_mask
0,M A P T L F Q K L F S K R T G L G A P G R D A ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,"[[tensor(19), tensor(3), tensor(13), tensor(11...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
1,M D F T S L E T T T F E E V V I A L G S N V G ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,"[[tensor(19), tensor(10), tensor(15), tensor(1...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
2,M D D I S G R Q T L P R I N R L L E H V G N P ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,"[[tensor(19), tensor(10), tensor(10), tensor(1...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."


In [13]:
# Sanity check: show which device the first row's input tensor currently lives on.
df_data['input_ids'].at[0].device

device(type='cpu')

In [14]:
# Move every per-row tensor onto the selected compute device.
for column_name in ('input_ids', 'attention_mask'):
    df_data[column_name] = df_data[column_name].progress_apply(
        lambda row_tensor: row_tensor.to(device)
    )

100%|██████████| 20758/20758 [00:03<00:00, 5352.95it/s]
100%|██████████| 20758/20758 [00:03<00:00, 6541.26it/s]


---
## Feature Extraction

In [15]:
# Place the model on the chosen device and switch it to evaluation mode
# before any embeddings are computed.
model.to(device).eval()
print(f"Model loaded to {device}.")

Model loaded to mps.


In [16]:
# First chunk of the dataset (positions 0-9999); it is embedded and saved on its own.
# .copy() makes this an independent frame, so adding the 'embeddings' column in the
# next cell writes to this split only and does not raise pandas' SettingWithCopyWarning.
df_data_split_1 = df_data.iloc[:10000].copy()

In [17]:
# Run the encoder once per row (no gradients needed for feature extraction).
# Only `last_hidden_state` is kept and it is moved to the CPU straight away:
# storing the full model output object for every row keeps all of its tensors
# alive on the accelerator, so device memory grows with each processed row.
# NOTE(review): the stored tensor is presumably (1, seq_len, hidden_dim) — confirm.
# `.assign` builds a new frame instead of writing a column into what may be a
# slice of `df_data`, which avoids pandas' SettingWithCopyWarning.
with torch.no_grad():
    df_data_split_1 = df_data_split_1.assign(
        embeddings=df_data_split_1.progress_apply(
            lambda row: model(row['input_ids'], row['attention_mask']).last_hidden_state.cpu(),
            axis=1,
        )
    )

 80%|████████  | 8009/10000 [48:19<11:53,  2.79it/s]  

In [None]:
# Show both ends of the first split to spot-check the new column.
display(df_data_split_1.head(), df_data_split_1.tail())

In [None]:
# Persist the first split, including its embeddings column, as gzip-compressed parquet.
# NOTE(review): the frame's object columns hold torch tensors and per-row model results;
# confirm the parquet engine can serialize them before relying on this file.
df_data_split_1.to_parquet(ROOT + '/data/processed/1_5.0_train_embeddings.parquet.gzip', compression='gzip')

In [None]:
# Force a garbage-collection pass before processing the second split.
# NOTE(review): `df_data_split_1` is still bound at this point, so its contents
# presumably stay alive — confirm whether a `del df_data_split_1` was intended first.
gc.collect()

In [None]:
# Remaining rows of the dataset (position 10000 onward).
# .copy() makes this an independent frame, so adding the 'embeddings' column in the
# next cell writes to this split only and does not raise pandas' SettingWithCopyWarning.
df_data_split_2 = df_data.iloc[10000:].copy()

In [None]:
# Run the encoder once per row (no gradients needed for feature extraction).
# Only `last_hidden_state` is kept and it is moved to the CPU straight away:
# storing the full model output object for every row keeps all of its tensors
# alive on the accelerator, so device memory grows with each processed row.
# NOTE(review): the stored tensor is presumably (1, seq_len, hidden_dim) — confirm.
# `.assign` builds a new frame instead of writing a column into what may be a
# slice of `df_data`, which avoids pandas' SettingWithCopyWarning.
with torch.no_grad():
    df_data_split_2 = df_data_split_2.assign(
        embeddings=df_data_split_2.progress_apply(
            lambda row: model(row['input_ids'], row['attention_mask']).last_hidden_state.cpu(),
            axis=1,
        )
    )

In [None]:
# Show both ends of the second split to spot-check the new column.
display(df_data_split_2.head(), df_data_split_2.tail())

---
## Model

In [None]:
# Load the table of precomputed embeddings for the modelling section.
# NOTE(review): the visible cells only write '1_5.0_train_embeddings.parquet.gzip';
# confirm where '5.0_train_embeddings.parquet.gzip' is produced.
df_data_embeds = pd.read_parquet(ROOT + '/data/processed/5.0_train_embeddings.parquet.gzip')