In [2]:
%load_ext autoreload
%autoreload 2

import gc
import copy
import random

import torch
import torch.nn as nn
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from tqdm import tqdm
import umap

import evaluate

from transformers import (
    T5EncoderModel,
    T5Tokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

import peft
from peft import (
    LoraConfig,
    PeftModel
)

from src.model_new import (
    T5EncoderModelForTokenClassification,
    T5EncoderModelForSequenceClassification,
    create_datasets,
)
import src.config
import src.data
import src.model_new
import src.utils

# Report the configured base model, MPS availability and project root.
print("Base Model:\t", src.config.base_model_name)
print("MPS:\t\t", torch.backends.mps.is_available())
ROOT = src.utils.get_project_root_path()
print("Path:\t\t", ROOT)

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device:\t {device}")

# Seed every random number generator used in this notebook.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# torch.set_printoptions(threshold=10_000)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
Base Model:	 Rostlab/prot_t5_xl_uniref50
MPS:		 True
Path:		 /Users/finnlueth/Developer/gits/prottrans-t5-signalpeptide-prediction
Using device:	 mps


In [None]:
# Input FASTA file; the commented line is the alternative file name.
# FASTA_FILENAME = '5_SignalP_5.0_Training_set.fasta'
FASTA_FILENAME = '5_SignalP_5.0_Training_set_testing.fasta'
annotations_name = 'Type' # Choose Type or Label

# Parse the raw file under ROOT/data/raw and post-process it into `df_data`.
df_data = src.data.process(
    src.data.parse_file(f"{ROOT}/data/raw/{FASTA_FILENAME}")
)

---

In [None]:
# Load the tokenizer and the plain encoder used for embedding extraction.
# The model name comes from `src.config`, as everywhere else in this notebook;
# the bare name `config` is never defined here.
# NOTE(review): `get_prottrans_tokenizer_model` is not imported in the import
# cell; confirm which `src` module provides it and import it there.
base_model_name = src.config.base_model_name
model_architecture = T5EncoderModel
t5_tokenizer, t5_base_model = get_prottrans_tokenizer_model(base_model_name, model_architecture)

In [None]:
# Keep only partition 4. The partition column is `Partition_No`, as used by the
# later cell that builds `df_data_test`.
ds_test = df_data[df_data.Partition_No.isin([4])]
# Tokenize the selected sequences together with their labels.
# NOTE(review): `df_to_dataset` is not imported in the import cell; confirm its
# source module before running this cell on a fresh kernel.
ds_test = df_to_dataset(
    t5_tokenizer,
    ds_test.Sequence.to_list(),
    ds_test.Label.to_list()
)

In [None]:
# Materialise the tokenized ids as a tensor directly on the compute device.
test_tensor = torch.tensor(ds_test['input_ids'], device=device)

In [None]:
# test_tensor.shape

In [None]:
# Alias for the tensor that is embedded below. The trailing comment keeps a
# disabled experiment (first-100 subset / extra leading dimension).
test_tensor_0 = test_tensor#[:100]#.unsqueeze(0)

In [None]:
# test_tensor_0.shape

In [None]:
# Number of mini-batches needed to cover every row (the last may be partial).
batch_size = 100
n_batches = len(range(0, test_tensor_0.size(0), batch_size))
print(n_batches)

In [None]:
# Embed the test tensor batch by batch and gather the last hidden states on the CPU.
# Batches are collected in a list and concatenated once, instead of re-copying
# the growing result tensor on every iteration.
# NOTE(review): no attention mask is passed to the model here; confirm that
# padded positions are meant to be attended to.
embedding_batches = []
for i in tqdm(range(n_batches), desc="Processing Batches"):
    batch = test_tensor_0[i * batch_size:(i + 1) * batch_size]

    with torch.no_grad():
        batch_predictions = t5_base_model(batch)

    # Release cached memory between batches. `empty_cache` lives under
    # `torch.mps` and is only called when the selected device is MPS.
    gc.collect()
    if device.type == 'mps':
        torch.mps.empty_cache()

    embedding_batches.append(batch_predictions.last_hidden_state.to('cpu'))

extracted_embeddings = torch.cat(embedding_batches, dim=0)

In [None]:
# Persist the extracted embeddings under ROOT/data/processed.
torch.save(extracted_embeddings, f"{ROOT}/data/processed/5.0_train_full_embeddings.pt")

---

In [None]:
# Reload the embeddings written by the save cell above the separator.
extracted_embeddings = torch.load(f"{ROOT}/data/processed/5.0_train_full_embeddings.pt")

In [None]:
# Display the shape of the loaded embedding tensor.
extracted_embeddings.shape

In [None]:
# Collapse every dimension after the first so each row is one flat vector.
flattened_output = extracted_embeddings.view(len(extracted_embeddings), -1)

In [None]:
# One NumPy vector per row, so the vectors can be stored in a DataFrame column.
split_outputs = list(flattened_output.cpu().numpy())

In [None]:
# Metadata rows of partition 4 with a fresh 0..n-1 index, built as a new frame
# rather than by mutating a filtered slice in place. The partition column is
# `Partition_No`, as used by the later cell that builds the same frame.
df_data_test = df_data[df_data.Partition_No.isin([4])].reset_index(drop=True)
# Attach the flattened embedding vectors as the first column.
df_data_test.insert(0, 'Split_Output', split_outputs)

In [None]:
# Preview the first rows, including the new `Split_Output` column.
df_data_test.head()

In [None]:
# Shape of the flattened embedding vector stored for the first row.
df_data_test.loc[0].Split_Output.shape

In [None]:
# Shape of the embeddings after flattening everything but the first dimension.
extracted_embeddings.view(extracted_embeddings.size(0), -1).shape

In [None]:
# Fit a seeded 2-component UMAP on the flattened embedding vectors.
umap_2d = umap.UMAP(random_state=42, n_components=2)
umap_2d_embeddings = umap_2d.fit_transform(list(df_data_test['Split_Output']))

In [None]:
# Attach the 2-D UMAP coordinates. `assign` overwrites existing columns, so
# re-running this cell does not create duplicate '2d_x' / '2d_y' columns, and a
# row-count mismatch raises instead of being silently NaN-padded.
df_data_test = df_data_test.assign(**{
    '2d_x': umap_2d_embeddings[:, 0],
    '2d_y': umap_2d_embeddings[:, 1],
})

In [None]:
# Preview the frame with the 2-D UMAP coordinate columns added.
df_data_test.head()

In [None]:
# Interactive 2-D scatter of the UMAP projection, coloured by the `Type` column.
fig = px.scatter(
    data_frame=df_data_test,
    x='2d_x',
    y='2d_y',
    color='Type',
    hover_data=['Uniprot_AC', 'Sequence', 'Kingdom', 'Type'],
    title='UMAP on ProtTransT5 Embeddings SignalP5.0 Dataset Split 4',
    # color_discrete_sequence=px.colors.qualitative.Vivid_r,
)

# fig.update_layout(
#     margin=dict(l=30, r=30, t=30, b=30),
# )

# Static export of the figure.
# NOTE(review): this path is relative to the kernel's working directory, unlike
# the ROOT-based paths used for data files.
fig.write_image("./plots/umap_1_2d.png")

fig.show()

---

In [None]:
# Fit a seeded 3-component UMAP on the same flattened embedding vectors.
umap_3d = umap.UMAP(random_state=42, n_components=3)
umap_3d_embeddings = umap_3d.fit_transform(list(df_data_test['Split_Output']))

In [None]:
# Attach the 3-D UMAP coordinates. `assign` overwrites existing columns, so
# re-running this cell does not create duplicate '3d_*' columns, and a
# row-count mismatch raises instead of being silently NaN-padded.
df_data_test = df_data_test.assign(**{
    '3d_x': umap_3d_embeddings[:, 0],
    '3d_y': umap_3d_embeddings[:, 1],
    '3d_z': umap_3d_embeddings[:, 2],
})

In [None]:
# Preview the frame with the 3-D UMAP coordinate columns added.
df_data_test.head()

In [None]:
# Interactive 3-D scatter of the UMAP projection, coloured by the `Type` column.
fig = px.scatter_3d(
    data_frame=df_data_test,
    x='3d_x',
    y='3d_y',
    z='3d_z',
    color='Type',
    hover_data=['Sequence', 'Kingdom', 'Type'],
    title='UMAP on ProtTransT5 Embeddings SignalP5.0 Dataset Split 4',
)

# Static export of the figure.
# NOTE(review): this path is relative to the kernel's working directory, unlike
# the ROOT-based paths used for data files.
fig.write_image("./plots/umap_1_3d.png")

fig.show()

---
---
---
---

In [7]:
# Tokenizer for the configured base checkpoint, without lower-casing.
t5_tokenizer = T5Tokenizer.from_pretrained(
    src.config.base_model_name,
    legacy=False,
    use_fast=True,
    do_lower_case=False,
)

In [3]:
# Encoder with the project's sequence-classification head; the number of
# labels follows the size of `src.config.type_encoding`.
t5_base_model = T5EncoderModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=src.config.base_model_name,
    custom_num_labels=len(src.config.type_encoding),
    custom_dropout_rate=0.1,
    load_in_8bit=False,
    device_map='auto',
)

Some weights of T5EncoderModelForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_t5_xl_uniref50 and are newly initialized: ['custom_classifier_out.weight', 'custom_classifier_in.bias', 'custom_classifier_in.weight', 'custom_classifier_out.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Input FASTA file; the commented line is the alternative file name.
FASTA_FILENAME = '5_SignalP_5.0_Training_set.fasta'
# FASTA_FILENAME = '5_SignalP_5.0_Training_set_testing.fasta'
annotations_name = 'Type' # Choose Type or Label

# Parse the raw file under ROOT/data/raw and post-process it into `df_data`.
df_data = src.data.process(
    src.data.parse_file(f"{ROOT}/data/raw/{FASTA_FILENAME}")
)

# Build the tokenized dataset splits from the processed frame.
dataset_signalp = src.model_new.create_datasets(
    data=df_data,
    tokenizer=t5_tokenizer,
    splits=src.config.splits,
    encoder=src.config.type_encoding,
    annotations_name=annotations_name,
    # dataset_size=src.config.dataset_size,
)

In [None]:
# Show the object returned by `create_datasets`.
display(dataset_signalp)

In [None]:
# Adapter path prefix relative to ROOT; the gate adapter appends the 'gate' suffix.
adapter_location = '/models/moe_v1_'
gate_adapter_location = f'{adapter_location}gate'
# Load the gate adapter weights into the model.
t5_base_model.load_adapter(f'{ROOT}{gate_adapter_location}')

In [None]:
# torch.Tensor(dataset_signalp['test']['input_ids'])

In [None]:
def extract_embeds(model, dataset, batch_size=100):
    """Run `model` over `dataset` in mini-batches and gather its hidden states on the CPU.

    Uses the notebook-level `device` for the input tensors.

    Args:
        model: Callable accepting `input_ids` and `attention_mask` keyword
            tensors and returning an object whose `hidden_states` attribute is
            a tensor with the batch on dimension 0.
        dataset: Mapping-like object whose 'input_ids' and 'attention_mask'
            entries are nested sequences of integers with matching shapes.
        batch_size: Number of rows per forward pass; must be positive.

    Returns:
        A CPU tensor holding every batch's `hidden_states`, concatenated along
        dimension 0 in dataset order.

    Raises:
        ValueError: If `batch_size` is not positive or `dataset` has no rows.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Build int32 tensors directly instead of going through a float tensor.
    # Labels are not needed for embedding extraction, so they are not loaded.
    input_ids = torch.tensor(dataset['input_ids'], dtype=torch.int32).to(device)
    attention_mask = torch.tensor(dataset['attention_mask'], dtype=torch.int32).to(device)

    n_samples = input_ids.size(0)
    if n_samples == 0:
        raise ValueError("dataset is empty; there is nothing to embed")

    # Collect per-batch results and concatenate once at the end, which avoids
    # re-copying the growing result tensor on every iteration.
    collected = []
    for start in tqdm(range(0, n_samples, batch_size), desc="Processing Batches"):
        stop = start + batch_size
        with torch.no_grad():
            batch_predictions = model(
                input_ids=input_ids[start:stop],
                attention_mask=attention_mask[start:stop],
            )
        collected.append(batch_predictions.hidden_states.to('cpu'))

        # Drop the device-side outputs before asking the allocator to release
        # its cache, and only call the cache API of the backend in use.
        del batch_predictions
        gc.collect()
        if device.type == 'mps':
            torch.mps.empty_cache()
        elif device.type == 'cuda':
            torch.cuda.empty_cache()

    return torch.cat(collected, dim=0)

In [None]:
# Embed the 'test' split with the adapter-loaded model.
embeddings = extract_embeds(model=t5_base_model, dataset=dataset_signalp['test'])

In [None]:
# Persist the extracted embeddings under ROOT/data/processed.
torch.save(embeddings, f"{ROOT}/data/processed/5.0_train_full_embeddings_finetuned.pt")

In [14]:
# Reload the embeddings written by the preceding save cell.
embeddings = torch.load(f"{ROOT}/data/processed/5.0_train_full_embeddings_finetuned.pt")

In [15]:
# Collapse every dimension after the first so each row is one flat vector.
flattened_embeddings = embeddings.view(len(embeddings), -1)

In [16]:
# One NumPy vector per row, so the vectors can be stored in a DataFrame column.
np_embeddings = list(flattened_embeddings.cpu().numpy())

In [17]:
# Metadata rows of partition 4 with a fresh 0..n-1 index.
df_data_test = df_data.loc[df_data['Partition_No'].isin([4])].reset_index(drop=True)
# Attach the flattened embedding vectors as the first column.
df_data_test.insert(0, 'Split_Output', np_embeddings)

In [18]:
# Display the partition-4 frame together with its `Split_Output` vectors.
df_data_test

Unnamed: 0,Split_Output,Uniprot_AC,Kingdom,Type,Partition_No,Sequence,Label
0,"[0.90142685, -0.03946919, -0.31022915, 0.37370...",P55317,EUKARYA,NO_SP,4,M L G T V K M E G H E T S D W N S Y Y A D T Q ...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
1,"[0.8565836, -0.08501352, -0.3474684, 0.3509047...",P35583,EUKARYA,NO_SP,4,M L G A V K M E G H E P S D W S S Y Y A E P E ...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
2,"[0.9464008, -0.10772252, -0.24480106, 0.268201...",Q8UVD9,EUKARYA,NO_SP,4,M E I S T P D F G F G T E D S S A Q Q S A N R ...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
3,"[0.89554095, -0.09638444, -0.20449053, 0.18224...",Q99PF5,EUKARYA,NO_SP,4,M S D Y S T G G P P P G P P P P A G G G G G A ...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
4,"[0.8549611, -0.059423186, -0.2649053, 0.244586...",Q9URU9,EUKARYA,NO_SP,4,M N F R P E Q Q Y I L E K P G I L L S F E Q L ...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
...,...,...,...,...,...,...,...
4142,"[0.8044293, -0.18126006, -0.20270547, 0.302436...",A5W4E3,NEGATIVE,NO_SP,4,M S S L D R K K P Q N R S K N N Y Y N I C L K ...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
4143,"[0.72230655, -0.120695814, -0.2515979, 0.16539...",P76256,NEGATIVE,NO_SP,4,M R I L A I D T A T E A C S V A L W N D G T V ...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
4144,"[0.7625429, -0.17659259, -0.24170873, 0.260483...",P76407,NEGATIVE,NO_SP,4,M A E F P A S L L I L N G K S T D N L P L R E ...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
4145,"[0.738016, -0.26858345, -0.28682816, 0.3271395...",P0A898,NEGATIVE,NO_SP,4,M S Q V I L D L Q L A C E D N S G L P E E S Q ...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...


In [19]:
# Fit a seeded 2-component UMAP on the flattened embedding vectors.
umap_2d = umap.UMAP(random_state=42, n_components=2)
umap_2d_embeddings = umap_2d.fit_transform(list(df_data_test['Split_Output']))


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [20]:
# Attach the 2-D UMAP coordinates. `assign` overwrites existing columns, so
# re-running this cell does not create duplicate '2d_x' / '2d_y' columns, and a
# row-count mismatch raises instead of being silently NaN-padded.
df_data_test = df_data_test.assign(**{
    '2d_x': umap_2d_embeddings[:, 0],
    '2d_y': umap_2d_embeddings[:, 1],
})

In [21]:
# Interactive 2-D scatter of the UMAP projection, coloured by the `Type` column.
# NOTE(review): the title is identical to the one used for the base-model plot;
# consider distinguishing the two figures.
fig = px.scatter(
    data_frame=df_data_test,
    x='2d_x',
    y='2d_y',
    color='Type',
    hover_data=['Uniprot_AC', 'Sequence', 'Kingdom', 'Type'],
    title='UMAP on ProtTransT5 Embeddings SignalP5.0 Dataset Split 4',
    # color_discrete_sequence=px.colors.qualitative.Vivid_r,
)

# fig.update_layout(
#     margin=dict(l=30, r=30, t=30, b=30),
# )

# Static export of the figure.
# NOTE(review): this path is relative to the kernel's working directory, unlike
# the ROOT-based paths used for data files.
fig.write_image("./plots/umap_2_2d.png")

fig.show()

In [4]:
# Display the hidden size reported by the loaded model's config.
t5_base_model.config.hidden_size

1024