In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import random
import gc

import accelerate
import torch
import transformers
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import T5Tokenizer, Trainer, TrainingArguments

from src._shared import (
    load_config,
    setup_environment,
)
from src.model.configuration_md_pssm import MDPSSMConfig
from src.model.modeling_md_pssm import T5EncoderModelForPssmGeneration
from src.model.utils.data_collator import DataCollatorForT5Pssm

Matplotlib created a temporary cache directory at /tmp/matplotlib-131vfvbr because the default path (/home/lfi/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [3]:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

train_config = load_config()
model_name_identifier, device, report_to, run, USE_WANDB, SEED = setup_environment(train_config)

accelerate.utils.set_seed(SEED + 1)
transformers.set_seed(SEED + 2)
torch.manual_seed(SEED + 3)
random.seed(SEED + 4)

[34m[1mwandb[0m: Currently logged in as: [33mfinnlueth[0m ([33mfinnlueth-organization[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
2025/01/26 19:50:23 ERROR failed to get logger path error="error creating log directory: mkdir /home/lfi/.cache/wandb: permission denied"
2025/01/26 19:50:23 INFO server is running addr=127.0.0.1:46795
2025/01/26 19:50:23 INFO Will exit if parent process dies. ppid=654973
2025/01/26 19:50:23 INFO connection: ManageConnectionData: new connection created id=127.0.0.1:52696
2025/01/26 19:50:23 INFO handleInformInit: received streamId=kvvkpha3 id=127.0.0.1:52696
2025/01/26 19:50:24 INFO handleInformInit: stream started streamId=kvvkpha3 id=127.0.0.1:52696


Using device: cuda
Model identifier: prot-md-pssm-2025-01-26-19-50-23


In [4]:
config = MDPSSMConfig()
model = T5EncoderModelForPssmGeneration(config=config)

In [5]:
target_modules = ["q", "v"]
modules_to_save = ["pssm_head"]

lora_config = LoraConfig(
    inference_mode=False,
    r=train_config["lora"]["r"],
    lora_alpha=train_config["lora"]["lora_alpha"],
    lora_dropout=train_config["lora"]["lora_dropout"],
    target_modules=target_modules,
    bias="none",
    modules_to_save=modules_to_save,
    use_rslora=train_config["lora"]["use_rslora"],
    use_dora=train_config["lora"]["use_dora"],
)

model = get_peft_model(model, lora_config)

print("target_modules:", target_modules)
print("modules_to_save:", modules_to_save)
model.print_trainable_parameters()


target_modules: ['q', 'v']
modules_to_save: ['pssm_head']
trainable params: 3,949,884 || all params: 1,214,075,512 || trainable%: 0.3253


In [27]:
dataset = load_from_disk("../tmp/data/pssm/pssm_dataset_0_only")
dataset = dataset.select(range(22,30))
print(dataset)

for x in range(6):
    print(dataset["name"][x])
    print(dataset["sequence"][x])
display(torch.tensor(dataset["pssm_features"][x]))

dataset = dataset.rename_column("pssm_features", "labels")
dataset = dataset.remove_columns(["name", "sequence", "sequence_processed"])

print(dataset)

Dataset({
    features: ['name', 'sequence', 'sequence_processed', 'input_ids', 'attention_mask', 'pssm_features'],
    num_rows: 8
})
1a05A00_379_0
MKKIAIFAGDGIGPEIVAAARQVLDAVDQAAXLGLRCTEGLVGGAALDASDDPLPAASLQLAMAADAVILGAVGGPRWDAYPPAKRPEQGLLRLRKGLDLYANLRPAQIFPQLLDASPLRPELVRDVDILVVRELTGDIYFGQPRGLEVIDGKRRGFNTMVYDEDEIRRIAXVAFRAAQGRRKQLCSVDKANVLETTRLWREVVTEVARDYPDVRLSXMYVDNAAMQLIRAPAQFDVLLTGNMFGDILSDEASQLTGSIGMLPSASLGEGRAMYEPIXGSAPDIAGQDKANPLATILSVAMMLRXSLNAEPWAQRVEAAVQRVLDQGLRTADIAAPGTPVIGTKAMGAAVVNALNLK
1a05A00_413_0
MKKIAIFAGDGIGPEIVAAARQVLDAVDQAAXLGLRCTEGLVGGAALDASDDPLPAASLQLAMAADAVILGAVGGPRWDAYPPAKRPEQGLLRLRKGLDLYANLRPAQIFPQLLDASPLRPELVRDVDILVVRELTGDIYFGQPRGLEVIDGKRRGFNTMVYDEDEIRRIAXVAFRAAQGRRKQLCSVDKANVLETTRLWREVVTEVARDYPDVRLSXMYVDNAAMQLIRAPAQFDVLLTGNMFGDILSDEASQLTGSIGMLPSASLGEGRAMYEPIXGSAPDIAGQDKANPLATILSVAMMLRXSLNAEPWAQRVEAAVQRVLDQGLRTADIAAPGTPVIGTKAMGAAVVNALNLK
1a05A00_450_0
MKKIAIFAGDGIGPEIVAAARQVLDAVDQAAXLGLRCTEGLVGGAALDASDDPLPAASLQLAMAADAVILGAVGGPRWDAYPPAKRPEQGLLRLRKGLDLYANLRPA

tensor([[0.0188, 0.0041, 0.8411,  ..., 0.0203, 0.0042, 0.0040],
        [0.0711, 0.0381, 0.4436,  ..., 0.0814, 0.0077, 0.0028],
        [0.0721, 0.0199, 0.4376,  ..., 0.0612, 0.0036, 0.0028],
        ...,
        [0.0094, 0.0060, 0.2631,  ..., 0.1378, 0.0021, 0.0138],
        [0.0113, 0.0049, 0.3610,  ..., 0.0658, 0.0026, 0.0115],
        [0.0188, 0.0041, 0.8411,  ..., 0.0203, 0.0042, 0.0040]])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 8
})


In [29]:
(torch.tensor(dataset["pssm_features"][0])).shape

KeyError: "Column pssm_features not in the dataset. Current columns in the dataset: ['input_ids', 'attention_mask', 'labels']"

In [None]:

tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path="Rostlab/prot_t5_xl_uniref50",
    do_lower_case=False,
    use_fast=True,
    legacy=False,
)

data_collator = DataCollatorForT5Pssm(
    tokenizer=tokenizer,
    padding=True,
    max_length=512,
    pad_to_multiple_of=8,
    label_name="pssm_features",
)

def compute_metrics(eval_preds):
    return {}

training_args = TrainingArguments(
    output_dir=f"../tmp/models/checkpoints/{model_name_identifier}",
    run_name=model_name_identifier if USE_WANDB else None,
    report_to="wandb" if USE_WANDB else None,
    learning_rate=train_config["trainer"]["learning_rate"],
    per_device_train_batch_size=train_config["trainer"]["train_batch_size"],
    num_train_epochs=train_config["trainer"]["num_epochs"],
    eval_strategy=train_config["trainer"]["eval_strategy"],
    eval_steps=train_config["trainer"]["eval_steps"],
    per_device_eval_batch_size=train_config["trainer"]["eval_batch_size"],
    eval_on_start=train_config["trainer"]["eval_on_start"],
    batch_eval_metrics=train_config["trainer"]["batch_eval_metrics"],
    save_strategy=train_config["trainer"]["save_strategy"],
    save_steps=train_config["trainer"]["save_steps"],
    save_total_limit=train_config["trainer"]["save_total_limit"],
    remove_unused_columns=train_config["trainer"]["remove_unused_columns"],
    label_names=["input_ids", "attention_mask"],
    logging_strategy="steps",
    logging_steps=train_config["trainer"]["logging_steps"],
    seed=train_config["seed"],
    lr_scheduler_type=train_config["trainer"]["lr_scheduler_type"],
    warmup_steps=train_config["trainer"]["warmup_steps"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [51]:
for x in dataset['input_ids']:
    print(torch.tensor(x).shape)

for x in dataset['labels']:
    print(torch.tensor(x).shape)


torch.Size([358])
torch.Size([358])
torch.Size([358])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([357, 20])
torch.Size([357, 20])
torch.Size([357, 20])
torch.Size([63, 20])
torch.Size([63, 20])
torch.Size([63, 20])
torch.Size([63, 20])
torch.Size([63, 20])


In [80]:
torch.set_printoptions(profile="full")
# torch.set_printoptions(profile="default") 


gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

trainer.train()
trainer.evaluate()

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

AttributeError: 'BaseModelOutputWithPastAndCrossAttentions' object has no attribute 'shape'