In [1]:
%reload_ext autoreload
%autoreload 2

from _header_model import *

device = "cpu"

print("MPS Availible:\t", torch.backends.mps.is_available())
print(f"Using device:\t {device}")

MPS Availible:	 True
Using device:	 cpu


---
### DataLoader
Create HF Dataset

```py
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'cath_id','temperature', 'replica'],
        num_rows: n
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'cath_id','temperature', 'replica'],
        num_rows: n
    })
})
```

```rust
input_ids: Amino Acid Sequence
attention_mask: Padding Mask
cath_id: cath identifier i.e. 1a0rP01
replica: replica numbner in {0, 1, 2, 3, 4}
temperature: temperature of trajectory in {320, 348, 379, 413, 450}
sequence: original sequence
pssm: PSSM as numpy array dim(20, L)
```

In [2]:
t5_tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path=BASE_MODEL,
    do_lower_case=False,
    use_fast=True,
    legacy=False,
)

dict_pssm = np.load(os.path.join(FILE_PATHS["pssm"], "dict_pssm.npy"), allow_pickle=True).item()


def pssm_to_hf_dataset(dict_pssm: dict, tokenizer: T5Tokenizer) -> Dataset:
    # todo add train/test split
    ds_dict = {"cath_id": [], "temperature": [], "replica": [], "sequence": [], "sequence_processed": [], "pssm": []}

    for key, value in dict_pssm.items():
        parts = key.split("|")
        ds_dict["cath_id"].append(parts[0])
        ds_dict["temperature"].append(int(parts[1]))
        ds_dict["replica"].append(parts[2])
        ds_dict["sequence"].append(parts[3])
        ds_dict["sequence_processed"].append([" ".join(x) for x in parts[3]])
        ds_dict["pssm"].append(value.tolist())

    tokenized_sequences = tokenizer(
        text=[" ".join(x) for x in ds_dict["sequence"]],
        padding=False,
        truncation=True,
        max_length=512,
    )
    ds = Dataset.from_dict(tokenized_sequences)
    for key, value in ds_dict.items():
        ds = ds.add_column(key, value)
    
    # ds = ds.map(lambda examples: {'pssm': [torch.tensor(pssm) for pssm in examples['pssm']]}, batched=True)

    return DatasetDict({"train": ds, "test": ds})


ds = pssm_to_hf_dataset(dict_pssm=dict_pssm, tokenizer=t5_tokenizer)
ds = ds.remove_columns(["cath_id", "replica", "sequence", 'sequence_processed', "temperature"])
ds = ds.rename_column("pssm", "labels")
# ds = ds.remove_columns("labels")

ds["train"] = ds["train"].select([0, 49])
ds["test"] = ds["test"].select([0, 49])

print(ds)

i = 1
print(len(ds["train"]["attention_mask"][i]), ":", *ds["train"]["input_ids"][i])
print(len(ds["train"]["attention_mask"][i]), ':', *ds["train"]["attention_mask"][i])
display(pd.DataFrame(ds["train"]["labels"][i]))
print(type(torch.tensor(ds["train"]["labels"][i])))

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})
414 : 7 5 15 9 15 23 5 18 3 8 7 5 6 12 19 17 10 7 5 3 7 11 14 7 5 3 18 12 11 13 3 5 9 11 5 5 3 12 5 8 4 5 17 16 3 10 11 18 6 9 19 17 4 9 23 14 16 11 4 10 17 5 3 11 11 8 15 14 6 19 6 3 10 5 16 11 7 18 17 10 21 11 3 7 11 7 10 4 17 6 8 16 3 15 6 9 4 5 17 4 13 11 15 3 5 13 15 14 5 7 11 4 21 3 5 14 8 15 10 8 10 17 15 10 12 23 21 12 10 7 10 6 6 15 4 3 5 11 5 5 5 12 18 10 6 14 21 17 10 5 4 8 7 17 15 7 4 18 5 8 17 15 5 10 12 10 10 7 7 17 7 6 16 17 18 12 4 11 19 17 23 15 3 5 13 4 16 19 19 6 7 5 4 8 3 14 10 17 10 9 8 14 10 7 17 5 17 4 3 14 5 10 3 3 17 11 5 6 23 3 4 4 5 4 23 17 10 7 15 18 5 4 8 10 5 7 7 14 11 3 4 4 18 5 23 5 4 5 3 9 6 14 5 12 5 7 10 5 3 4 8 13 5 3 10 11 21 8 12 3 7 18 5 11 11 13 4 7 9 17 21 7 6 3 13 3 19 4 3 16 8 7 14 10 8 18 3 10 5 10 7 18 16 21 3 1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,403,404,405,406,407,408,409,410,411,412
0,-inf,1.632268,1.847997,0.201634,0.887525,2.711495,2.336283,2.419539,1.807355,0.378512,...,-3.321928,-inf,-inf,-inf,-2.736966,-inf,-inf,-inf,-inf,-inf
1,-inf,-1.152003,-1.736966,-2.0,-2.736966,-4.321928,-inf,-2.736966,-2.321928,-2.0,...,0.137504,-2.736966,-2.736966,-2.736966,-2.321928,-3.321928,-inf,-inf,-inf,-inf
2,4.459432,1.104337,2.510962,2.364572,1.485427,1.432959,1.405992,2.307429,1.722466,1.536053,...,2.419539,1.608809,1.906891,2.655352,3.008989,3.061776,2.847997,3.300124,2.217231,4.459432
3,-inf,-inf,-2.736966,-2.736966,-4.321928,-0.074001,-0.862496,-1.514573,0.432959,-3.321928,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
4,-inf,1.632268,0.536053,1.104337,2.292782,2.392317,2.485427,1.560715,2.035624,1.169925,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
5,-inf,-inf,-inf,-inf,-inf,-inf,-4.321928,-inf,-1.514573,-inf,...,0.722466,0.847997,1.432959,1.292782,0.201634,1.378512,-1.152003,1.292782,1.847997,-inf
6,-inf,-inf,-inf,-inf,-4.321928,-inf,-3.321928,-3.321928,-inf,-inf,...,0.963474,-0.234465,0.765535,0.765535,0.0,0.584963,-0.621488,1.232661,0.263034,-inf
7,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,...,-0.074001,0.201634,0.765535,1.632268,-0.152003,-1.514573,0.925999,-inf,-1.514573,-inf
8,-inf,-inf,-2.0,-inf,0.432959,-1.736966,-0.415037,1.632268,0.137504,1.722466,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
9,-inf,1.925999,1.292782,2.419539,0.321928,-0.415037,-0.321928,-0.736966,0.137504,1.722466,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf


<class 'torch.Tensor'>


---
### Model Loading and LoRA

In [3]:
t5_base_model, loading_info = T5EncoderModelForPssmGeneration.from_pretrained(
    pretrained_model_name_or_path=BASE_MODEL,
    output_loading_info=True,
    # device_map=device,
    # load_in_8bit=False,
    # custom_dropout_rate=0.1,
)

modules_to_save = ["classifier"]

lora_config = LoraConfig(
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q", "k", "v", "o"],
    bias="none",
    modules_to_save=loading_info['missing_keys'], # check if saving 'pssm_head' also works
)

t5_lora_model = peft.get_peft_model(t5_base_model, lora_config)
t5_lora_model.print_trainable_parameters()

Some weights of T5EncoderModelForPssmGeneration were not initialized from the model checkpoint at Rostlab/prot_t5_xl_uniref50 and are newly initialized: ['pssm_head.bn1.bias', 'pssm_head.bn1.num_batches_tracked', 'pssm_head.bn1.running_mean', 'pssm_head.bn1.running_var', 'pssm_head.bn1.weight', 'pssm_head.bn2.bias', 'pssm_head.bn2.num_batches_tracked', 'pssm_head.bn2.running_mean', 'pssm_head.bn2.running_var', 'pssm_head.bn2.weight', 'pssm_head.conv1.bias', 'pssm_head.conv1.weight', 'pssm_head.conv2.bias', 'pssm_head.conv2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 3,932,160 || all params: 1,216,534,272 || trainable%: 0.3232


In [4]:
loading_info

{'missing_keys': ['pssm_head.bn1.bias',
  'pssm_head.bn1.num_batches_tracked',
  'pssm_head.bn1.running_mean',
  'pssm_head.bn1.running_var',
  'pssm_head.bn1.weight',
  'pssm_head.bn2.bias',
  'pssm_head.bn2.num_batches_tracked',
  'pssm_head.bn2.running_mean',
  'pssm_head.bn2.running_var',
  'pssm_head.bn2.weight',
  'pssm_head.conv1.bias',
  'pssm_head.conv1.weight',
  'pssm_head.conv2.bias',
  'pssm_head.conv2.weight'],
 'unexpected_keys': ['lm_head.weight'],
 'mismatched_keys': [],
 'error_msgs': []}

---
### Model Training

In [5]:
data_collator = DataCollatorWithPadding(
    tokenizer=t5_tokenizer,
    padding=True,
    max_length=512,
)

training_args = TrainingArguments(
    output_dir=FILE_PATHS["models"],
    learning_rate=TRAINING_CONFIG["learning_rate"],
    per_device_train_batch_size=TRAINING_CONFIG["batch_size"],
    per_device_eval_batch_size=TRAINING_CONFIG["batch_size"] * 2,
    num_train_epochs=TRAINING_CONFIG["num_epochs"],
    logging_steps=TRAINING_CONFIG["logging_steps"],
    evaluation_strategy="steps", # use eval_strategy
    eval_steps=TRAINING_CONFIG["eval_steps"],
    save_strategy="steps",
    save_steps=TRAINING_CONFIG["save_steps"],
    remove_unused_columns=True,
    # label_names=["labels"],
    seed=SEED,
)

trainer = Trainer(
    model=t5_lora_model,
    args=training_args,
    train_dataset=ds["train"],
    # eval_dataset=dataset_signalp['valid'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [5]:
# print(*ds['train']['input_ids'], sep="\n")

In [6]:
# for name, param in t5_base_model.named_parameters():
#     print(name)

# t5_base_model.encoder.block[0].layer[0].SelfAttention.q.weight

In [6]:
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

trainer.train()

  0%|          | 0/10 [00:00<?, ?it/s]



ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
# parent_class_name = T5EncoderModelForPssmGeneration.__bases__[0].__name__
# parent_class_name