In [1]:
import re
from datasets import load_from_disk

from src._shared import load_tokenizers, load_config

Matplotlib created a temporary cache directory at /tmp/matplotlib-3p4i06jp because the default path (/home/lfi/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
# Read the project configuration, then build both tokenizers from it.
# tokenizer_plm is later applied to the protein sequences and tokenizer_llm
# to the text sentences.
config = load_config()
tokenizers = load_tokenizers(config)
tokenizer_plm, tokenizer_llm = tokenizers

In [4]:
# Output location for the processed dataset and input location of the raw one.
dataset_path_processed = "../tmp/data/processed/protein_go_contrastive_concatenated"
dataset_path = "../tmp/data/unprocessed/protein_go_contrastive_concatenated"

# Read the unprocessed dataset from disk.
dataset = load_from_disk(dataset_path)

In [5]:
# Show the split layout, then one raw training record as a sanity check.
print(dataset)
first_train_record = dataset['train'][0]
print(first_train_record)

DatasetDict({
    train: Dataset({
        features: ['proteins', 'sequences', 'sentences'],
        num_rows: 54487
    })
    test: Dataset({
        features: ['proteins', 'sequences', 'sentences'],
        num_rows: 6055
    })
})
{'proteins': 'H0Y327', 'sequences': 'XGREGLLGQRRPQDGQARSSCSPGGRTPAARDSIVREVIQNSKEVLSLLQEKNPAFKPVLAIIQAGDDNLMQEINQNLAEEAGLNITHICLPPDSSEAEIIDEILKINEDTRVHGLALQISENLFSNKVLNALKPEKDVDGVTDINLGKLVRGDAHECFVSPVAKAVIELLEKSVGVNLDGKKILVVGAHGSLEAALQCLFQRKGSMTMSIQWKTRQLQSKLHEADIVVLGSPKPEEIPLTWIQPGTTVLNCSHDFLSGKVGCGSPRIHFGGLIEEDDVILLAAALRIQNMVSSGRRWLREQQHRRWRLHCLKLQPLSPVPSDIEISRGQTP', 'sentences': 'Relevant to: organelle,intracellular organelle,intracellular anatomical structure,cellular_component,cellular anatomical entity,membrane-bounded organelle,cytoplasm,intracellular membrane-bounded organelle,mitochondrion; Not relevant to: negative regulation of acetylcholine-gated cation channel activity,negative regulation of protein sumoylation,regulation of telomerase activi

In [None]:
# Scan every split for entries whose 'proteins' field contains P84690
# (substring match) and print each matching record in full.
for split in dataset:
    matches = []
    for position, accession in enumerate(dataset[split]['proteins']):
        if 'P84690' in accession:
            matches.append(position)

    # Nothing to report for this split.
    if not matches:
        continue

    print(f"Found P84690 in {split} split at indices: {matches}")
    for idx in matches:
        print(f"\nEntry at index {idx}:")
        print(dataset[split][idx])


In [None]:
identifier_sequence = "sequences"
identifier_text = "sentences"

# Exclusive upper bound used for both the raw sequence length (in characters)
# and the tokenized text length (in tokens).
max_length = 256

for split in dataset:
    # Drop over-long sequences first so they are never tokenized.
    split_ds = dataset[split].filter(lambda x: len(x[identifier_sequence]) < max_length)

    # Replace the characters U, Z, O and B with X and separate every character
    # with a single space before handing the sequences to tokenizer_plm.
    processed_sequences = [" ".join(re.sub(r"[UZOB]", "X", seq)) for seq in split_ds[identifier_sequence]]
    split_ds = split_ds.add_column("sequence_processed", processed_sequences)

    # Tokenize without padding or truncation; length limits are enforced by the
    # filters in this cell instead.
    tknz_plm = tokenizer_plm(text=processed_sequences, padding=False, truncation=False)
    tknz_llm = tokenizer_llm(text=list(split_ds[identifier_text]), padding=False, truncation=False)

    # add_column returns a NEW dataset rather than modifying in place, so every
    # result must be re-assigned (the attention_mask_text column was previously
    # dropped because its result was discarded).
    split_ds = split_ds.add_column("input_ids_sequence", tknz_plm["input_ids"])
    split_ds = split_ds.add_column("attention_mask_sequence", tknz_plm["attention_mask"])
    split_ds = split_ds.add_column("input_ids_text", tknz_llm["input_ids"])
    split_ds = split_ds.add_column("attention_mask_text", tknz_llm["attention_mask"])

    # Finally drop rows whose tokenized text is too long.
    split_ds = split_ds.filter(lambda x: len(x['input_ids_text']) < max_length)

    # Publish the fully processed split only once all steps have succeeded.
    dataset[split] = split_ds


In [None]:
# Show the split layout after processing, then the second training record.
print(dataset)
second_train_record = dataset['train'][1]
print(second_train_record)

In [None]:
# Write the dataset (all splits) to the processed-data path defined earlier.
dataset.save_to_disk(dataset_path_processed)

In [None]:
import matplotlib.pyplot as plt

# Number of text tokens per example, keyed by split name.
lengths = {
    split: [len(ids) for ids in dataset[split]['input_ids_text']]
    for split in dataset
}

# Overlay one semi-transparent histogram per split on a shared axis.
fig, ax = plt.subplots(figsize=(10, 6))
for split, split_lengths in lengths.items():
    ax.hist(split_lengths, bins=50, alpha=0.5, label=split)

ax.set_xlabel('Sequence Length')
ax.set_ylabel('Count')
ax.set_title('Distribution of Text Token Lengths')
ax.legend()
plt.show()
