In [None]:
import os

import pandas as pd
import torch

from bioemu.observables.folding_stability import compute_folded_proportion_from_dG
from datasets import DatasetDict, load_dataset

# Shared random seed for this notebook; passed as `random_state` when sampling
# rows further down so that the sampled rows are reproducible.
seed = 42

In [None]:
# 1. Load the MegaScale dataset (configuration "dataset2").
dataset_tag = "dataset2"
dataset2 = load_dataset(
    path="RosettaCommons/MegaScale", name=dataset_tag, data_dir=dataset_tag
)

# 2. First split: 80% train / 20% held out (val + test).
#    The notebook-wide `seed` is used instead of a repeated literal so that
#    changing the seed in one place affects every stochastic step.
train_testvalid = dataset2["train"].train_test_split(test_size=0.2, seed=seed)

# 3. Second split: halve the held-out 20% into two equal parts (10% each).
test_valid = train_testvalid["test"].train_test_split(test_size=0.5, seed=seed)

# 4. Assemble the final DatasetDict with explicit split names.
dataset2_splits = DatasetDict(
    {
        "train": train_testvalid["train"],  # 80%
        "val": test_valid["train"],  # 10%
        "test": test_valid["test"],  # 10%
    }
)

In [None]:
# Write every split to its own CSV file, creating the output directory first.
os.makedirs("datasets/megascale", exist_ok=True)

csv_targets = {
    "train": "datasets/megascale/train.csv",
    "val": "datasets/megascale/val.csv",
    "test": "datasets/megascale/test.csv",
}
for split_name, csv_path in csv_targets.items():
    dataset2_splits[split_name].to_csv(csv_path)

In [None]:
def _load_split_with_p_folded(csv_path: str) -> pd.DataFrame:
    """Load one split CSV, keep rows with a numeric ``dG_ML``, and add ``p_folded``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A new DataFrame in which ``dG_ML`` has been converted with
        ``pd.to_numeric``, rows whose ``dG_ML`` could not be parsed are
        removed, and a ``p_folded`` column holds the output of
        ``compute_folded_proportion_from_dG`` applied to the negated
        ``dG_ML`` values.
    """
    split = pd.read_csv(csv_path)
    # Convert dG_ML to float; entries that are not numbers become NaN ...
    split["dG_ML"] = pd.to_numeric(split["dG_ML"], errors="coerce")
    # ... and are filtered out here.
    split = split.dropna(subset=["dG_ML"])
    # dG_ML is negated before being handed to the conversion function.
    # NOTE(review): presumably this reconciles the sign convention of the
    # dataset with the one expected by the function -- confirm.
    negated_dg = torch.tensor(-split["dG_ML"].to_numpy())
    p_folded = compute_folded_proportion_from_dG(negated_dg).numpy()
    # `assign` returns a fresh frame, so no column is set on a filtered result.
    return split.assign(p_folded=p_folded)


# Apply the same load / clean / convert pipeline to each split.
train = _load_split_with_p_folded("datasets/megascale/train.csv")
val = _load_split_with_p_folded("datasets/megascale/val.csv")
test = _load_split_with_p_folded("datasets/megascale/test.csv")

In [None]:
# Randomly sample 5 rows from train and 1 row from val, considering only
# rows whose amino-acid sequence is shorter than 35 characters.
train_is_short = train["aa_seq"].str.len() < 35
train_sample = train.loc[train_is_short].sample(n=5, random_state=seed)

val_is_short = val["aa_seq"].str.len() < 35
val_sample = val.loc[val_is_short].sample(n=1, random_state=seed)

# Save both samples (without the DataFrame index) under test/megascale/.
os.makedirs("test/megascale", exist_ok=True)
train_sample.to_csv("test/megascale/train_sample.csv", index=False)
val_sample.to_csv("test/megascale/val_sample.csv", index=False)