# 새롭게 만든 실제 저자의 BioT5 FineTuning dataset

In [12]:
import os
import json
import pandas as pd
from huggingface_hub import hf_hub_download

# --------------------------------------------------
# Basic configuration
# --------------------------------------------------
# Hugging Face Hub dataset repository the task JSON files are downloaded from.
REPO_ID = "QizhiPei/BioT5_finetune_dataset"

# Task kind -> split name -> path of the JSON file inside the repository.
TASKS = {
    "text2mol": {
        "train": "tasks/task1_chebi20_text2mol_train.json",
        "validation": "tasks/task2_chebi20_text2mol_validation.json",
        "test": "tasks/task3_chebi20_text2mol_test.json",
    },
    "mol2text": {
        "train": "tasks/task4_chebi20_mol2text_train.json",
        "validation": "tasks/task5_chebi20_mol2text_validation.json",
        "test": "tasks/task6_chebi20_mol2text_test.json",
    },
}

# ⚠️ Set back to the output path that was agreed on earlier.
OUT_DIR = "/app/Mol-LLM_Custom/dataset/real_train/raw"
# Create the output directory up front; an already existing directory is fine.
os.makedirs(OUT_DIR, exist_ok=True)


# --------------------------------------------------
# 1. Download every task JSON
# --------------------------------------------------
def download_all_tasks():
    """Download every file listed in ``TASKS`` and report where each one landed.

    Returns:
        A dict shaped like ``TASKS`` (task kind -> split name -> local path).
        Every task kind is present as a key; a split whose download failed
        is simply absent from its inner dict.

    A failed download is logged to stdout and skipped (best effort), so one
    broken file does not abort the remaining downloads.
    """
    fetched = {kind: {} for kind in TASKS}
    for kind, splits in TASKS.items():
        bucket = fetched[kind]
        for split in splits:
            remote = splits[split]
            try:
                local_path = hf_hub_download(
                    filename=remote,
                    repo_type="dataset",
                    repo_id=REPO_ID,
                )
                print(f"[OK] {kind} {split}: {remote} -> {local_path}")
                bucket[split] = local_path
            except Exception as err:
                # Deliberately broad: report the failure and keep going.
                print(f"[FAIL] {kind} {split}: {remote}")
                print("       ", err)
    return fetched


# --------------------------------------------------
# 2. Pull the instance list out of a task JSON
# --------------------------------------------------
def load_instances(path):
    """Read the JSON file at *path* and return its list of instances.

    Accepted layouts:
      * a top-level list, returned as is;
      * a top-level dict, in which case the value stored under the first
        key found among ``"Instances"``, ``"instances"``, ``"data"``
        (checked in that order) is returned.

    Raises:
        ValueError: the top-level value is neither a list nor a dict, or it
            is a dict that has none of the recognised keys.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        return data

    if not isinstance(data, dict):
        raise ValueError(f"{path} : 알 수 없는 JSON 타입 {type(data)}")

    # Key priority matters when several of them are present.
    for key in ("Instances", "instances", "data"):
        if key in data:
            return data[key]

    raise ValueError(
        f"{path} : dict 구조인데 'Instances' 같은 리스트 키를 찾을 수 없음. "
        f"keys={list(data.keys())[:10]}"
    )


# --------------------------------------------------
# 3. SELFIES detection & cleaner
# --------------------------------------------------
def looks_like_selfies(s: str) -> bool:
    """Return True when *s* is made up entirely of SELFIES-style tokens.

    A string qualifies when, after trimming whitespace and an optional
    ``<bom>`` prefix / ``<eom>`` suffix, it contains at least one non-empty
    ``[...]`` token and nothing outside the tokens except whitespace and
    ``.`` separators.

    Merely testing for the presence of "[" and "]" is not enough: free-text
    descriptions often contain bracketed locants (for example
    "bicyclo[2.2.1]heptane"), which would make a description
    indistinguishable from a molecule string.
    """
    body = s.strip()
    if body.startswith("<bom>"):
        body = body[len("<bom>"):]
    if body.endswith("<eom>"):
        body = body[:-len("<eom>")]

    tokens = 0
    # -1 while outside a token; otherwise the number of characters seen
    # inside the currently open "[...]".
    token_len = -1
    for ch in body:
        if ch.isspace():
            continue
        if ch == "[":
            if token_len >= 0:  # nested "[" is not a valid token
                return False
            token_len = 0
        elif ch == "]":
            if token_len <= 0:  # unmatched "]" or empty "[]"
                return False
            token_len = -1
            tokens += 1
        elif token_len >= 0:
            token_len += 1
        elif ch != ".":
            # Any other character outside a token means this is free text.
            return False

    return tokens > 0 and token_len < 0

def clean_selfies(s: str) -> str:
    """Strip surrounding whitespace and the ``<bom>`` / ``<eom>`` markers.

    At most one leading ``<bom>`` and one trailing ``<eom>`` are removed,
    and the remainder is whitespace-trimmed once more before it is returned.
    """
    prefix, suffix = "<bom>", "<eom>"
    core = s.strip()
    core = core[len(prefix):] if core.startswith(prefix) else core
    core = core[:len(core) - len(suffix)] if core.endswith(suffix) else core
    return core.strip()


# --------------------------------------------------
# 4. Instances -> DataFrame(SELFIES, description)
#    (used for both mol2text and text2mol)
# --------------------------------------------------
def instances_to_df_selfies_desc(instances):
    """Convert task instances into a DataFrame with SELFIES/description columns.

    Args:
        instances: iterable of dicts. ``"input"`` must be a string;
            ``"output"`` is either a string or a list whose first element is
            used (an empty list or a missing key counts as ``""``).

    Returns:
        A ``pandas.DataFrame`` with exactly the columns ``"SELFIES"`` and
        ``"description"``, one row per instance. The columns are present even
        when *instances* is empty, so a CSV written from the result always
        carries its header.

    Raises:
        ValueError: an instance whose input or (resolved) output is not a string.
    """
    rows = []
    for inst in instances:
        inp = inst.get("input")
        out_field = inst.get("output", "")

        # When the output is a list, use its first element.
        if isinstance(out_field, list):
            out = out_field[0] if out_field else ""
        else:
            out = out_field

        if not isinstance(inp, str) or not isinstance(out, str):
            raise ValueError(f"input/output이 문자열이 아님: {inst}")

        # Decide which side is the molecule. Only an unambiguous
        # "output is SELFIES, input is not" swaps the roles; in every other
        # case (including both/neither looking like SELFIES) the input is
        # taken as SELFIES and the output as the description.
        if looks_like_selfies(out) and not looks_like_selfies(inp):
            selfies, desc = out, inp
        else:
            selfies, desc = inp, out

        rows.append({
            "SELFIES": clean_selfies(selfies),
            "description": desc,
        })

    # Pin the columns so an empty input still yields the expected schema.
    return pd.DataFrame(rows, columns=["SELFIES", "description"])


# --------------------------------------------------
# Main entry point
# --------------------------------------------------
def _export_task_csvs(kind, split_paths):
    """Write one CSV per downloaded split of the task *kind* into OUT_DIR.

    Args:
        kind: task kind, used in log messages and in the CSV file name
            (``chebi20_<kind>_<split>.csv``).
        split_paths: mapping of split name -> local JSON path. An empty
            mapping only produces a warning.
    """
    if not split_paths:
        print(f"[WARN] {kind} JSON을 하나도 못 받음")
        return

    for split, path in split_paths.items():
        if not os.path.exists(path):
            print(f"[SKIP] {kind} {split}: 로컬 파일 없음 ({path})")
            continue

        instances = load_instances(path)
        print(f"[INFO] {kind} {split}: {len(instances)} instances 로드")

        df = instances_to_df_selfies_desc(instances)
        out_csv = os.path.join(OUT_DIR, f"chebi20_{kind}_{split}.csv")
        df.to_csv(out_csv, index=False)
        print(f"[OK] CSV 저장: {out_csv} ({len(df)} rows)")


def main():
    """Download all task JSON files and convert each split into a CSV.

    mol2text is processed first, then text2mol; both go through the same
    conversion and differ only in the output file name.
    """
    print("==== 다운로드 & CSV 생성 시작 ====")
    local_paths = download_all_tasks()

    for kind in ("mol2text", "text2mol"):
        _export_task_csvs(kind, local_paths.get(kind, {}))

    print("==== 완료 ====")


if __name__ == "__main__":
    main()


==== 다운로드 & CSV 생성 시작 ====
[OK] text2mol train: tasks/task1_chebi20_text2mol_train.json -> /root/.cache/huggingface/hub/datasets--QizhiPei--BioT5_finetune_dataset/snapshots/9f70da9e8f0df32e7e62846f7ad78a829b00c0fd/tasks/task1_chebi20_text2mol_train.json
[OK] text2mol validation: tasks/task2_chebi20_text2mol_validation.json -> /root/.cache/huggingface/hub/datasets--QizhiPei--BioT5_finetune_dataset/snapshots/9f70da9e8f0df32e7e62846f7ad78a829b00c0fd/tasks/task2_chebi20_text2mol_validation.json
[OK] text2mol test: tasks/task3_chebi20_text2mol_test.json -> /root/.cache/huggingface/hub/datasets--QizhiPei--BioT5_finetune_dataset/snapshots/9f70da9e8f0df32e7e62846f7ad78a829b00c0fd/tasks/task3_chebi20_text2mol_test.json
[OK] mol2text train: tasks/task4_chebi20_mol2text_train.json -> /root/.cache/huggingface/hub/datasets--QizhiPei--BioT5_finetune_dataset/snapshots/9f70da9e8f0df32e7e62846f7ad78a829b00c0fd/tasks/task4_chebi20_mol2text_train.json
[OK] mol2text validation: tasks/task5_chebi20_mol2tex

In [None]:
# train: task4_chebi20_mol2text_train
# valid: task5_chebi20_mol2text_validation
# test: task6_chebi20_mol2text_test

# train: task1_chebi20_text2mol_train
# valid: task2_chebi20_text2mol_validation
# test: task3_chebi20_text2mol_test


# 기존 Chebi-20 다운로드 받는 코드

In [None]:
import os
import pandas as pd
from datasets import load_dataset
from rdkit import Chem
import selfies as sf

# ===== User settings =====
DATASET_NAME = "duongttr/chebi-20-new"
OUT_ROOT = "/app/Mol-LLM_Custom/dataset/real_train/raw"   # the raw/ directory under download_dataset.py's raw_data_root
SPLIT_NAME = "train"                  # usually a single split; change this if the dataset is already split
SEED = 42
# ======================

def pick(cols, cands):
    """Return the first entry of *cands* that is contained in *cols*, else None."""
    return next((name for name in cands if name in cols), None)

def canon_smiles(s):
    """Return the canonical SMILES for *s*, or None when *s* is unusable.

    None is returned when *s* is not a string, is blank, or cannot be parsed
    by ``Chem.MolFromSmiles``. Callers can therefore drop bad rows with a
    single ``is not None`` check.
    """
    # Missing / non-string / blank values are rejected before reaching RDKit,
    # so they end up filtered out instead of raising or producing an empty row.
    if not isinstance(s, str) or not s.strip():
        return None
    m = Chem.MolFromSmiles(s)
    return Chem.MolToSmiles(m) if m is not None else None

# Load the configured split of the source dataset.
ds = load_dataset(DATASET_NAME, split=SPLIT_NAME)

# Work out which of the known column-name variants this dataset uses.
cols = set(ds.column_names)
cap_col = pick(cols, ["description", "caption", "text", "molecular_caption", "molecular_captions"])
selfies_col = pick(cols, ["SELFIES", "selfies"])
smiles_col = pick(cols, ["SMILES", "smiles", "smi"])

# Explicit raises rather than `assert`, so the checks still run under `python -O`.
if not cap_col:
    raise ValueError("캡션/설명 열을 찾지 못했습니다. (description/caption/text 등 후보 확인)")

# Guarantee a "SELFIES" column.
if selfies_col is None:
    if not smiles_col:
        raise ValueError("SELFIES가 없으므로 SMILES 열이 필요합니다.")
    # Canonicalise the SMILES, drop rows that could not be canonicalised,
    # then encode the remaining ones as SELFIES.
    ds = ds.map(lambda x: {"_smi_canon": canon_smiles(x[smiles_col])})
    ds = ds.filter(lambda x: x["_smi_canon"] is not None)
    ds = ds.map(lambda x: {"SELFIES": sf.encoder(x["_smi_canon"])})
elif selfies_col != "SELFIES":
    ds = ds.rename_column(selfies_col, "SELFIES")

# Standardise the caption column name to "description".
if cap_col != "description":
    ds = ds.rename_column(cap_col, "description")

# Keep only the columns that are written out.
keep = ["SELFIES", "description"]
ds = ds.remove_columns([c for c in ds.column_names if c not in keep])

# 80/10/10 split: hold out 10% as test, then take ~1/9 of the remaining 90%
# as validation (0.111... of 0.9 ~= 0.1). The first test_size must be 0.1;
# with 0.2 the result would be roughly 71/9/20 instead.
splits = ds.train_test_split(test_size=0.1, seed=SEED)
tmp = splits["train"].train_test_split(test_size=0.111111, seed=SEED)
train, valid, test = tmp["train"], tmp["test"], splits["test"]

os.makedirs(OUT_ROOT, exist_ok=True)
for part, csv_name in (
    (train, "BioT5_chebi20_train.csv"),
    (valid, "BioT5_chebi20_valid.csv"),
    (test, "BioT5_chebi20_test.csv"),
):
    part.to_pandas()[keep].to_csv(os.path.join(OUT_ROOT, csv_name), index=False)
print("CSV saved to:", OUT_ROOT)

CSV saved to: /app/Mol-LLM_Custom/dataset/real_train/raw
