In [1]:
from datasets import load_dataset

# Load the "test" split of the BioT5 fine-tuning dataset by its repo id.
# Login using e.g. `huggingface-cli login` to access this dataset
# NOTE(review): `ds` is not referenced by the later cells shown in this notebook.
ds = load_dataset("QizhiPei/BioT5_finetune_dataset", split="test")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# pip install -U huggingface_hub pandas selfies

from huggingface_hub import list_repo_files, hf_hub_download
import pandas as pd
import json, os, re

# ===== Settings =====
# Dataset repo id and the local directory the generated CSV files are written to.
REPO = "QizhiPei/BioT5_finetune_dataset"
OUT_DIR = "/app/Mol-LLM/dataset/train/raw"
os.makedirs(OUT_DIR, exist_ok=True)

# ===== List the files in the repo =====
files = list_repo_files(REPO, repo_type="dataset")

# Priority order: tasks_plus > tasks > splits_plus > splits
# (a lower index in this list wins when several files match a split)
PREFS = ["tasks_plus/", "tasks/", "splits_plus/", "splits/"]

def pick_bace_file(split: str) -> str:
    """Choose the repo file that holds the BACE data for ``split``.

    A file qualifies when its lower-cased path ends in ``.json``, contains
    ``"bace"`` and contains the split name (``"validation"`` also matches
    ``"valid"``).  Among qualifying files the one whose path contains the
    earliest ``PREFS`` entry wins; ties go to the shortest path, then to the
    file listed first.

    Args:
        split: Split name such as ``"train"``, ``"validation"`` or ``"test"``.

    Returns:
        The repo-relative path of the selected file.

    Raises:
        FileNotFoundError: if no file in ``files`` qualifies.
    """
    wanted = ("validation", "valid") if split == "validation" else (split,)

    def qualifies(path: str) -> bool:
        lowered = path.lower()
        return (
            lowered.endswith(".json")
            and "bace" in lowered
            and any(alias in lowered for alias in wanted)
        )

    def rank(path: str) -> tuple:
        # The folder preference is matched against the path as listed (not lower-cased).
        folder_rank = next((idx for idx, prefix in enumerate(PREFS) if prefix in path), 999)
        return (folder_rank, len(path))

    candidates = [path for path in files if qualifies(path)]
    if not candidates:
        raise FileNotFoundError(f"BACE {split} JSON을 찾지 못했습니다. repo={REPO}")
    # min() keeps the first of equally ranked paths, like a strict "<" scan would.
    return min(candidates, key=rank)

# Resolve one repo-relative JSON path per split up front; pick_bace_file raises
# FileNotFoundError if a split has no matching file.
paths = {
    "train"     : pick_bace_file("train"),
    "validation": pick_bace_file("validation"),
    "test"      : pick_bace_file("test"),
}

# ===== Utilities =====
def load_json_flex(local_path: str):
    """Read a JSON or JSON Lines file and return its records as a list.

    The content is first tried as JSON Lines (one document per non-blank
    line).  If any line fails to parse, or the first parsed line is not a
    dict, the whole file is parsed as one UTF-8 JSON document instead.  The
    result is then flattened so that ``"Instances"`` lists are unwrapped.

    Args:
        local_path: Path of the file to read.

    Returns:
        The flattened list of records.

    Raises:
        ValueError: if the parsed document is neither a dict nor a list, or
            the file is not valid JSON (``json.JSONDecodeError``).
    """
    with open(local_path, "rb") as fh:
        payload = fh.read()

    # 1) JSON Lines attempt: any failure just means "not JSON Lines".
    try:
        per_line = [json.loads(chunk) for chunk in payload.splitlines() if chunk.strip()]
    except Exception:
        per_line = None

    if per_line and isinstance(per_line[0], dict):
        # Each line may itself be a dataset-level dict holding "Instances";
        # the flattening step below takes care of that.
        parsed = per_line
    else:
        parsed = json.loads(payload.decode("utf-8"))

    # ---- Flattening ----
    def flatten_instances(node):
        if isinstance(node, list):
            holds_datasets = bool(node) and isinstance(node[0], dict) and "Instances" in node[0]
            if not holds_datasets:
                # Already a plain list of records.
                return node
            # Dataset-level dicts: concatenate every "Instances" list.
            return [
                record
                for entry in node
                if "Instances" in entry and isinstance(entry["Instances"], list)
                for record in entry["Instances"]
            ]

        if isinstance(node, dict):
            if "Instances" in node and isinstance(node["Instances"], list):
                return node["Instances"]
            # Otherwise use the first value that is an empty list or a list of dicts.
            nested = next(
                (
                    value
                    for value in node.values()
                    if isinstance(value, list) and (not value or isinstance(value[0], dict))
                ),
                None,
            )
            return [node] if nested is None else nested

        # Any other top-level type is unsupported.
        raise ValueError("지원하지 않는 JSON 구조")

    return flatten_instances(parsed)

# SELFIES만 뽑기
def extract_selfies(raw: str) -> str | None:
    if pd.isna(raw):
        return None
    s = str(raw)

    m = re.search(r"SELFIES:\s*<bom>(.*)</eom>", s, flags=re.DOTALL)
    if m:
        return m.group(1).strip()
    m = re.search(r"<bom>(.*)</eom>", s, flags=re.DOTALL)
    if m:
        return m.group(1).strip()
    for line in s.splitlines():
        if "SELFIES:" in line:
            line = line.split("SELFIES:", 1)[1].strip()
            line = re.sub(r"^<bom>\s*|\s*</eom>$", "", line).strip()
            return line
    return s.strip()

def normalize_label(v) -> str:
    """Map a raw label to the string ``"True"`` or ``"False"``.

    Strings are first matched case-insensitively (after stripping) against
    the known yes/no spellings.  Anything else is treated as a number, where
    a value greater than zero means ``"True"``.  Values that cannot be
    converted to ``float`` are ``"True"`` only if their text form is "true".
    """
    truthy = {"true", "yes", "y", "1"}
    falsy = {"false", "no", "n", "0"}

    if isinstance(v, str):
        token = v.strip().lower()
        if token in truthy:
            return "True"
        if token in falsy:
            return "False"

    try:
        is_positive = float(v) > 0
    except Exception:
        # Not numeric: only a literal "true" counts as positive.
        is_positive = str(v).strip().lower() == "true"
    return "True" if is_positive else "False"

# Optional: SELFIES validity check
try:
    import selfies as sf
except Exception:
    # selfies could not be imported: fall back to "non-empty string" as the only check.
    def is_valid_selfies(s: str) -> bool:
        """Return True when ``s`` is a non-empty string."""
        return isinstance(s, str) and len(s) > 0
else:
    def is_valid_selfies(s: str) -> bool:
        """Return True when ``sf.decoder`` accepts ``s`` without raising."""
        try:
            sf.decoder(s)
        except Exception:
            return False
        return True

_INPUT_KEYS = ("input", "raw_input", "selfies", "x", "mol")
_OUTPUT_KEYS = ("output", "label", "labels", "y", "target", "answer")


def _pick_field(example: dict, aliases: tuple):
    """Return the value stored under the first alias present in ``example`` (case-insensitive).

    If that value is itself a dict, one more level is unwrapped using the same
    aliases plus ``"text"``.  Returns ``None`` when nothing matches.
    """
    by_lower = {key.lower(): key for key in example}
    key = next((by_lower[alias] for alias in aliases if alias in by_lower), None)
    if key is None:
        return None
    value = example[key]
    if not isinstance(value, dict):
        return value
    nested_aliases = aliases + ("text",)
    inner_key = next((k for k in value if k.lower() in nested_aliases), None)
    return value.get(inner_key)


def to_biot5_df(examples: list[dict]) -> pd.DataFrame:
    """Normalize instance records into a two-column ``SELFIES`` / ``label`` frame.

    For each dict in ``examples`` the input and output fields are looked up
    case-insensitively under several aliases.  The input is reduced to a
    SELFIES string with ``extract_selfies`` and the output to "True"/"False"
    with ``normalize_label``.  Records that are not dicts, or that lack either
    field, are skipped.  Rows whose SELFIES is missing or rejected by
    ``is_valid_selfies`` are dropped, and a warning with the count is printed.

    Args:
        examples: Sample records, e.g. the list returned by ``load_json_flex``.

    Returns:
        A DataFrame with exactly the columns ``["SELFIES", "label"]``.  It is
        empty, but still has both columns, when no record is usable.
    """
    rows = []
    for ex in examples:
        if not isinstance(ex, dict):
            continue

        input_val = _pick_field(ex, _INPUT_KEYS)
        output_val = _pick_field(ex, _OUTPUT_KEYS)
        if input_val is None or output_val is None:
            # Could not locate both fields: skip this record.
            continue

        # The output may be a list or dict; reduce it to a single scalar.
        if isinstance(output_val, list):
            output_val = output_val[0] if output_val else ""
        if isinstance(output_val, dict):
            output_val = next(
                (v for v in output_val.values() if isinstance(v, (str, int, float, bool))),
                str(output_val),
            )

        rows.append({"SELFIES": extract_selfies(str(input_val)), "label": normalize_label(output_val)})

    # Explicit columns: without them an empty `rows` yields a frame with no
    # columns and the df["SELFIES"] lookup below raises KeyError.
    df = pd.DataFrame(rows, columns=["SELFIES", "label"])
    if df.empty:
        return df

    # Drop invalid SELFIES
    mask = df["SELFIES"].notna() & df["SELFIES"].map(is_valid_selfies)
    n_invalid = int((~mask).sum())
    if n_invalid:
        print(f"[경고] 유효하지 않은 SELFIES {n_invalid}개 제거")
    df = df[mask].reset_index(drop=True)
    return df[["SELFIES", "label"]]

# ===== Download → parse → save as CSV =====
# For every split: fetch the selected repo file, flatten it into records,
# normalize to SELFIES/label rows and write BioT5_bace_<split>.csv into OUT_DIR.
for split, rel in paths.items():
    local_path = hf_hub_download(REPO, rel, repo_type="dataset")
    ex = load_json_flex(local_path)
    df = to_biot5_df(ex)
    # The "validation" split is written under the shorter name "valid".
    fname = f"BioT5_bace_{'valid' if split=='validation' else split}.csv"
    out_path = os.path.join(OUT_DIR, fname)
    df.to_csv(out_path, index=False)
    print(f"{split:11s} {rel}  ->  {out_path}  (rows={len(df)})")

# Closing hint: prints the read_csv calls that are expected to load these files.
print("완료! 이제 다음 분기가 그대로 동작합니다:")
print(f"pd.read_csv(os.path.join('<raw_data_root>', 'raw', 'BioT5_bace_train.csv'))")
print(f"pd.read_csv(os.path.join('<raw_data_root>', 'raw', 'BioT5_bace_valid.csv'))")
print(f"pd.read_csv(os.path.join('<raw_data_root>', 'raw', 'BioT5_bace_test.csv'))")


[경고] 유효하지 않은 SELFIES 1210개 제거
train       tasks_plus/task31_bace_molnet_train.json  ->  /app/Mol-LLM/dataset/train/raw/BioT5_bace_train.csv  (rows=0)
[경고] 유효하지 않은 SELFIES 151개 제거
validation  tasks_plus/task32_bace_molnet_valid.json  ->  /app/Mol-LLM/dataset/train/raw/BioT5_bace_valid.csv  (rows=0)
[경고] 유효하지 않은 SELFIES 152개 제거
test        tasks_plus/task33_bace_molnet_test.json  ->  /app/Mol-LLM/dataset/train/raw/BioT5_bace_test.csv  (rows=0)
완료! 이제 다음 분기가 그대로 동작합니다:
pd.read_csv(os.path.join('<raw_data_root>', 'raw', 'BioT5_bace_train.csv'))
pd.read_csv(os.path.join('<raw_data_root>', 'raw', 'BioT5_bace_valid.csv'))
pd.read_csv(os.path.join('<raw_data_root>', 'raw', 'BioT5_bace_test.csv'))


In [5]:
# Required packages (skip if already installed)
# pip install -U deepchem selfies rdkit-pypi pandas

import os
import numpy as np
import pandas as pd
from rdkit import Chem
import selfies as sf
import deepchem as dc

RAW_DATA_ROOT = "/app/Mol-LLM/dataset/train"   # <- the raw_data_root from your config
# CSV files are written to the "raw" subdirectory, created here if missing.
RAW_DIR = os.path.join(RAW_DATA_ROOT, "raw")
os.makedirs(RAW_DIR, exist_ok=True)

def get_bace_loader():
    """Locate a callable BACE loader in the installed DeepChem.

    Several attribute names on ``dc.molnet`` are probed in order, because the
    name may differ between DeepChem versions.  If none exists, a generic
    ``load_function("bace")`` factory is tried as a best effort.

    Returns:
        The loader callable.

    Raises:
        RuntimeError: if no loader could be found.
    """
    # 1) Most direct name first, then guesses at older/alternative names.
    for attr in ("load_bace", "load_bace_classification", "load_bace_clf"):
        if hasattr(dc.molnet, attr):
            return getattr(dc.molnet, attr)

    # 2) Generic loader factory; any failure (including the import) is ignored.
    try:
        from deepchem.molnet import load_function
        candidate = load_function("bace")
    except Exception:
        candidate = None
    if callable(candidate):
        return candidate

    # 3) Nothing worked.
    raise RuntimeError(
        "이 DeepChem 버전에서 BACE 로더를 찾지 못했습니다. "
        "가능하면 deepchem>=2.6 를 권장합니다."
    )

def to_bool_str(y):
    """Convert a label to ``"True"`` or ``"False"``.

    Numeric values (anything ``float`` accepts) are ``"True"`` when greater
    than zero.  Non-numeric values are ``"True"`` only when their stripped,
    lower-cased text is "1", "true" or "yes".
    """
    try:
        positive = float(y) > 0.0
    except Exception:
        positive = str(y).strip().lower() in ("1", "true", "yes")
    return "True" if positive else "False"

def dc_dataset_to_csv(dc_dataset, out_csv_path):
    """Write a dataset as a ``SELFIES,label`` CSV file.

    Each entry of ``dc_dataset.X`` is converted to SMILES with RDKit and then
    to SELFIES.  Entries that are ``None``, give an empty SMILES, or fail
    SELFIES encoding are skipped.  Labels come from ``dc_dataset.y`` flattened
    to one dimension and are written as "True"/"False".

    Args:
        dc_dataset: Object exposing ``X`` and ``y``.
            # assumes X holds RDKit Mol objects (Raw featurizer) — TODO confirm
        out_csv_path: Destination CSV path.
    """
    flat_labels = np.array(dc_dataset.y).reshape(-1)  # (N,1) -> (N,)

    selfies_column = []
    label_column = []
    for molecule, raw_label in zip(dc_dataset.X, flat_labels):
        if molecule is None:
            continue
        smiles = Chem.MolToSmiles(molecule)
        if not smiles:
            continue
        try:
            encoded = sf.encoder(smiles)
        except Exception:
            # Molecule cannot be expressed as SELFIES: skip it.
            continue
        selfies_column.append(encoded)
        label_column.append(to_bool_str(raw_label))

    df = pd.DataFrame(
        {"SELFIES": selfies_column, "label": label_column},
        columns=["SELFIES", "label"],
    )
    df.to_csv(out_csv_path, index=False)
    print(f"[saved] {out_csv_path} (rows={len(df)})")

# ── 1) Obtain the BACE loader from DeepChem ─────────────────────────────────
loader = get_bace_loader()

# ── 2) Load MoleculeNet BACE (Raw + scaffold split) ─────────────────────────
#    Parameter support may differ slightly between DeepChem versions, so the
#    call goes through kwargs with a fallback.
kwargs = dict(featurizer="Raw", splitter="scaffold", reload=True)
try:
    tasks, datasets, transformers = loader(**kwargs)
except TypeError:
    # Some versions may require data_dir/save_dir or not support reload → retry with minimal arguments
    tasks, datasets, transformers = loader(featurizer="Raw", splitter="scaffold")

# datasets is usually ordered (train, valid, test)
train_dc, valid_dc, test_dc = datasets

# ── 3) Save as CSV ──────────────────────────────────────────────────────────
# NOTE: these are the same file names/directory the earlier BioT5 export cell
# writes to, so any files it produced are overwritten here.
dc_dataset_to_csv(train_dc, os.path.join(RAW_DIR, "BioT5_bace_train.csv"))
dc_dataset_to_csv(valid_dc, os.path.join(RAW_DIR, "BioT5_bace_valid.csv"))
dc_dataset_to_csv(test_dc,  os.path.join(RAW_DIR, "BioT5_bace_test.csv"))

# ── 4) Preview ──────────────────────────────────────────────────────────────
# Show the first three rows of every CSV that exists.
for name in ["train","valid","test"]:
    p = os.path.join(RAW_DIR, f"BioT5_bace_{name}.csv")
    if os.path.exists(p):
        print(f"\n[{name}] preview")
        print(pd.read_csv(p).head(3))


[saved] /app/Mol-LLM/dataset/train/raw/BioT5_bace_train.csv (rows=1210)
[saved] /app/Mol-LLM/dataset/train/raw/BioT5_bace_valid.csv (rows=151)
[saved] /app/Mol-LLM/dataset/train/raw/BioT5_bace_test.csv (rows=152)

[train] preview
                                             SELFIES  label
0  [C][N][C][=Branch1][C][=O][C@@][Branch1][P][C]...   True
1  [C][N][C][=Branch1][C][=O][C@@][Branch1][P][C]...   True
2  [C][N][C][=Branch1][C][=O][C@@][Branch1][P][C]...   True

[valid] preview
                                             SELFIES  label
0  [C][C][Branch1][C][C][Branch1][C][C][C][=C][C]...  False
1  [C][C][=Branch1][C][=O][N][C][Branch1][S][C][C...  False
2  [C][C][Branch2][Ring1][O][C][=C][C][=C][C][Bra...  False

[test] preview
                                             SELFIES  label
0  [C][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=...   True
1  [C][C][Branch1][C][C][Branch1][C][C][C][C][=C]...   True
2  [C][O][C][C][=Branch1][C][=O][N][C][Branch2][R...   True
