In [1]:
from datasets import load_dataset

# Load the "test" split of QizhiPei/BioT5_finetune_dataset via `datasets.load_dataset`.
# Login using e.g. `huggingface-cli login` to access this dataset
# NOTE(review): `ds` is not referenced by the other cell visible here — confirm it is still needed.
ds = load_dataset("QizhiPei/BioT5_finetune_dataset", split="test")

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


In [3]:
# Required packages (skip this step if they are already installed)
# pip install -U deepchem selfies rdkit-pypi pandas

import os
import numpy as np
import pandas as pd
from rdkit import Chem
import selfies as sf
import deepchem as dc

# Root directory for the raw data files; RAW_DIR is its "raw" subdirectory.
RAW_DATA_ROOT = "/app/Mol-LLM_Custom/dataset/real_train"   # <- the raw_data_root value from your config
RAW_DIR = os.path.join(RAW_DATA_ROOT, "raw")
# Create the output directory up front; exist_ok=True keeps this cell safe to re-run.
os.makedirs(RAW_DIR, exist_ok=True)

def get_bace_loader():
    """
    Find a BACE loader in the installed DeepChem and return it.

    Lookup order:
      1. ``dc.molnet.load_bace``
      2. ``dc.molnet.load_bace_classification``, then ``dc.molnet.load_bace_clf``
      3. ``deepchem.molnet.load_function("bace")`` — used only when the import
         and the call both succeed and the result is callable.

    Returns:
        The first loader found by the lookups above.

    Raises:
        RuntimeError: if none of the lookups yields a loader.
    """
    molnet = dc.molnet

    # Steps 1 and 2: scan the known attribute names in priority order.
    candidate_names = ("load_bace", "load_bace_classification", "load_bace_clf")
    for attr_name in candidate_names:
        if hasattr(molnet, attr_name):
            return getattr(molnet, attr_name)

    # Step 3: generic loader factory. Any failure here is deliberately
    # swallowed so that the RuntimeError below is what the caller sees.
    try:
        from deepchem.molnet import load_function
        factory_result = load_function("bace")
    except Exception:
        factory_result = None
    if callable(factory_result):
        return factory_result

    # Step 4: nothing worked.
    raise RuntimeError(
        "이 DeepChem 버전에서 BACE 로더를 찾지 못했습니다. "
        "가능하면 deepchem>=2.6 를 권장합니다."
    )

def to_bool_str(y):
    """
    Normalize a label to the string "True" or "False".

    Values that ``float()`` accepts are "True" when strictly greater than 0.0
    and "False" otherwise (so NaN maps to "False"). Anything else is compared
    as stripped, lower-cased text against "1", "true" and "yes"; every other
    value maps to "False".
    """
    try:
        numeric = float(y)
    except Exception:
        # Not numeric: fall back to a textual truthiness check.
        text = str(y).strip().lower()
        return "True" if text in {"1", "true", "yes"} else "False"

    if numeric > 0.0:
        return "True"
    return "False"

def dc_dataset_to_csv(dc_dataset, out_csv_path):
    """
    Write a single-task dataset to a two-column CSV (``SELFIES``, ``label``).

    Each entry of ``dc_dataset.X`` is converted to SMILES with
    ``Chem.MolToSmiles`` and then to SELFIES with ``sf.encoder``; the matching
    entry of ``dc_dataset.y`` is normalized with ``to_bool_str``. Entries that
    are ``None``, yield an empty SMILES, or fail SELFIES encoding are skipped,
    and the number skipped is reported.

    Args:
        dc_dataset: object exposing ``X`` and ``y`` attributes.
            NOTE(review): assumes ``X`` holds RDKit molecules (i.e. the
            dataset was loaded with the "Raw" featurizer) — confirm at the
            call site.
        out_csv_path: destination path of the CSV file.

    Raises:
        ValueError: if the flattened labels do not line up one-to-one with the
            molecules (for example a multi-task ``y``), since zipping them
            would silently attach wrong labels.
    """
    mols = dc_dataset.X
    labels = np.asarray(dc_dataset.y).reshape(-1)  # (N, 1) -> (N,)

    # Flattening an (N, T>1) label array yields N*T values; zip() would then
    # pair molecule i with an unrelated label. Refuse instead of mislabeling.
    if len(labels) != len(mols):
        raise ValueError(
            f"expected one label per molecule, got {len(labels)} labels "
            f"for {len(mols)} molecules (multi-task datasets are not supported)"
        )

    rows = []
    skipped = 0
    for mol, y in zip(mols, labels):
        if mol is None:
            skipped += 1
            continue
        smi = Chem.MolToSmiles(mol)
        if not smi:
            skipped += 1
            continue
        try:
            selfies_str = sf.encoder(smi)
        except Exception:
            # Best effort: molecules that cannot be encoded are dropped,
            # but counted so the loss is visible below.
            skipped += 1
            continue
        rows.append({"SELFIES": selfies_str, "label": to_bool_str(y)})

    df = pd.DataFrame(rows, columns=["SELFIES", "label"])
    df.to_csv(out_csv_path, index=False)
    print(f"[saved] {out_csv_path} (rows={len(df)})")
    if skipped:
        print(f"[warn] {out_csv_path}: skipped {skipped} molecule(s) that could not be converted")

# ── 1) Obtain the BACE loader from DeepChem ─────────────────────────────────
loader = get_bace_loader()

# ── 2) Load MoleculeNet BACE (Raw featurizer + scaffold split) ──────────────
#    Keyword support may differ between DeepChem versions, so the full keyword
#    set is tried first and a minimal call is used as the fallback.
kwargs = {"featurizer": "Raw", "splitter": "scaffold", "reload": True}
try:
    tasks, datasets, transformers = loader(**kwargs)
except TypeError:
    # Some versions may require data_dir/save_dir or may not accept `reload`
    # -> retry with the minimal arguments.
    tasks, datasets, transformers = loader(featurizer="Raw", splitter="scaffold")

# `datasets` is usually ordered (train, valid, test)
train_dc, valid_dc, test_dc = datasets

# ── 3) Save each split as CSV ───────────────────────────────────────────────
for split_name, split_dc in (("train", train_dc), ("valid", valid_dc), ("test", test_dc)):
    dc_dataset_to_csv(split_dc, os.path.join(RAW_DIR, f"BioT5_bace_{split_name}.csv"))

# ── 4) Preview the first rows of every file that was written ────────────────
for name in ("train", "valid", "test"):
    p = os.path.join(RAW_DIR, f"BioT5_bace_{name}.csv")
    if not os.path.exists(p):
        continue
    print(f"\n[{name}] preview")
    print(pd.read_csv(p).head(3))


No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'dgl'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


[saved] /app/Mol-LLM_Custom/dataset/real_train/raw/BioT5_bace_train.csv (rows=1210)
[saved] /app/Mol-LLM_Custom/dataset/real_train/raw/BioT5_bace_valid.csv (rows=151)
[saved] /app/Mol-LLM_Custom/dataset/real_train/raw/BioT5_bace_test.csv (rows=152)

[train] preview
                                             SELFIES  label
0  [C][N][C][=Branch1][C][=O][C@@][Branch1][P][C]...   True
1  [C][N][C][=Branch1][C][=O][C@@][Branch1][P][C]...   True
2  [C][N][C][=Branch1][C][=O][C@@][Branch1][P][C]...   True

[valid] preview
                                             SELFIES  label
0  [C][C][Branch1][C][C][Branch1][C][C][C][=C][C]...  False
1  [C][C][=Branch1][C][=O][N][C][Branch1][S][C][C...  False
2  [C][C][Branch2][Ring1][O][C][=C][C][=C][C][Bra...  False

[test] preview
                                             SELFIES  label
0  [C][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=...   True
1  [C][C][Branch1][C][C][Branch1][C][C][C][C][=C]...   True
2  [C][O][C][C][=Branch1][C][=O][N][C][Br