In [None]:
# pip install -U huggingface_hub pandas
from huggingface_hub import list_repo_files, hf_hub_download
import json, os, re
import pandas as pd

# Hugging Face dataset repository that hosts the BioT5 fine-tuning files.
REPO = "QizhiPei/BioT5_finetune_dataset"
# NOTE(review): this root already ends in "raw" and OUT_DIR appends another
# "raw", so the CSVs are written to /appdataset/train/raw/raw — confirm intended.
RAW_DATA_ROOT = "/appdataset/train/raw"   # e.g. "./data"
OUT_DIR = os.path.join(RAW_DATA_ROOT, "raw")
os.makedirs(OUT_DIR, exist_ok=True)

# 1) List the files in the dataset repository
files = list_repo_files(REPO, repo_type="dataset")

# 2) BACE JSON selection logic (priority: tasks_plus > tasks > splits_plus > splits)
PREFS = ["tasks_plus/", "tasks/", "splits_plus/", "splits/"]
def pick_bace_file(split):
    """Return the repo path of the BACE JSON file that best matches ``split``.

    A path qualifies when (case-insensitively) it ends in ``.json``, contains
    ``bace`` and contains the split name, and (case-sensitively) contains one
    of the folder prefixes in ``PREFS``.  Among qualifying paths the one in
    the most-preferred folder wins; ties go to the shortest path, then to the
    earliest entry in ``files``.

    Args:
        split: Split name; ``"validation"`` also matches ``"valid"``.

    Returns:
        The selected repo-relative path.

    Raises:
        FileNotFoundError: If no file qualifies.
    """
    # Accept both the 'validation' and the 'valid' spelling
    wanted = ("validation", "valid") if split == "validation" else (split,)

    def folder_rank(path):
        """Position of the first PREFS entry found in ``path`` (None if absent)."""
        for pos, prefix in enumerate(PREFS):
            if prefix in path:
                return pos
        return None

    matches = [
        path
        for path in files
        if path.lower().endswith(".json")
        and "bace" in path.lower()
        and folder_rank(path) is not None
        and any(tag in path.lower() for tag in wanted)
    ]
    if not matches:
        raise FileNotFoundError(f"BACE {split} 파일을 찾지 못함. repo={REPO}")
    # Preferred folder first, then shortest path; min() keeps the first of equals
    return min(matches, key=lambda path: (folder_rank(path), len(path)))

# Repo-relative JSON path chosen for each split; pick_bace_file raises
# FileNotFoundError when a split has no matching file.
paths = {
    "train":      pick_bace_file("train"),
    "validation": pick_bace_file("validation"),
    "test":       pick_bace_file("test"),
}

# 3) Convert JSON of "whatever shape" into list[dict]
def load_json_flex(local_path):
    """Load a JSON or JSON Lines file and return its samples as ``list[dict]``.

    The file is first parsed as a single JSON document; only if that fails is
    it parsed line by line as JSON Lines.  (Trying JSON Lines first would turn
    a one-line document such as ``{"Instances": [...]}`` into a single bogus
    "sample" holding the whole dataset.)

    Supported document shapes, checked in this order:
      * a non-empty list of dicts -> returned as is;
      * a dict with a non-empty ``"Instances"`` list of dicts -> that list;
      * a dict holding any non-empty list of dicts -> the first such list;
      * a dict of columns (values are lists) -> one row dict per index, with
        keys omitted where a column is shorter than the longest one;
      * a dict without any list value -> treated as one record, ``[data]``.

    Args:
        local_path: Path of the file to read.

    Returns:
        A list of sample dicts.

    Raises:
        ValueError: If the content is neither valid JSON nor JSON Lines
            (``json.JSONDecodeError``), or has an unsupported structure.
    """
    with open(local_path, "rb") as f:
        raw = f.read()

    # 3-1) Whole-document JSON first
    try:
        data = json.loads(raw.decode("utf-8"))
    except ValueError:
        # 3-2) Fall back to JSON Lines (one object per non-blank line)
        try:
            objs = [json.loads(ln) for ln in raw.splitlines() if ln.strip()]
        except ValueError:
            objs = None
        if objs and isinstance(objs[0], dict):
            return objs
        # Neither form parsed: surface the whole-document error
        raise

    # list[dict]
    if isinstance(data, list):
        if data and isinstance(data[0], dict):
            return data

    if isinstance(data, dict):
        # Prefer the conventional "Instances" key over other list[dict] values
        instances = data.get("Instances")
        if isinstance(instances, list) and instances and isinstance(instances[0], dict):
            return instances

        # Otherwise the first key that holds a list[dict]
        for v in data.values():
            if isinstance(v, list) and v and isinstance(v[0], dict):
                return v

        # dict of columns (every selected key maps to a list)
        col_keys = [k for k, v in data.items() if isinstance(v, list)]
        if col_keys:
            n = max(len(data[k]) for k in col_keys)
            return [
                {k: data[k][i] for k in col_keys if i < len(data[k])}
                for i in range(n)
            ]

        # Flat dict of scalars: a single record
        return [data]

    raise ValueError(f"지원하지 않는 JSON 구조: {local_path}")

# 4) Pull the SELFIES / label columns out of the samples and standardise them
def to_biot5_bace_df(examples):
    """Build a ``SELFIES`` / ``label`` DataFrame from a list of sample dicts.

    Column names are matched case-insensitively.  The molecule column is the
    first present of ``selfies``, ``<selfies>``, ``input``, ``raw_input``,
    ``x``, ``mol``; the label column is the first present of ``label``,
    ``labels``, ``output``, ``y``, ``target``.

    Rows with a missing molecule or label are dropped *before* labels are
    normalised, so a missing label is never turned into ``"False"``.

    Args:
        examples: list of dicts, one per sample.

    Returns:
        DataFrame with exactly the columns ``SELFIES`` and ``label``; labels
        are the strings ``"True"`` or ``"False"``.

    Raises:
        ValueError: If the molecule or the label column cannot be identified.
    """
    df = pd.DataFrame(examples)
    cols = {c.lower(): c for c in df.columns}

    selfies_col = cols.get("selfies") or cols.get("<selfies>") or cols.get("input") \
                  or cols.get("raw_input") or cols.get("x") or cols.get("mol")
    label_col   = cols.get("label") or cols.get("labels") or cols.get("output") \
                  or cols.get("y") or cols.get("target")

    if selfies_col is None or label_col is None:
        raise ValueError(f"SELFIES/label 컬럼 식별 실패: {list(df.columns)}")

    out = df.rename(columns={selfies_col: "SELFIES", label_col: "label"})[["SELFIES", "label"]]

    # Drop incomplete rows first; afterwards every label is a real value
    out = out.dropna(subset=["SELFIES", "label"]).reset_index(drop=True)

    # Normalise the label to the string "True" / "False"
    def to_tf(v):
        if isinstance(v, str):
            # Tolerate sentence-style answers such as "Yes." / "No."
            t = v.strip().lower().rstrip(".!?,;:").rstrip()
            if t in ("true", "yes", "y", "1"):
                return "True"
            if t in ("false", "no", "n", "0"):
                return "False"
        try:
            return "True" if float(v) > 0 else "False"
        except (TypeError, ValueError, OverflowError):
            return "True" if str(v).strip().lower() == "true" else "False"

    out["label"] = out["label"].map(to_tf)

    # The SELFIES text is left untouched: <SELFIES> tags are neither added nor
    # removed here (per the original note, a later wrap step adds them).
    return out

# 5) Download each file -> parse -> save as CSV
for split, rel_path in paths.items():
    # `local` is the on-disk path handed to load_json_flex
    local = hf_hub_download(REPO, rel_path, repo_type="dataset")
    ex = load_json_flex(local)
    df = to_biot5_bace_df(ex)
    # The "validation" split is written under the shorter name "valid"
    csv_name = f"BioT5_bace_{'valid' if split=='validation' else split}.csv"
    df.to_csv(os.path.join(OUT_DIR, csv_name), index=False)
    print(split, rel_path, "->", csv_name)

print("Done. CSVs saved to:", OUT_DIR)


In [None]:
# Load the train-split CSV from <RAW_DATA_ROOT>/raw.
# NOTE(review): the resolved path depends on which cell last set RAW_DATA_ROOT — confirm.
train_bace = pd.read_csv(os.path.join(RAW_DATA_ROOT, "raw/BioT5_bace_train.csv"))

In [None]:
# Preview the first rows of the loaded train split.
train_bace.head()

In [None]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
# Requests the "test" split of the whole repository through `datasets`.
# NOTE(review): no data_files/config is given — confirm the repo resolves to a usable split.
ds = load_dataset("QizhiPei/BioT5_finetune_dataset", split="test")

In [None]:
# pip install -U huggingface_hub pandas selfies

from huggingface_hub import list_repo_files, hf_hub_download
import pandas as pd
import json, os, re

# ===== Settings =====
REPO = "QizhiPei/BioT5_finetune_dataset"
# CSVs are written directly into this directory (no extra "raw" sub-folder).
OUT_DIR = "/appdataset/train/raw"
os.makedirs(OUT_DIR, exist_ok=True)

# ===== List the repository files =====
files = list_repo_files(REPO, repo_type="dataset")

# Priority: tasks_plus > tasks > splits_plus > splits
PREFS = ["tasks_plus/", "tasks/", "splits_plus/", "splits/"]

def pick_bace_file(split: str) -> str:
    """Return the repo path of the BACE JSON file that best matches ``split``.

    A path qualifies when, case-insensitively, it ends in ``.json``, contains
    ``bace`` and contains the split name (``"validation"`` also matches
    ``"valid"``).  Qualifying paths are ranked by the position of the first
    ``PREFS`` folder they contain (999 when none) and then by path length;
    the first path with the lowest rank is returned.

    Raises:
        FileNotFoundError: If no path qualifies.
    """
    aliases = ("validation", "valid") if split == "validation" else (split,)
    worst = (999, 10**9)  # a score must be strictly better than this to count

    def score(path):
        folder_rank = next((i for i, prefix in enumerate(PREFS) if prefix in path), 999)
        return (folder_rank, len(path))

    def qualifies(path):
        low = path.lower()
        return (
            low.endswith(".json")
            and "bace" in low
            and any(alias in low for alias in aliases)
            and score(path) < worst
        )

    # min() returns the first of equally ranked paths, like a strict "<" scan
    best = min((path for path in files if qualifies(path)), key=score, default=None)
    if best is None:
        raise FileNotFoundError(f"BACE {split} JSON을 찾지 못했습니다. repo={REPO}")
    return best

# Repo-relative JSON path chosen for each split; pick_bace_file raises
# FileNotFoundError when a split has no matching file.
paths = {
    "train"     : pick_bace_file("train"),
    "validation": pick_bace_file("validation"),
    "test"      : pick_bace_file("test"),
}

# ===== Utilities =====
def load_json_flex(local_path: str):
    """Read a JSON / JSON Lines file and return its list of samples.

    The file is first parsed line by line; if every non-blank line is valid
    JSON and the first one is a dict, those objects are used.  Otherwise the
    whole file is parsed as one UTF-8 JSON document.

    The parsed data is then flattened:
      * dict with an ``"Instances"`` list -> that list;
      * dict with some list value that is empty or starts with a dict -> the
        first such list;
      * any other dict -> ``[dict]``;
      * list whose first element is a dict containing ``"Instances"`` -> the
        concatenation of every element's ``"Instances"`` list;
      * any other list -> returned unchanged.

    Raises:
        ValueError: If the parsed data is neither a dict nor a list.
    """
    with open(local_path, "rb") as fh:
        payload = fh.read()

    # 1) Try JSON Lines
    try:
        per_line = [json.loads(chunk) for chunk in payload.splitlines() if chunk.strip()]
    except Exception:
        per_line = None

    if per_line and isinstance(per_line[0], dict):
        # Each line may itself be a dataset-level dict; flattened below
        data = per_line
    else:
        data = json.loads(payload.decode("utf-8"))

    # ---- Flattening ----
    if isinstance(data, dict):
        instances = data.get("Instances")
        if isinstance(instances, list):
            return instances
        # Look for a list[dict] (or empty list) among the values
        for value in data.values():
            if isinstance(value, list) and (not value or isinstance(value[0], dict)):
                return value
        return [data]

    if isinstance(data, list):
        # Dataset-level dicts: gather all of their Instances
        if data and isinstance(data[0], dict) and "Instances" in data[0]:
            return [
                inst
                for entry in data
                if "Instances" in entry and isinstance(entry["Instances"], list)
                for inst in entry["Instances"]
            ]
        # Already a list of samples
        return data

    # Any other type is unsupported
    raise ValueError("지원하지 않는 JSON 구조")

# SELFIES만 뽑기
def extract_selfies(raw: str) -> str | None:
    if pd.isna(raw):
        return None
    s = str(raw)

    m = re.search(r"SELFIES:\s*<bom>(.*)</eom>", s, flags=re.DOTALL)
    if m:
        return m.group(1).strip()
    m = re.search(r"<bom>(.*)</eom>", s, flags=re.DOTALL)
    if m:
        return m.group(1).strip()
    for line in s.splitlines():
        if "SELFIES:" in line:
            line = line.split("SELFIES:", 1)[1].strip()
            line = re.sub(r"^<bom>\s*|\s*</eom>$", "", line).strip()
            return line
    return s.strip()

def normalize_label(v) -> str:
    """Normalise a label value to the string ``"True"`` or ``"False"``.

    Strings are compared case-insensitively after trimming whitespace and
    trailing punctuation, so sentence-style answers such as ``"Yes."`` and
    ``"No."`` are recognised (they used to fall through to ``"False"``).
    Anything else is interpreted numerically (``> 0`` means ``"True"``);
    values that are not numeric are ``"True"`` only if their text is "true".

    Args:
        v: Raw label (string, number, bool, ...).

    Returns:
        ``"True"`` or ``"False"``.
    """
    if isinstance(v, str):
        t = v.strip().lower().rstrip(".!?,;:").rstrip()
        if t in ("true", "yes", "y", "1"):
            return "True"
        if t in ("false", "no", "n", "0"):
            return "False"
    try:
        return "True" if float(v) > 0 else "False"
    except (TypeError, ValueError, OverflowError):
        return "True" if str(v).strip().lower() == "true" else "False"

# Optional: SELFIES validity check
try:
    import selfies as sf
except Exception:
    # `selfies` could not be imported: accept any non-empty string
    def is_valid_selfies(s: str) -> bool:
        """Return True when ``s`` is a non-empty string (no real validation)."""
        return isinstance(s, str) and s != ""
else:
    def is_valid_selfies(s: str) -> bool:
        """Return True when ``sf.decoder`` processes ``s`` without raising."""
        try:
            sf.decoder(s)
        except Exception:
            return False
        return True

def to_biot5_df(examples: list[dict]) -> pd.DataFrame:
    """Normalise sample dicts into a ``SELFIES`` / ``label`` DataFrame.

    For each dict, the input and output fields are located by
    case-insensitive key (with several aliases).  A field that is itself a
    dict is unwrapped one level using the same aliases plus ``text``.  The
    input is passed through ``extract_selfies`` and the output through
    ``normalize_label``; rows whose SELFIES fails ``is_valid_selfies`` are
    removed and a warning with their count is printed.

    Non-dict entries and samples missing either field are skipped.  When no
    row survives, an empty frame with both columns is returned (previously
    this raised ``KeyError`` because the frame had no columns at all).

    Args:
        examples: list of sample dicts.

    Returns:
        DataFrame with the columns ``SELFIES`` and ``label``.
    """
    in_aliases = ("input", "raw_input", "selfies", "x", "mol")
    out_aliases = ("output", "label", "labels", "y", "target", "answer")

    rows = []
    for ex in examples:
        if not isinstance(ex, dict):
            continue
        # Locate the input/output keys (case-insensitive, several aliases)
        kl = {k.lower(): k for k in ex.keys()}
        in_key = next((kl[k] for k in in_aliases if k in kl), None)
        out_key = next((kl[k] for k in out_aliases if k in kl), None)

        # Natural-Instructions style: 'input'/'output' may be nested in a dict
        cand = ex.get(in_key) if in_key else None
        if isinstance(cand, dict):
            in_key2 = next((k for k in cand.keys() if k.lower() in in_aliases + ("text",)), None)
            input_val = cand.get(in_key2)
        else:
            input_val = cand

        cand = ex.get(out_key) if out_key else None
        if isinstance(cand, dict):
            out_key2 = next((k for k in cand.keys() if k.lower() in out_aliases + ("text",)), None)
            output_val = cand.get(out_key2)
        else:
            output_val = cand

        if input_val is None or output_val is None:
            # Could not find both fields: skip the sample
            continue

        # The output may be a list or a dict -> reduce it to one scalar
        if isinstance(output_val, list):
            output_val = output_val[0] if output_val else ""
        if isinstance(output_val, dict):
            output_val = next((v for v in output_val.values() if isinstance(v, (str,int,float,bool))), str(output_val))

        rows.append({"SELFIES": extract_selfies(str(input_val)), "label": normalize_label(output_val)})

    if not rows:
        # Nothing usable: keep the schema so callers and to_csv still work
        return pd.DataFrame(columns=["SELFIES", "label"])

    df = pd.DataFrame(rows)
    # Remove invalid SELFIES
    mask = df["SELFIES"].notna() & df["SELFIES"].map(is_valid_selfies)
    n_bad = (~mask).sum()
    if n_bad:
        print(f"[경고] 유효하지 않은 SELFIES {n_bad}개 제거")
    df = df[mask].reset_index(drop=True)
    return df[["SELFIES","label"]]

# ===== Download -> parse -> save as CSV =====
for split, rel in paths.items():
    local_path = hf_hub_download(REPO, rel, repo_type="dataset")
    ex = load_json_flex(local_path)
    df = to_biot5_df(ex)
    # The "validation" split is written under the shorter name "valid"
    fname = f"BioT5_bace_{'valid' if split=='validation' else split}.csv"
    out_path = os.path.join(OUT_DIR, fname)
    df.to_csv(out_path, index=False)
    print(f"{split:11s} {rel}  ->  {out_path}  (rows={len(df)})")

# Remind the user of the read_csv calls the downstream code is expected to use.
# NOTE(review): these hints assume OUT_DIR equals <raw_data_root>/raw — confirm.
print("완료! 이제 다음 분기가 그대로 동작합니다:")
print(f"pd.read_csv(os.path.join('<raw_data_root>', 'raw', 'BioT5_bace_train.csv'))")
print(f"pd.read_csv(os.path.join('<raw_data_root>', 'raw', 'BioT5_bace_valid.csv'))")
print(f"pd.read_csv(os.path.join('<raw_data_root>', 'raw', 'BioT5_bace_test.csv'))")


In [None]:
# pip install -U huggingface_hub pandas

import os, re, json, textwrap
import pandas as pd
from huggingface_hub import list_repo_files, hf_hub_download

REPO = "QizhiPei/BioT5_finetune_dataset"
RAW_DATA_ROOT = "/appdataset/train"          # adjust to your environment
RAW_DIR = os.path.join(RAW_DATA_ROOT, "raw")
os.makedirs(RAW_DIR, exist_ok=True)

# ─────────────────────────────────────────────────────────────────────────────
# 1) Choose the BACE JSON paths (priority: tasks_plus > tasks > splits_plus > splits)
# ─────────────────────────────────────────────────────────────────────────────
files = list_repo_files(REPO, repo_type="dataset")
PREFS = ["tasks_plus/", "tasks/", "splits_plus/", "splits/"]

def pick_bace_file(split: str) -> str:
    """Pick the best BACE JSON path in ``files`` for the given split.

    Paths must (case-insensitively) end in ``.json`` and contain both
    ``bace`` and the split name; ``"validation"`` also accepts ``"valid"``.
    The winner has the lowest ``(PREFS position, path length)`` score, where
    a path outside every ``PREFS`` folder gets position 999.  Earlier entries
    win ties.

    Raises:
        FileNotFoundError: If nothing matches.
    """
    aliases = ("validation", "valid") if split == "validation" else (split,)
    sentinel = (999, 10**9)  # only scores strictly below this are accepted

    scored = []
    for path in files:
        low = path.lower()
        if not (low.endswith(".json") and "bace" in low):
            continue
        if not any(alias in low for alias in aliases):
            continue
        folder_rank = next((i for i, prefix in enumerate(PREFS) if prefix in path), 999)
        score = (folder_rank, len(path))
        if score < sentinel:
            scored.append((score, path))

    if not scored:
        raise FileNotFoundError(f"[bace] '{split}' 파일을 찾지 못했습니다.")
    # min() keeps the first of equal scores, matching a strict "<" scan
    return min(scored, key=lambda pair: pair[0])[1]

# Repo-relative JSON path per split; the trailing comments show example values.
REL = {
    "train":      pick_bace_file("train"),       # ex) tasks_plus/task31_bace_molnet_train.json
    "validation": pick_bace_file("validation"),  # ex) tasks_plus/task32_bace_molnet_valid.json
    "test":       pick_bace_file("test"),        # ex) tasks_plus/task33_bace_molnet_test.json
}

# ─────────────────────────────────────────────────────────────────────────────
# 2) JSON loading & flattening of Instances
# ─────────────────────────────────────────────────────────────────────────────
def load_instances_any(json_path: str):
    """Read a JSON / JSON Lines file and return its flattened sample list.

    Line-by-line parsing is attempted first and used when it succeeds and the
    first object is a dict; otherwise the file is parsed as one UTF-8 JSON
    document.  A dict yields its ``"Instances"`` list, else its first list
    value that is empty or starts with a dict, else ``[dict]``.  A list of
    dataset-level dicts yields all their ``"Instances"`` concatenated; any
    other list is returned as is.  Other JSON values yield ``[]``.
    """
    with open(json_path, "rb") as fh:
        payload = fh.read()

    # Try JSON Lines
    try:
        per_line = [json.loads(chunk) for chunk in payload.splitlines() if chunk.strip()]
    except Exception:
        per_line = None

    if per_line and isinstance(per_line[0], dict):
        data = per_line
    else:
        data = json.loads(payload.decode("utf-8"))

    # Flatten Instances
    if isinstance(data, dict):
        instances = data.get("Instances")
        if isinstance(instances, list):
            return instances
        for value in data.values():
            if isinstance(value, list) and (not value or isinstance(value[0], dict)):
                return value
        return [data]

    if not isinstance(data, list):
        return []

    if data and isinstance(data[0], dict) and "Instances" in data[0]:
        return [
            inst
            for entry in data
            if "Instances" in entry and isinstance(entry["Instances"], list)
            for inst in entry["Instances"]
        ]
    return data

# ─────────────────────────────────────────────────────────────────────────────
# 3) SELFIES/label 추출 (Mol-LLM에서 바로 쓰는 포맷)
# ─────────────────────────────────────────────────────────────────────────────
SELFIES_RE = re.compile(r"SELFIES:\s*<bom>(.*?)</eom>", re.DOTALL)

def extract_selfies_from_input(s: str) -> str | None:
    """'IUPAC: ...\\nSELFIES: <bom> ... </eom>' 형태에서 SELFIES 본문만 추출"""
    if s is None: return None
    m = SELFIES_RE.search(str(s))
    if m:
        selfies = m.group(1).strip()
        # 내부 개행/따옴표 등은 CSV가 알아서 이스케이프하므로 그대로 둠
        return selfies
    # 백업: <bom>...</eom>만 있는 경우도 대비
    m = re.search(r"<bom>(.*?)</eom>", str(s), re.DOTALL)
    return m.group(1).strip() if m else None

def normalize_label(v) -> str:
    """Map a Yes./No.-style answer to the string "True" or "False".

    A non-empty list is reduced to its first element.  The lower-cased text
    is "True" if it contains "yes" or "true", "False" if it contains "no" or
    "false"; otherwise it is read as a number (> 0 means "True").  Text that
    is not numeric falls back to "False".
    """
    if isinstance(v, list) and v:
        v = v[0]
    text = str(v).strip().lower()

    if any(word in text for word in ("yes", "true")):
        return "True"
    if any(word in text for word in ("no", "false")):
        return "False"

    # Numbers and the like: interpret as 0/1
    try:
        positive = float(text) > 0
    except Exception:
        # Safety net: without a yes/no marker the answer is False
        return "False"
    return "True" if positive else "False"

# ─────────────────────────────────────────────────────────────────────────────
# 4) Keep the original JSON + build the CSVs + preview
# ─────────────────────────────────────────────────────────────────────────────
RAW_KEEP_DIR = os.path.join(RAW_DIR, "biot5_bace")  # root folder for the original JSON copies
os.makedirs(RAW_KEEP_DIR, exist_ok=True)

# Output CSV path per split; "validation" is stored under the name "valid".
OUT_CSV = {
    "train":      os.path.join(RAW_DIR, "BioT5_bace_train.csv"),
    "validation": os.path.join(RAW_DIR, "BioT5_bace_valid.csv"),
    "test":       os.path.join(RAW_DIR, "BioT5_bace_test.csv"),
}

for split, rel in REL.items():
    # 4-1) Download via the HF cache and store a copy of the original file
    cached = hf_hub_download(REPO, rel, repo_type="dataset")
    keep_path = os.path.join(RAW_KEEP_DIR, rel)  # keep the repo folder structure
    os.makedirs(os.path.dirname(keep_path), exist_ok=True)
    # Byte-for-byte copy; the whole file is read into memory at once
    with open(cached, "rb") as src, open(keep_path, "wb") as dst:
        dst.write(src.read())
    print(f"[raw saved] {keep_path}")

    # 4-2) Instances -> SELFIES/label DataFrame
    instances = load_instances_any(keep_path)
    rows = []
    for ex in instances:
        if not isinstance(ex, dict): continue
        # Find the input/output keys (case-insensitive)
        kl = {k.lower(): k for k in ex.keys()}
        in_key  = next((kl[x] for x in ("input","raw_input") if x in kl), None)
        out_key = next((kl[x] for x in ("output","label","labels") if x in kl), None)
        if in_key is None or out_key is None: 
            continue
        selfies = extract_selfies_from_input(ex[in_key])
        label   = normalize_label(ex[out_key])
        # Samples without a <bom>...</eom> SELFIES segment are skipped
        if selfies is None:
            continue
        rows.append({"SELFIES": selfies, "label": label})

    # Explicit columns keep the CSV header even when no row was collected
    df = pd.DataFrame(rows, columns=["SELFIES","label"])
    # NOTE: the conditional is redundant; this is equivalent to OUT_CSV[split]
    out_csv = OUT_CSV["validation" if split=="validation" else split]
    df.to_csv(out_csv, index=False)
    print(f"[csv saved] {out_csv} (rows={len(df)})")

    # 4-3) Print a preview of the first 5 rows only
    print(f"\n[{split}] preview:")
    for i, r in df.head(5).iterrows():
        # Show newlines as a literal "\n" and cap the text at 120 characters
        s = textwrap.shorten(r['SELFIES'].replace("\n","\\n"), width=120, placeholder=" …")
        print(f"  #{i+1}: SELFIES={s} | label={r['label']}")
    print("-"*100)


In [None]:
import json 

# Inspect the top-level keys of one raw BACE task file.
# NOTE(review): hard-coded path under /appdataset/train/raw/tasks_plus — confirm
# the file exists there (other cells in this notebook write elsewhere).
with open('/appdataset/train/raw/tasks_plus/task31_bace_molnet_train.json', 'r') as f:
    data = json.load(f)

data.keys()

In [2]:
from datasets import load_from_disk


# Load the dataset stored at /appdataset/test and display its summary.
test_set = load_from_disk('/appdataset/test')
test_set

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['task', 'x', 'edge_index', 'edge_attr', 'additional_x', 'additional_edge_index', 'additional_edge_attr', 'input_mol_string', 'prompt_text', 'target_text'],
    num_rows: 58757
})

In [3]:
# List the distinct values of the 'task' column in sorted order.
sorted(test_set.unique('task'))

['alchemy_homo',
 'alchemy_homo_lumo_gap',
 'alchemy_lumo',
 'aqsol-logS',
 'bace',
 'chebi-20-mol2text',
 'chebi-20-text2mol',
 'forward_reaction_prediction',
 'orderly-forward_reaction_prediction',
 'orderly-retrosynthesis',
 'presto-forward_reaction_prediction',
 'presto-retrosynthesis',
 'qm9_homo',
 'qm9_homo_lumo_gap',
 'qm9_lumo',
 'reagent_prediction',
 'retrosynthesis',
 'smol-forward_synthesis',
 'smol-molecule_captioning',
 'smol-molecule_generation',
 'smol-property_prediction-bbbp',
 'smol-property_prediction-clintox',
 'smol-property_prediction-esol',
 'smol-property_prediction-hiv',
 'smol-property_prediction-lipo',
 'smol-property_prediction-sider',
 'smol-retrosynthesis']

In [14]:
# Show the first record to inspect its fields.
test_set[0]

{'task': 'alchemy_homo',
 'x': [[5, 0, 4, 5, 3, 0, 2, 0, 0],
  [7, 0, 2, 5, 0, 0, 2, 0, 0],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 0, 4, 5, 2, 0, 2, 0, 1],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 0, 3, 5, 1, 0, 1, 0, 1],
  [5, 0, 3, 5, 0, 0, 1, 0, 1],
  [5, 0, 4, 5, 3, 0, 2, 0, 0]],
 'edge_index': [[0, 1, 1, 2, 2, 3, 2, 7, 3, 4, 3, 5, 4, 5, 5, 6, 6, 7, 7, 8],
  [1, 0, 2, 1, 3, 2, 7, 2, 4, 3, 5, 3, 5, 4, 6, 5, 7, 6, 8, 7]],
 'edge_attr': [[0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [0, 0, 0],
  [0, 0, 0]],
 'additional_x': [[5, 0, 4, 5, 3, 0, 2, 0, 0],
  [7, 0, 2, 5, 0, 0, 2, 0, 0],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 0, 4, 5, 2, 0, 2, 0, 1],
  [5, 2, 4, 5, 1, 0, 2, 0, 1],
  [5, 0, 3, 5, 1, 0, 1, 0, 1],
  [5, 0, 3, 5, 0, 0, 1, 0, 1],
  [5, 0, 

In [15]:
import csv
import re
from tqdm import tqdm
from collections import Counter

# Capture the text after [INST] up to the first line break (an optional <s> may precede it)
inst_pattern = re.compile(r'(?:<s>)?\[INST\]\s*(.*?)\r?\n', re.DOTALL)

instructions = []

for x in tqdm(test_set):
    sample = x.get('prompt_text', '')
    match = inst_pattern.search(sample)
    # Prompts without an "[INST] ... <newline>" segment contribute nothing
    if match:
        inst_text = match.group(1).strip()
        instructions.append(inst_text)

# Number of occurrences of each distinct instruction text
counts = Counter(instructions)

output_path = '/appprompt/prompt.csv'
with open(output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['instruction', 'count'])
    writer.writeheader()
    # One CSV row per distinct instruction
    for inst_text, count in counts.items():
        writer.writerow({'instruction': inst_text, 'count': count})

# The second message reports the number of extracted instructions (with
# duplicates), not the number of CSV rows.
print(f"✅ CSV 파일이 저장되었습니다: {output_path}")
print(f"총 {len(instructions)}개의 instruction이 추출되었습니다.")


100%|██████████| 58757/58757 [00:15<00:00, 3700.82it/s]

✅ CSV 파일이 저장되었습니다: /appprompt/prompt.csv
총 58757개의 instruction이 추출되었습니다.





In [6]:
# Sort the rows in place: by task name, then by descending count.
# list.sort() returns None, so its result must not be assigned; the former
# `result = results.sort(...)` only bound a misleading None.
# NOTE(review): `results` and `output_path` are defined in other cells;
# `results` is presumably a list of dicts with 'task', 'prompt_text' and
# 'count' keys — confirm.
results.sort(key=lambda x: (x['task'], -x['count']))
with open(output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['task', 'prompt_text', 'count'])
    writer.writeheader()
    writer.writerows(results)

print(f"✅ CSV 파일이 저장되었습니다: {output_path}")

✅ CSV 파일이 저장되었습니다: /appprompt/prompt.csv
