In [1]:
from datasets import load_from_disk

# dataset = load_from_disk("/appdataset/train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415")
train_dataset = load_from_disk("/home/jovyan/CHJ/Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_train_3.3M_0415")
validation_dataset = load_from_disk("/home/jovyan/CHJ/Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_validation_3.3M_0415")
test_dataset = load_from_disk("/home/jovyan/CHJ/Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from collections import Counter, defaultdict
import pandas as pd

# 1) 세부 task → 논문에서의 Task group 매핑
TASK_TO_GROUP = {
    # Property Prediction (Regression)
    "qm9_homo": "Property Prediction (Regression)",
    "qm9_homo_lumo_gap": "Property Prediction (Regression)",
    "qm9_lumo": "Property Prediction (Regression)",
    "smol-property_prediction-esol/0": "Property Prediction (Regression)",
    "smol-property_prediction-lipo/0": "Property Prediction (Regression)",
    # /0 없는 버전 (test셋에 이렇게 들어올 수도 있음)
    "smol-property_prediction-esol": "Property Prediction (Regression)",
    "smol-property_prediction-lipo": "Property Prediction (Regression)",

    # Property Prediction (Classification)
    "bace": "Property Prediction (Classification)",
    "smol-property_prediction-bbbp/0": "Property Prediction (Classification)",
    "smol-property_prediction-clintox/0": "Property Prediction (Classification)",
    "smol-property_prediction-hiv/0": "Property Prediction (Classification)",
    "smol-property_prediction-sider/0": "Property Prediction (Classification)",
    # /0 없는 버전
    "smol-property_prediction-bbbp": "Property Prediction (Classification)",
    "smol-property_prediction-clintox": "Property Prediction (Classification)",
    "smol-property_prediction-hiv": "Property Prediction (Classification)",
    "smol-property_prediction-sider": "Property Prediction (Classification)",

    # Forward Reaction Prediction
    "forward_reaction_prediction": "Forward Reaction Prediction",
    "smol-forward_synthesis/0": "Forward Reaction Prediction",
    "smol-forward_synthesis": "Forward Reaction Prediction",

    # Retrosynthesis
    "retrosynthesis": "Retrosynthesis",
    "smol-retrosynthesis/0": "Retrosynthesis",
    "smol-retrosynthesis": "Retrosynthesis",

    # Reagent Prediction
    "reagent_prediction": "Reagent Prediction",

    # Molecule Captioning
    "chebi-20-mol2text/0": "Molecule Captioning",
    "chebi-20-mol2text": "Molecule Captioning",
    "smol-molecule_captioning/0": "Molecule Captioning",
    "smol-molecule_captioning": "Molecule Captioning",

    # Description-Guided Molecule Generation
    "chebi-20-text2mol/0": "Description-Guided Molecule Generation",
    "chebi-20-text2mol": "Description-Guided Molecule Generation",
    "smol-molecule_generation/0": "Description-Guided Molecule Generation",
    "smol-molecule_generation": "Description-Guided Molecule Generation",

    # Name Conversion
    "smol-name_conversion-i2s/0": "Name Conversion",
    "smol-name_conversion-s2i/0": "Name Conversion",
    "smol-name_conversion-i2s": "Name Conversion",
    "smol-name_conversion-s2i": "Name Conversion",
}

# 2) Task group → Data source (논문 표의 Data Sources 열)
GROUP_TO_SOURCE = {
    "Property Prediction (Regression)": "MoleculeNet / QM9 등",
    "Property Prediction (Classification)": "MoleculeNet (BBBP, Clintox, HIV, SIDER, BACE)",
    "Forward Reaction Prediction": "USPTO + SMolInstruct",
    "Retrosynthesis": "USPTO 500MT + SMolInstruct",
    "Reagent Prediction": "USPTO 500K",
    "Molecule Captioning": "ChEBI-20 + SMolInstruct",
    "Description-Guided Molecule Generation": "ChEBI-20 + SMolInstruct",
    "Name Conversion": "PubChem",
}

# 3) 논문 Table 8 순서 그대로 지정
ORDERED_GROUPS = [
    "Property Prediction (Regression)",
    "Property Prediction (Classification)",
    "Forward Reaction Prediction",
    "Retrosynthesis",
    "Reagent Prediction",
    "Molecule Captioning",
    "Description-Guided Molecule Generation",
    "Name Conversion",
]

def group_counts_from_single_dataset(dataset, split_name="test"):
    """
    HF Dataset 하나(test)에 대해서
    task → group 매핑 후 group별 샘플 수를 센다.
    """
    task_counts = Counter(dataset["task"])
    group_counts = defaultdict(int)

    for task, cnt in task_counts.items():
        group = TASK_TO_GROUP.get(task)
        if group is None:
            print(f"[WARN] {split_name}: task '{task}' has no group mapping")
            continue
        group_counts[group] += cnt

    return group_counts

# 4) test 단일 dataset에서 group별 개수 세기
mol_llm_testset = load_from_disk("/home/jovyan/CHJ/Mol-LLM_Custom/checkpoint/mol-llm_testset")
test_group_counts = group_counts_from_single_dataset(mol_llm_testset, "test")

# 5) 논문 Table 8 순서대로 DataFrame 구성 (test + all만)
rows = []
for group in ORDERED_GROUPS:
    test_n = test_group_counts.get(group, 0)

    rows.append({
        "Task": group,
        "Data Sources": GROUP_TO_SOURCE.get(group, ""),
        "#Test": test_n,
        "#All": test_n,  # train/val 없음 -> 전부 test
    })

df = pd.DataFrame(rows)

# 마크다운 표 형태로 출력
print(df.to_markdown(index=False))


[WARN] test: task 'alchemy_homo' has no group mapping
[WARN] test: task 'alchemy_lumo' has no group mapping
[WARN] test: task 'alchemy_homo_lumo_gap' has no group mapping
[WARN] test: task 'aqsol-logS' has no group mapping
[WARN] test: task 'presto-forward_reaction_prediction' has no group mapping
[WARN] test: task 'presto-retrosynthesis' has no group mapping
[WARN] test: task 'orderly-forward_reaction_prediction' has no group mapping
[WARN] test: task 'orderly-retrosynthesis' has no group mapping
| Task                                   | Data Sources                                  |   #Test |   #All |
|:---------------------------------------|:----------------------------------------------|--------:|-------:|
| Property Prediction (Regression)       | MoleculeNet / QM9 등                          |    2519 |   2519 |
| Property Prediction (Classification)   | MoleculeNet (BBBP, Clintox, HIV, SIDER, BACE) |    7460 |   7460 |
| Forward Reaction Prediction            | USPTO + SMolIns

In [7]:
def infer_data_class(task: str) -> str:
    """
    get_dataset() / main()의 dataset_cls 분기를 반영해서
    각 task가 어떤 Dataset class(SMolInstructDataset, ...)로 전처리되었는지 반환.
    """
    base = task.split("/")[0]  # 'smol-xxx/0' -> 'smol-xxx'

    if "smol" in base:
        return "SMolInstructDataset"
    elif base in ["toxcast", "tox21", "qm9_additional_label", "hopv"]:
        return "MoleculeNetDatasetDeepChem"
    elif base in ["chebi-20-mol2text", "chebi-20-text2mol"]:
        return "ChEBIDataset"
    else:
        # 나머지: reagent_prediction, forward_reaction_prediction,
        # retrosynthesis, qm9_homo, qm9_lumo, qm9_homo_lumo_gap, bace 등
        return "MolInstructionDatset"
from collections import Counter
import pandas as pd

def task_level_counts_by_split(dataset, split_name="train"):
    """
    HF Dataset에서 'task' 컬럼 기준으로 샘플 수를 세고,
    각 task에 대해 Task group / Data Source / Data Class까지 붙여서 DataFrame으로 리턴.
    """
    task_counts = Counter(dataset["task"])
    rows = []

    for task, cnt in task_counts.items():
        group = TASK_TO_GROUP.get(task)
        if group is None:
            print(f"[WARN] {split_name}: task '{task}' has no group mapping")
            group = "UNKNOWN"

        data_source = GROUP_TO_SOURCE.get(group, "UNKNOWN")
        data_class  = infer_data_class(task)   # ← 여기서 Data Class 결정

        rows.append({
            "Task": task,             # 유니크 key
            "Task Group": group,
            "Data Sources": data_source,
            "Data Class": data_class,  # 새 컬럼
            f"#{split_name}": cnt,
        })

    return pd.DataFrame(rows)

# split별 task-level 집계
train_task_df = task_level_counts_by_split(train_dataset, split_name="train")
test_task_df  = task_level_counts_by_split(test_dataset,  split_name="test")

# Task / Task Group / Data Sources / Data Class 기준 outer join
df_task = train_task_df.merge(
    test_task_df,
    on=["Task", "Task Group", "Data Sources", "Data Class"],
    how="outer"
).fillna(0)

df_task["#train"] = df_task["#train"].astype(int)
df_task["#test"]  = df_task["#test"].astype(int)
df_task["#all"]   = df_task["#train"] + df_task["#test"]

df_task = df_task[[
    "Task",
    "Task Group",
    "Data Sources",
    "Data Class",   # ← 여기 추가됨
    "#train",
    "#test",
    "#all",
]]


df_task = df_task.sort_values(
    by=["Task Group", "Task"],
    key=lambda col: col.map({g: i for i, g in enumerate(ORDERED_GROUPS)}) if col.name == "Task Group" else col
)
df_task = df_task.sort_values(by=["Task Group", "Task"])
overall_row = {
    "Task": "OVERALL",
    "Task Group": "OVERALL",
    "Data Sources": "",
    "Data Class": "",
    "#train": df_task["#train"].sum(),
    "#test":  df_task["#test"].sum(),
    "#all":   df_task["#all"].sum(),
}
df_task = pd.concat([df_task, pd.DataFrame([overall_row])], ignore_index=True)
print(df_task.to_markdown(index=False))



| Task                               | Task Group                             | Data Sources                                  | Data Class           |   #train |   #test |    #all |
|:-----------------------------------|:---------------------------------------|:----------------------------------------------|:---------------------|---------:|--------:|--------:|
| chebi-20-text2mol/0                | Description-Guided Molecule Generation | ChEBI-20 + SMolInstruct                       | ChEBIDataset         |    25854 |    3300 |   29154 |
| smol-molecule_generation/0         | Description-Guided Molecule Generation | ChEBI-20 + SMolInstruct                       | SMolInstructDataset  |    48532 |    2493 |   51025 |
| forward_reaction_prediction        | Forward Reaction Prediction            | USPTO + SMolInstruct                          | MolInstructionDatset |   121890 |    1000 |  122890 |
| smol-forward_synthesis/0           | Forward Reaction Prediction            | USPTO + SM

In [10]:
# --- Task Group 단위 집계 ---
# OVERALL 행은 제외하고 그룹별 합계 계산
df_group = (
    df_task[df_task["Task Group"] != "OVERALL"]
    .groupby("Task Group", as_index=False)
    .agg({
        "Data Sources": lambda s: "+".join(sorted({x for x in s if x})),  # 같은 그룹 내 source들을 합치기
        "#train": "sum",
        "#test": "sum",
        "#all": "sum",
    })
)

# Task Group 순서를 ORDERED_GROUPS 기준으로 정렬 (있다고 가정)
order_map = {g: i for i, g in enumerate(ORDERED_GROUPS)}
df_group = df_group.sort_values(
    by="Task Group",
    key=lambda col: col.map(order_map).fillna(len(order_map))
)

# 맨 아래에 OVERALL 행 추가
overall_group_row = {
    "Task Group": "OVERALL",
    "Data Sources": "",
    "#train": df_group["#train"].sum(),
    "#test":  df_group["#test"].sum(),
    "#all":   df_group["#all"].sum(),
}
df_group = pd.concat([df_group, pd.DataFrame([overall_group_row])], ignore_index=True)

print(df_group.to_markdown(index=False))


| Task Group                             | Data Sources                                  |   #train |   #test |    #all |
|:---------------------------------------|:----------------------------------------------|---------:|--------:|--------:|
| Property Prediction (Regression)       | MoleculeNet / QM9 등                          |   357276 |    2542 |  359818 |
| Property Prediction (Classification)   | MoleculeNet (BBBP, Clintox, HIV, SIDER, BACE) |    59606 |    7459 |   67065 |
| Forward Reaction Prediction            | USPTO + SMolInstruct                          |  1093241 |    5062 | 1098303 |
| Retrosynthesis                         | USPTO 500MT + SMolInstruct                    |  1064095 |    5156 | 1069251 |
| Reagent Prediction                     | USPTO 500K                                    |   121896 |    1000 |  122896 |
| Molecule Captioning                    | ChEBI-20 + SMolInstruct                       |    74385 |    5838 |   80223 |
| Description-Guided Mole

# Captioning, Molecule generation, Retrosynthesis, Forward Reaction Prediction

train/test 그대로 유지하면서, 각 그룹 안에서만 dedup + train↔test 중복 제거

In [12]:
from datasets import concatenate_datasets
from tqdm import tqdm
from datasets import load_from_disk
import re
import sys
import os

# load_dataset.ipynb 가 utils/ 안에 있으므로 상위 폴더(project root) 추가
sys.path.append(os.path.abspath(".."))
import model.added_tokens as added_tokens
# dataset = load_from_disk("/appdataset/train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415")
train_dataset = load_from_disk("/app/Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_train_3.3M_0415")
validation_dataset = load_from_disk("/app/Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_validation_3.3M_0415")
test_dataset = load_from_disk("/app/Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415")

# 1) Molecule Captioning
CAPTION_TASKS = {
    "chebi-20-mol2text/0",
    "smol-molecule_captioning/0",
}

# 2) Description-Guided Molecule Generation (Text2Mol)
TEXT2MOL_TASKS = {
    "chebi-20-text2mol/0",
    "smol-molecule_generation/0",
}

# 3) Retrosynthesis
RETRO_TASKS = {
    "retrosynthesis",
    "smol-retrosynthesis/0",
}

# 4) Forward Reaction Prediction
FORWARD_TASKS = {
    "forward_reaction_prediction",
    "smol-forward_synthesis/0",
}

def select_by_tasks(dataset, task_set):
    tasks = dataset["task"]
    idxs = [i for i, t in enumerate(tasks) if t in task_set]
    return dataset.select(idxs)

def drop_by_tasks(dataset, task_set):
    tasks = dataset["task"]
    idxs = [i for i, t in enumerate(tasks) if t not in task_set]
    return dataset.select(idxs)

import model.added_tokens as added_tokens
DESC_START, DESC_END = added_tokens.DESCRIPTION  # e.g., "<DESC>", "</DESC>"

def extract_description_from_instruction(instr: str):
    """
    instruction 문자열에서 DESCRIPTION 토큰 사이의 description만 추출.
    못 찾으면 None 반환.
    """
    try:
        s = instr.split(DESC_START, 1)[1]
        s = s.split(DESC_END, 1)[0]
        return s.strip()
    except Exception:
        return None
    
def dedup_group_train_test(train_dataset, test_dataset, task_set, key_fn):
    """
    1) train/test에서 해당 task_set에 속한 샘플만 골라서
    2) train 내부 중복 제거
    3) test 내부 + (train과의 교집합) 제거
    4) 나머지 task들과 다시 합쳐서 새로운 train/test 반환
    """

    # 1) group subset / other subset 분리
    group_train = select_by_tasks(train_dataset, task_set)
    group_test  = select_by_tasks(test_dataset, task_set)

    other_train = drop_by_tasks(train_dataset, task_set)
    other_test  = drop_by_tasks(test_dataset, task_set)

    print(f"[{task_set}] before dedup  train={len(group_train)}, test={len(group_test)}")

    # 2) train dedup
    seen_train = set()
    keep_train_idx = []
    for i in tqdm(range(len(group_train))):
        ex = group_train[i]
        key = key_fn(ex)
        if key is None:
            # 키를 못 만들면 dedup에 안 쓰고 그냥 남겨둔다.
            keep_train_idx.append(i)
            continue
        if key in seen_train:
            continue
        seen_train.add(key)
        keep_train_idx.append(i)
    group_train_clean = group_train.select(keep_train_idx)

    # 3) test dedup (train과의 cross-dup + test 내부 dup 제거)
    seen_test = set()
    keep_test_idx = []
    for i in range(len(group_test)):
        ex = group_test[i]
        key = key_fn(ex)
        if key is None:
            keep_test_idx.append(i)
            continue
        # train에 이미 있는 (입력,출력) 쌍이면 test에서 제외
        if key in seen_train:
            continue
        # test 내부 중복 제거
        if key in seen_test:
            continue
        seen_test.add(key)
        keep_test_idx.append(i)
    group_test_clean = group_test.select(keep_test_idx)

    print(f"[{task_set}] after  dedup  train={len(group_train_clean)}, test={len(group_test_clean)}")

    # 4) 다시 합치기
    new_train = concatenate_datasets([other_train, group_train_clean])
    new_test  = concatenate_datasets([other_test,  group_test_clean])

    return new_train, new_test

def mol_input_label_key(ex):
    return (ex["input_mol_string"], ex["label"])
def text2mol_key(ex):
    desc = extract_description_from_instruction(ex["instruction"])
    if desc is None:
        return None
    return (desc, ex["label"])
# 0) 시작점: 현재 global train_dataset, test_dataset
# (이미 caption/generation/reaction 등이 섞여 있는 상태)

# 1) Molecule Captioning
train_dataset, test_dataset = dedup_group_train_test(
    train_dataset,
    test_dataset,
    CAPTION_TASKS,
    key_fn=mol_input_label_key,
)

# 2) Description-Guided Molecule Generation (Text2Mol)
train_dataset, test_dataset = dedup_group_train_test(
    train_dataset,
    test_dataset,
    TEXT2MOL_TASKS,
    key_fn=text2mol_key,
)

# 3) Retrosynthesis
train_dataset, test_dataset = dedup_group_train_test(
    train_dataset,
    test_dataset,
    RETRO_TASKS,
    key_fn=mol_input_label_key,
)

# 4) Forward Reaction Prediction
train_dataset, test_dataset = dedup_group_train_test(
    train_dataset,
    test_dataset,
    FORWARD_TASKS,
    key_fn=mol_input_label_key,
)

print("FINAL train size:", len(train_dataset))
print("FINAL test  size:", len(test_dataset))

import os

# =========================================================
# 5) 저장 경로 설정
# =========================================================
base_path = "/app/Mol-LLM_Custom/dataset/real_train"

# 파일명 템플릿
train_path = os.path.join(base_path, "mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_train_3.3M_0415_custom")
test_path  = os.path.join(base_path, "mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_custom")
val_path   = os.path.join(base_path, "mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_validation_3.3M_0415_custom")

print(f"--- Saving datasets to {base_path} ---")

# =========================================================
# 6) 데이터셋 저장 (Save to Disk)
# =========================================================

# 1. Train Dataset 저장
print(f"Saving TRAIN dataset ({len(train_dataset)} samples)...")
train_dataset.save_to_disk(train_path)

# 2. Test Dataset 저장
print(f"Saving TEST dataset ({len(test_dataset)} samples)...")
test_dataset.save_to_disk(test_path)

# 3. Validation Dataset 저장 
# (요청하신 대로 validation과 test는 같은 데이터셋이므로 처리된 test_dataset을 저장)
print(f"Saving VALIDATION dataset (Copy of Test, {len(test_dataset)} samples)...")
test_dataset.save_to_disk(val_path)

print("\nAll datasets have been successfully saved!")

# =========================================================
# 7) (Optional) 최종 데이터 분포 검증
#    제공해주신 TASK_TO_GROUP을 사용하여 잘 합쳐졌는지 확인
# =========================================================
from collections import Counter

def print_distribution(dataset, name="Dataset"):
    print(f"\n[{name} Distribution]")
    task_counts = Counter(dataset['task'])
    
    # 그룹별 카운팅을 위한 초기화
    group_counts = {group: 0 for group in ORDERED_GROUPS}
    unknown_tasks = 0
    
    for task_name, count in task_counts.items():
        if task_name in TASK_TO_GROUP:
            group_name = TASK_TO_GROUP[task_name]
            # ORDERED_GROUPS에 있는 경우만 집계 (혹은 별도 처리)
            if group_name in group_counts:
                group_counts[group_name] += count
        else:
            # 매핑에 없는 Task가 있다면
            unknown_tasks += count

    # 출력
    total = 0
    for group in ORDERED_GROUPS:
        cnt = group_counts[group]
        total += cnt
        print(f"  - {group}: {cnt}")
    
    if unknown_tasks > 0:
        print(f"  - (Unknown/Other Tasks): {unknown_tasks}")
        total += unknown_tasks
        
    print(f"  => Total: {total}")

# 분포 확인 실행
print_distribution(train_dataset, "Final Custom TRAIN")
print_distribution(test_dataset, "Final Custom TEST")

[{'chebi-20-mol2text/0', 'smol-molecule_captioning/0'}] before dedup  train=74385, test=5838


100%|██████████| 74385/74385 [00:33<00:00, 2223.55it/s]


[{'chebi-20-mol2text/0', 'smol-molecule_captioning/0'}] after  dedup  train=52994, test=4088
[{'smol-molecule_generation/0', 'chebi-20-text2mol/0'}] before dedup  train=74386, test=5793


100%|██████████| 74386/74386 [00:05<00:00, 12526.82it/s]


[{'smol-molecule_generation/0', 'chebi-20-text2mol/0'}] after  dedup  train=52995, test=4056
[{'smol-retrosynthesis/0', 'retrosynthesis'}] before dedup  train=1064095, test=5156


100%|██████████| 1064095/1064095 [06:33<00:00, 2702.12it/s]


[{'smol-retrosynthesis/0', 'retrosynthesis'}] after  dedup  train=976951, test=5155
[{'smol-forward_synthesis/0', 'forward_reaction_prediction'}] before dedup  train=1093241, test=5062


100%|██████████| 1093241/1093241 [10:05<00:00, 1804.60it/s]


[{'smol-forward_synthesis/0', 'forward_reaction_prediction'}] after  dedup  train=1079537, test=5062
FINAL train size: 3301035
FINAL test  size: 29362
--- Saving datasets to /app/Mol-LLM_Custom/dataset/real_train ---
Saving TRAIN dataset (3301035 samples)...


Saving the dataset (64/64 shards): 100%|██████████| 3301035/3301035 [01:38<00:00, 33559.30 examples/s]


Saving TEST dataset (29362 samples)...


Saving the dataset (1/1 shards): 100%|██████████| 29362/29362 [00:00<00:00, 39057.13 examples/s]


Saving VALIDATION dataset (Copy of Test, 29362 samples)...


Saving the dataset (1/1 shards): 100%|██████████| 29362/29362 [00:00<00:00, 38911.59 examples/s]



All datasets have been successfully saved!

[Final Custom TRAIN Distribution]
  - Property Prediction (Regression): 357276
  - Property Prediction (Classification): 59606
  - Forward Reaction Prediction: 1079537
  - Retrosynthesis: 976951
  - Reagent Prediction: 121896
  - Molecule Captioning: 52994
  - Description-Guided Molecule Generation: 52995
  - Name Conversion: 599780
  => Total: 3301035

[Final Custom TEST Distribution]
  - Property Prediction (Regression): 2542
  - Property Prediction (Classification): 7459
  - Forward Reaction Prediction: 5062
  - Retrosynthesis: 5155
  - Reagent Prediction: 1000
  - Molecule Captioning: 4088
  - Description-Guided Molecule Generation: 4056
  - Name Conversion: 0
  => Total: 29362


# Mol-LLM Test Set

In [5]:
from datasets import load_from_disk
from collections import Counter, defaultdict
import pandas as pd
import numpy as np

mol_llm_testset = load_from_disk("/app/Mol-LLM_Custom/checkpoint/mol-llm_testset")

# 1) Task → Task Group 매핑
TASK_TO_GROUP = {
    # Regression
    "qm9_homo": "Property Prediction (Regression)",
    "qm9_homo_lumo_gap": "Property Prediction (Regression)",
    "qm9_lumo": "Property Prediction (Regression)",
    "smol-property_prediction-esol/0": "Property Prediction (Regression)",
    "smol-property_prediction-lipo/0": "Property Prediction (Regression)",
    "smol-property_prediction-esol": "Property Prediction (Regression)",
    "smol-property_prediction-lipo": "Property Prediction (Regression)",

    # Classification
    "bace": "Property Prediction (Classification)",
    "smol-property_prediction-bbbp/0": "Property Prediction (Classification)",
    "smol-property_prediction-clintox/0": "Property Prediction (Classification)",
    "smol-property_prediction-hiv/0": "Property Prediction (Classification)",
    "smol-property_prediction-sider/0": "Property Prediction (Classification)",
    "smol-property_prediction-bbbp": "Property Prediction (Classification)",
    "smol-property_prediction-clintox": "Property Prediction (Classification)",
    "smol-property_prediction-hiv": "Property Prediction (Classification)",
    "smol-property_prediction-sider": "Property Prediction (Classification)",

    # Forward reaction
    "forward_reaction_prediction": "Forward Reaction Prediction",
    "smol-forward_synthesis/0": "Forward Reaction Prediction",
    "smol-forward_synthesis": "Forward Reaction Prediction",

    # Retro
    "retrosynthesis": "Retrosynthesis",
    "smol-retrosynthesis/0": "Retrosynthesis",
    "smol-retrosynthesis": "Retrosynthesis",

    # Reagent
    "reagent_prediction": "Reagent Prediction",

    # Captioning
    "chebi-20-mol2text/0": "Molecule Captioning",
    "chebi-20-mol2text": "Molecule Captioning",
    "smol-molecule_captioning/0": "Molecule Captioning",
    "smol-molecule_captioning": "Molecule Captioning",

    # Text2Mol
    "chebi-20-text2mol/0": "Description-Guided Molecule Generation",
    "chebi-20-text2mol": "Description-Guided Molecule Generation",
    "smol-molecule_generation/0": "Description-Guided Molecule Generation",
    "smol-molecule_generation": "Description-Guided Molecule Generation",

    # NameConv
    "smol-name_conversion-i2s/0": "Name Conversion",
    "smol-name_conversion-s2i/0": "Name Conversion",
    "smol-name_conversion-i2s": "Name Conversion",
    "smol-name_conversion-s2i": "Name Conversion",
}

# 2) Task → DataClass
TASK_TO_DATA_CLASS = {
    # MolInstruction
    "qm9_homo": "MolInstructionDatset",
    "qm9_homo_lumo_gap": "MolInstructionDatset",
    "qm9_lumo": "MolInstructionDatset",
    "bace": "MolInstructionDatset",
    "forward_reaction_prediction": "MolInstructionDatset",
    "retrosynthesis": "MolInstructionDatset",
    "reagent_prediction": "MolInstructionDatset",

    # SMol
    "smol-property_prediction-esol/0": "SMolInstructDataset",
    "smol-property_prediction-lipo/0": "SMolInstructDataset",
    "smol-property_prediction-bbbp/0": "SMolInstructDataset",
    "smol-property_prediction-clintox/0": "SMolInstructDataset",
    "smol-property_prediction-hiv/0": "SMolInstructDataset",
    "smol-property_prediction-sider/0": "SMolInstructDataset",
    "smol-forward_synthesis/0": "SMolInstructDataset",
    "smol-retrosynthesis/0": "SMolInstructDataset",
    "smol-molecule_captioning/0": "SMolInstructDataset",
    "smol-molecule_generation/0": "SMolInstructDataset",
    "smol-name_conversion-i2s/0": "SMolInstructDataset",
    "smol-name_conversion-s2i/0": "SMolInstructDataset",

    # /0 없는 애들
    "smol-property_prediction-esol": "SMolInstructDataset",
    "smol-property_prediction-lipo": "SMolInstructDataset",
    "smol-property_prediction-bbbp": "SMolInstructDataset",
    "smol-property_prediction-clintox": "SMolInstructDataset",
    "smol-property_prediction-hiv": "SMolInstructDataset",
    "smol-property_prediction-sider": "SMolInstructDataset",
    "smol-forward_synthesis": "SMolInstructDataset",
    "smol-retrosynthesis": "SMolInstructDataset",
    "smol-molecule_captioning": "SMolInstructDataset",
    "smol-molecule_generation": "SMolInstructDataset",
    "smol-name_conversion-i2s": "SMolInstructDataset",
    "smol-name_conversion-s2i": "SMolInstructDataset",

    # ChEBI
    "chebi-20-mol2text/0": "ChEBIDataset",
    "chebi-20-text2mol/0": "ChEBIDataset",
    "chebi-20-mol2text": "ChEBIDataset",
    "chebi-20-text2mol": "ChEBIDataset",
}
GROUP_TO_SOURCE = {
    "Property Prediction (Regression)": "MoleculeNet / QM9 등",
    "Property Prediction (Classification)": "MoleculeNet (BBBP, Clintox, HIV, SIDER, BACE)",
    "Forward Reaction Prediction": "USPTO + SMolInstruct",
    "Retrosynthesis": "USPTO 500MT + SMolInstruct",
    "Reagent Prediction": "USPTO 500K",
    "Molecule Captioning": "ChEBI-20 + SMolInstruct",
    "Description-Guided Molecule Generation": "ChEBI-20 + SMolInstruct",
    "Name Conversion": "PubChem",
}

ORDERED_GROUPS = [
    "Property Prediction (Regression)",
    "Property Prediction (Classification)",
    "Forward Reaction Prediction",
    "Retrosynthesis",
    "Reagent Prediction",
    "Molecule Captioning",
    "Description-Guided Molecule Generation",
    "Name Conversion",
]
from collections import Counter
import pandas as pd

def build_task_table_only_test(test_dataset):
    test_counts = Counter(test_dataset["task"])
    rows = []

    for task, count in test_counts.items():
        if task not in TASK_TO_GROUP:
            continue

        rows.append({
            "Task": task,
            "Task Group": TASK_TO_GROUP[task],
            "Data Class": TASK_TO_DATA_CLASS.get(task, "Unknown"),
            "#test": count,
            "#all": count,   # train 없음 → test = all
        })

    df = pd.DataFrame(rows)
    return df.sort_values(["Task Group", "Task"])
def build_group_table_only_test(test_dataset):
    test_counts = Counter(test_dataset["task"])
    group_counts = {g: 0 for g in ORDERED_GROUPS}

    for task, cnt in test_counts.items():
        group = TASK_TO_GROUP.get(task)
        if group is None:
            continue
        group_counts[group] += cnt

    rows = []
    for g in ORDERED_GROUPS:
        rows.append({
            "Task Group": g,
            "Data Sources": GROUP_TO_SOURCE[g],
            "#Test": group_counts[g],
            "#All": group_counts[g],
        })

    df = pd.DataFrame(rows)
    return df


In [7]:
df_task = build_task_table_only_test(mol_llm_testset).sort_values(by=["Task Group", "Task"])
df_group = build_group_table_only_test(mol_llm_testset).sort_values(by=["Task Group"])

print(df_task.to_markdown(index=False))
print(df_group.to_markdown(index=False))


| Task                             | Task Group                             | Data Class           |   #test |   #all |
|:---------------------------------|:---------------------------------------|:---------------------|--------:|-------:|
| chebi-20-text2mol                | Description-Guided Molecule Generation | ChEBIDataset         |    3300 |   3300 |
| smol-molecule_generation         | Description-Guided Molecule Generation | SMolInstructDataset  |    2493 |   2493 |
| forward_reaction_prediction      | Forward Reaction Prediction            | MolInstructionDatset |    1000 |   1000 |
| smol-forward_synthesis           | Forward Reaction Prediction            | SMolInstructDataset  |    4062 |   4062 |
| chebi-20-mol2text                | Molecule Captioning                    | ChEBIDataset         |    3300 |   3300 |
| smol-molecule_captioning         | Molecule Captioning                    | SMolInstructDataset  |    2538 |   2538 |
| bace                             | Pro

# Train/Test set leakage 확인

In [1]:
import torch

torch.cuda.is_available()

False