In [None]:
import random
from datasets import load_from_disk, Dataset

# 1. Configuration: path of the dataset to transform (using the original
# author's dataset is recommended -> row order is guaranteed).
# Enter the path of the author's dataset here.
dataset_path = (
    "/home/jovyan/CHJ/Mol-LLM_Custom/dataset/real_train/"
    "mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_writer_bace"
)
# Base output path: the input path tagged as "template switched".
save_path_base = dataset_path + "_template_switched"

# 2. Template list definition.
# Instruction templates for the BACE-1 activity task. Every template contains
# exactly one "<INPUT>" placeholder, which is later replaced by the molecule
# string followed by the graph tokens.
#
# The list is bound to `bace_templates` because that is the name
# generate_new_prompt() reads; binding it only to `bace` left that name
# undefined and raised NameError on a fresh kernel.
bace_templates = [
    "<INPUT> Based on the molecule given above, suggest the biological activity against BACE-1.",
    "Based on the given molecule: <INPUT>, what biological activity could potentially be observed against BACE-1?",
    "Given the following molecule, please provide the biological activity against BACE-1. <INPUT>",
    "<INPUT> Given the above molecule, what could be the probable biological activity against BACE-1?",
    "Please provide the biological activity value for this molecule against BACE-1: <INPUT>.",
    "Consider that for this molecule, if <INPUT> is given, what is the biological activity against BACE-1?",
    "Propose the biological activity value given this molecule against BACE-1. <INPUT>",
    "Predict the biological activity of the molecule <INPUT> against BACE-1.",
    "Can you tell me the biological activity of the molecule that uses <INPUT> against BACE-1?",
    "Using <INPUT> as the molecule, tell me the biological activity against BACE-1.",
    "Predict the possible biological activity against BACE-1 for the listed molecule. <INPUT>",
    "<INPUT> Considering the given molecule, what might be the biological activity against BACE-1?",
    "A molecule <INPUT> is given; what could be the biological activity against BACE-1?",
]

# Backward-compatible alias for code that still refers to the old name.
bace = bace_templates


# 3. Required prompt components (restored from earlier logs).
# System prompt (must match exactly, including line breaks).
system_prompt = (
    "You are a helpful assistant for molecular chemistry, "
    "to address tasks including molecular property classification, "
    "molecular property regression, chemical reaction prediction, "
    "molecule captioning, molecule generation."
)

# Graph tokens (very important): 32 "<mol>" placeholders wrapped in
# <GRAPH> ... </GRAPH>.
graph_tokens = f"<GRAPH>{'<mol>' * 32}</GRAPH>"


def generate_new_prompt(example, template_idx=None):
    """Rebuild ``example['prompt_text']`` from a BACE instruction template.

    Args:
        example: A dataset row; it must provide the ``'input_mol_string'``
            column. The row is modified in place.
        template_idx: Index into ``bace_templates`` that pins a specific
            template (specific-template test). When ``None``, a template is
            drawn with ``random.choice`` (general-performance test).

    Returns:
        The same ``example`` object with ``'prompt_text'`` set to the newly
        assembled prompt.
    """
    # Choose the template: random when no index is given, fixed otherwise.
    chosen = (
        random.choice(bace_templates)
        if template_idx is None
        else bace_templates[template_idx]
    )

    # The "<INPUT>" tag is replaced by the molecule string immediately
    # followed by the graph tokens.
    instruction = chosen.replace(
        "<INPUT>", example['input_mol_string'] + graph_tokens
    )

    # Assemble the whole prompt in the layout
    # "<s>[INST] System \n\n Instruction [/INST]".
    example['prompt_text'] = "<s>[INST] {} \n\n{} [/INST]".format(
        system_prompt, instruction
    )
    return example

# --- Execution ---

# Load the source dataset from disk.
ds = load_from_disk(dataset_path)
print(f"Loaded dataset size: {len(ds)}")

# [Option A] Use this to test with randomly mixed templates (the general case).
# ds_new = ds.map(generate_new_prompt, fn_kwargs={"template_idx": None})
# suffix = "_random"

# [Option B] Use this to test with one fixed template (e.g. index 0).
# Recommended, because it keeps the template as a controlled variable.
target_idx = 0  # use template 0
# fn_kwargs forwards template_idx to generate_new_prompt for every row.
ds_new = ds.map(generate_new_prompt, fn_kwargs={"template_idx": target_idx})
suffix = f"_fixed_{target_idx}"

# Save the rewritten dataset under the base path plus the run suffix.
final_path = f"{save_path_base}{suffix}"
ds_new.save_to_disk(final_path)

print(f"\n[Done] New dataset saved to: {final_path}")
print("[Example Prompt Preview]")
first_row = ds_new[0]
print(first_row['prompt_text'])

  from .autonotebook import tqdm as notebook_tqdm


Loaded dataset size: 152


Map: 100%|██████████| 152/152 [00:00<00:00, 4981.67 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 152/152 [00:00<00:00, 16167.53 examples/s]


[Done] New dataset saved to: /home/jovyan/CHJ/Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_writer_bace_template_switched_fixed_0
[Example Prompt Preview]
<s>[INST] You are a helpful assistant for molecular chemistry, to address tasks including molecular property classification, molecular property regression, chemical reaction prediction, molecule captioning, molecule generation. 

<SELFIES> [C][C][Branch1][C][C][Branch1][C][C][C][C][=C][C][Branch2][Branch1][=N][C][NH2+1][C][C][S][=Branch1][C][=O][=Branch1][C][=O][C][C][Branch2][Ring2][Branch2][C][C][=C][C][Branch1][C][F][=C][Branch1][C][N][C][Branch2][Ring1][Branch1][O][C][Branch1][=Branch2][C][Branch1][C][F][Branch1][C][F][F][C][Branch1][C][F][Branch1][C][F][F][=C][Ring2][Ring1][C][C][Ring2][Ring1][O][O][=N][O][Ring2][Ring2][Ring1] </SELFIES><GRAPH><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><mol><m


