In [3]:
import pandas as pd
from datasets import Dataset, load_from_disk

def compare_sorted_datasets(dataset1, dataset2, sort_key='input_mol_string', name1="Author", name2="Mine", max_mismatches=5):
    """Sort two datasets by ``sort_key`` and report row-by-row differences.

    Both datasets are sorted by ``sort_key`` and then walked in lockstep.
    For every position the ``sort_key`` values are compared first; when they
    agree, the label column is compared as well. Findings are printed to
    stdout and nothing is returned.

    Args:
        dataset1, dataset2: Dataset-like objects exposing ``sort(column)``,
            ``len()``, ``column_names`` and column access via ``ds[column]``
            (e.g. Hugging Face ``datasets.Dataset``).
        sort_key: Column used both for sorting and as the identity of a row.
        name1, name2: Display names used in the printed report.
        max_mismatches: Stop after this many mismatches have been reported
            (at least one mismatch is always reported if any exists).

    Returns:
        None. A length mismatch is reported and the comparison is skipped.

    NOTE(review): rows sharing the same ``sort_key`` value may end up in a
    different relative order in the two sorted datasets, which would surface
    as label mismatches -- confirm that keys are unique before trusting a
    "label mismatch" report.
    """
    print(f"\nSorting datasets by '{sort_key}' and comparing...")

    # 1. Sort. ``sort`` returns a new dataset; the inputs are left untouched.
    ds1_sorted = dataset1.sort(sort_key)
    ds2_sorted = dataset2.sort(sort_key)

    # Row-wise comparison is only meaningful for equally sized datasets.
    if len(ds1_sorted) != len(ds2_sorted):
        print(f"ERROR: Length mismatch! {name1}: {len(ds1_sorted)} vs {name2}: {len(ds2_sorted)}")
        return

    mismatch_count = 0
    n_rows = len(ds1_sorted)

    if n_rows:
        # Label column: 'output' when present, otherwise the last column of
        # dataset1. Resolved once because it does not change between rows.
        columns = ds1_sorted.column_names
        target_col = 'output' if 'output' in columns else columns[-1]

        # Read only the two columns that are compared, once, instead of
        # materialising every field of every row inside the loop.
        keys1, keys2 = ds1_sorted[sort_key], ds2_sorted[sort_key]
        labels1, labels2 = ds1_sorted[target_col], ds2_sorted[target_col]

        # 2. Sequential comparison.
        for i in range(n_rows):
            mol1 = str(keys1[i]).strip()
            mol2 = str(keys2[i]).strip()

            # Different keys at the same sorted position: the two datasets
            # are not made of the same set of rows.
            if mol1 != mol2:
                print(f"\n[Index {i} - Mismatch Molecule]")
                print(f"  {name1}: {mol1}")
                print(f"  {name2}: {mol2}")
                mismatch_count += 1
                if mismatch_count >= max_mismatches:
                    break
                continue

            out1 = str(labels1[i]).strip()
            out2 = str(labels2[i]).strip()

            # Same key but a different label is the serious case.
            if out1 != out2:
                print(f"\n[Index {i} - Mismatch Label for Same Molecule]")
                print(f"  Molecule: {mol1}")
                print(f"  {name1} Label: {out1}")
                print(f"  {name2} Label: {out2}")
                mismatch_count += 1
                if mismatch_count >= max_mismatches:
                    break

    if mismatch_count == 0:
        print("\nSuccess! Both datasets contain exactly the same data (content-wise).")
    else:
        print("\nFound mismatches. The datasets contain different data.")

# --- Run ---
# ds_A / ds_B are the two datasets loaded from disk that get compared.
ds_A = load_from_disk(
    "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_download_InstructGraph_bace"
)
ds_B = load_from_disk(
    "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_writier_bace"
)
compare_sorted_datasets(
    ds_A,
    ds_B,
    sort_key='input_mol_string',
    name1="Old_Dataset",
    name2="New_Dataset",
)


Sorting datasets by 'input_mol_string' and comparing...

Success! Both datasets contain exactly the same data (content-wise).


In [None]:
import pandas as pd
from datasets import Dataset, load_from_disk

# 1. Load both datasets from disk.
path_A = "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_download_InstructGraph_bace"
path_B = "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_writier_bace"

ds_A = load_from_disk(path_A)
ds_B = load_from_disk(path_B)

print(f"Original ds_A size: {len(ds_A)}")
print(f"Original ds_B size: {len(ds_B)}")

# 2. Build a {molecule string: prompt text} lookup from ds_B.
# Keying on 'input_mol_string' lets a prompt be found regardless of the row
# order of the two datasets. If a molecule string occurs more than once in
# ds_B, the last occurrence wins.
mol_to_prompt_map = {
    mol: prompt
    for mol, prompt in zip(ds_B['input_mol_string'], ds_B['prompt_text'])
}

# 3. Row-level function that swaps in the prompt_text looked up by molecule.
def update_prompt_from_B(example):
    """Replace ``example['prompt_text']`` with the prompt mapped to its molecule.

    The molecule string in ``example['input_mol_string']`` is looked up in the
    module-level ``mol_to_prompt_map``. When it is not present, the example is
    returned unchanged (no error is raised). The example is modified in place
    and also returned.
    """
    molecule = example['input_mol_string']

    # No prompt known for this molecule: keep whatever the row already has.
    if molecule not in mol_to_prompt_map:
        return example

    example['prompt_text'] = mol_to_prompt_map[molecule]
    return example

# 4. Apply the mapping (the row order of ds_A is kept; only prompt_text changes).
print("Updating prompt_text in ds_A from ds_B...")
ds_A_switched = ds_A.map(update_prompt_from_B)

# 5. Save the result next to the source dataset.
save_path = "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_download_InstructGraph_bace_switched"
ds_A_switched.save_to_disk(save_path)

print(f"Saved switched dataset to: {save_path}")

# --- Verification ---
# Read the molecule column once instead of decoding whole rows.
switched_mols = ds_A_switched['input_mol_string']

print("\n[Verification] Comparing first 3 rows:")
# Bounded by the dataset size so a dataset with fewer than 3 rows does not
# raise IndexError.
for i in range(min(3, len(ds_A_switched))):
    # Only a 30-character preview is printed; the trailing "..." marks the cut.
    print(f"Row {i} - Mol: {switched_mols[i][:30]}...")

# update_prompt_from_B keeps the original prompt without any error when a
# molecule is missing from mol_to_prompt_map, so report how many rows were
# left untouched. Comparing against ds_B[i] would be wrong here because the
# two datasets are not in the same row order.
unmatched_rows = sum(1 for mol in switched_mols if mol not in mol_to_prompt_map)
print(f"[Verification] Rows without a matching molecule in ds_B: {unmatched_rows}")

Original ds_A size: 152
Original ds_B size: 152
Updating prompt_text in ds_A from ds_B...


Map: 100%|██████████| 152/152 [00:00<00:00, 4640.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 152/152 [00:00<00:00, 15551.89 examples/s]

Saved switched dataset to: Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_download_InstructGraph_bace_switched

[Verification] Comparing first 3 rows:
Row 0 - Mol: <SELFIES> [C][C][=C][C][=C][C]...
Row 1 - Mol: <SELFIES> [C][C][Branch1][C][C...
Row 2 - Mol: <SELFIES> [C][O][C][C][=Branch...





In [2]:
import pandas as pd
from datasets import Dataset, load_from_disk

# 1. Load both datasets from disk.
# NOTE(review): path_B ends in "_writer_bace" here, while the other cells of
# this notebook load "..._writier_bace" -- confirm which directory name is the
# intended one.
path_A = "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_download_InstructGraph_bace"
path_B = "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_writer_bace"

ds_A = load_from_disk(path_A)
ds_B = load_from_disk(path_B)

print(f"Original ds_A size: {len(ds_A)}")
print(f"Original ds_B size: {len(ds_B)}")

# 2. Build a {molecule string: prompt text} lookup from ds_B.
# Keying on 'input_mol_string' lets a prompt be found regardless of the row
# order of the two datasets. If a molecule string occurs more than once in
# ds_B, the last occurrence wins.
mol_to_prompt_map = {
    mol: prompt
    for mol, prompt in zip(ds_B['input_mol_string'], ds_B['prompt_text'])
}

Original ds_A size: 152
Original ds_B size: 152


In [14]:
# Keep only the ds_A rows whose molecule string equals this exact SELFIES.
filtered_ds = ds_A.filter(
    lambda row: row['input_mol_string'] == '<SELFIES> [C][C][Branch1][C][C][Branch1][C][C][C][C][=C][C][Branch2][Branch1][=N][C][NH2+1][C][C][S][=Branch1][C][=O][=Branch1][C][=O][C][C][Branch2][Ring2][Branch2][C][C][=C][C][Branch1][C][F][=C][Branch1][C][N][C][Branch2][Ring1][Branch1][O][C][Branch1][=Branch2][C][Branch1][C][F][Branch1][C][F][F][C][Branch1][C][F][Branch1][C][F][F][=C][Ring2][Ring1][C][C][Ring2][Ring1][O][O][=N][O][Ring2][Ring2][Ring1] </SELFIES>'
)

# Dump every field of the first matching row, ordered by field name.
# Raises IndexError when no row matched.
print(sorted(filtered_ds[0].items()))

# Peek at one (molecule, prompt) entry of the lookup table, then stop.
for key, value in mol_to_prompt_map.items():
    print(key, value, sep="\n")
    break

[('additional_edge_attr', [[0, 0, 0], [0, 0, 0]]), ('additional_edge_index', [[0, 1], [1, 0]]), ('additional_x', [[5, 0, 4, 5, 3, 0, 2, 0, 0], [5, 0, 4, 5, 3, 0, 2, 0, 0]]), ('edge_attr', [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [3, 0, 1], [3, 0, 1], [3, 0, 1], [3, 0, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [3, 0, 1], [3, 0, 1], [3, 0, 1], [3, 0, 1], [0, 0, 0], [0, 0, 0], [3, 0, 1], [3, 0, 1], [0, 0, 1], [0, 0, 1], [3, 0, 1], [3, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [3, 0, 1], [3, 0, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0

In [5]:
# Reload the dataset written by the prompt-switching cell to inspect what was saved.
ds_switched = load_from_disk(
    "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_download_InstructGraph_bace_switched"
)

In [6]:
# Display the first row of the reloaded dataset with all of its fields
# (the output is long because it includes the graph feature arrays).
ds_switched[0]

{'x': [[5, 0, 4, 5, 3, 0, 2, 0, 0],
  [5, 0, 3, 5, 0, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 0, 0, 1, 1, 1],
  [5, 0, 3, 5, 0, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 0, 0, 1, 1, 1],
  [6, 0, 2, 5, 0, 0, 1, 1, 1],
  [5, 0, 3, 5, 0, 0, 1, 1, 1],
  [6, 0, 3, 5, 2, 0, 1, 0, 0],
  [5, 0, 3, 5, 0, 0, 1, 1, 1],
  [5, 0, 4, 5, 2, 0, 2, 0, 0],
  [5, 0, 4, 5, 1, 0, 2, 0, 0],
  [5, 0, 4, 5, 3, 0, 2, 0, 0],
  [5, 0, 3, 5, 0, 0, 1, 0, 0],
  [7, 0, 1, 5, 0, 0, 1, 0, 0],
  [6, 0, 3, 5, 1, 0, 1, 0, 0],
  [5, 0, 4, 5, 2, 0, 2, 0, 0],
  [5, 0, 4, 5, 1, 0, 2, 0, 1],
  [5, 0, 4, 5, 2, 0, 2, 0, 1],
  [5, 0, 4, 5, 2, 0, 2, 0, 1],
  [7, 0, 2, 5, 0, 0, 2, 0, 1],
  [5, 0, 4, 5, 2, 0, 2, 0, 1],
  [5, 0, 4, 5, 2, 0, 2, 0, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 0, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1]],
 'edge_index': [[0,
   1,
   1,
 