In [1]:
from datasets import load_from_disk, load_dataset
import pandas as pd
import sys

# 데이터셋 로드 및 필터링 (기존 코드)
writer_test_dataset = load_from_disk("Mol-LLM_Custom/dataset/real_train(download_v1)/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_writer")
writer_forward_reaction_prediction_test_dataset = writer_test_dataset.filter(lambda x : 'forward_reaction_prediction' in x['task'] )

download_test_dataset = load_from_disk("Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415")
download_forward_reaction_prediction_test_dataset = download_test_dataset.filter(lambda x : 'forward_reaction_prediction' in x['task'] )

  from .autonotebook import tqdm as notebook_tqdm
Filter: 100%|██████████| 3010/3010 [00:01<00:00, 1768.63 examples/s]


In [11]:
for key, value in sorted(writer_forward_reaction_prediction_test_dataset[0].items()):
    print(f"{key}: {value}")

additional_edge_attr: [[1, 0, 1], [1, 0, 1], [0, 0, 0], [0, 0, 0], [0, 0, 1], [0, 0, 1], [1, 0, 1], [1, 0, 1], [0, 0, 0], [0, 0, 0], [1, 0, 1], [1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 1], [0, 0, 1], [1, 0, 1], [1, 0, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
additional_edge_index: [[0, 1, 1, 2, 1, 3, 3, 4, 3, 5, 6, 7, 7, 8, 7, 9, 9, 10, 10, 11, 11, 12, 11, 13, 11, 14, 15, 16, 16, 17, 16, 18, 18, 19, 20, 21, 21, 22], [1, 0, 2, 1, 3, 1, 4, 3, 5, 3, 7, 6, 8, 7, 9, 7, 10, 9, 11, 10, 12, 11, 13, 11, 14, 11, 16, 15, 17, 16, 18, 16, 19, 18, 21, 20, 22, 21]]
additional_x: [[7, 0, 1, 5, 0, 0, 1, 0, 0], [5, 0, 3, 5, 0, 0, 1, 0, 0], [16, 0, 1, 5, 0, 0, 2, 0, 0], [5, 0, 3, 5, 0, 0, 1, 0, 0], [7, 0, 1, 5, 0, 0, 1, 0, 0], [16, 0, 1, 5, 0, 0, 2, 0, 0], [7, 0, 1, 5, 0, 0, 1, 0, 0], [5, 0, 3, 5, 0, 0, 1, 0, 0], [7, 0, 2, 5,

In [16]:
import pandas as pd
from datasets import load_from_disk
from tqdm import tqdm
import re

# 1. 설정 및 데이터 로드
writer_path = "Mol-LLM_Custom/dataset/real_train(download_v1)/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_writer"
my_path = "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415"

print("Loading Datasets...")
writer_dataset = load_from_disk(writer_path)
my_dataset = load_from_disk(my_path)

print(f"Writer Dataset Size: {len(writer_dataset)}")
print(f"My Dataset Size: {len(my_dataset)}")

# 2. 타겟 Task 정의 (비교하고 싶은 Task 키워드)
target_keywords = [
    "forward_reaction_prediction",
    "qm9_homo",
    "qm9_homo_lumo_gap", # 순서 중요: qm9_homo가 qm9_homo_lumo_gap에 매칭되지 않도록 긴 것을 먼저 쓰거나 정확히 일치시킴
    "qm9_lumo"
]


def get_task_key(task_name):
    # 데이터셋의 task 이름(예: presto-forward...)에서 핵심 키워드만 추출하여 통일
    for keyword in target_keywords:
        if keyword in task_name:
            # qm9_homo와 qm9_homo_lumo_gap 구분을 위해 정확한 매칭 확인
            if keyword == "qm9_homo" and "gap" in task_name:
                continue
            return keyword
    return None

# 4. Writer 데이터셋 Indexing (Hash Map 구축)
# Key: (Task_Keyword, Normalized_Input_String) -> Value: Data Instance
print("\nIndexing Writer Dataset...")
writer_index = {}

for i in tqdm(range(len(writer_dataset)), desc="Indexing Writer"):
    item = writer_dataset[i]
    raw_task = item['task']
    
    task_key = get_task_key(raw_task)
    if task_key:
        norm_input = item['input_mol_string']
        # 검색 속도를 위해 Dictionary에 저장
        writer_index[(task_key, norm_input)] = {
            "target_text": item['target_text'],
            "original_task": raw_task,
            "index": i
        }

# 5. 내 데이터셋과 비교 (Compare)
print("\nComparing My Dataset against Writer Index...")
matches = []
matched_count = 0
missing_count = 0

for i in tqdm(range(len(my_dataset)), desc="Scanning My Dataset"):
    item = my_dataset[i]
    raw_task = item['task']
    
    task_key = get_task_key(raw_task)
    if task_key:
        norm_input = item['input_mol_string']
        
        # Writer Index에 존재하는지 확인
        if (task_key, norm_input) in writer_index:
            ref_data = writer_index[(task_key, norm_input)]
            
            # 결과 저장
            matches.append({
                "Task Group": task_key,
                "My Task Name": raw_task,
                "Match Found": True,
                "Input Molecule (Snippet)": norm_input, # 너무 기니까 잘라서 표시
                "Writer Target": ref_data['target_text'],
                "My Target": item['target_text'],
                "Target Match": ref_data['target_text'] == item['target_text'] # Target 값도 완전히 같은지 확인
            })
            matched_count += 1
        else:
            # 내 데이터에는 있는데 Writer에는 없는 경우 (필요시 주석 해제하여 확인)
            # matches.append({
            #     "Task Group": task_key,
            #     "My Task Name": raw_task,
            #     "Match Found": False,
            #     "Input Molecule (Snippet)": norm_input[:50] + "...",
            #     "Writer Target": "N/A",
            #     "My Target": item['target_text'],
            #     "Target Match": False
            # })
            missing_count += 1

# 6. 결과 출력 (Pandas DataFrame)
df = pd.DataFrame(matches)

print("\n" + "="*50)
print(f" Comparison Summary")
print("="*50)
print(f"Total Target Items in My Dataset: {matched_count + missing_count}")
print(f"Matched with Writer Dataset:      {matched_count}")
print(f"Missing in Writer Dataset:        {missing_count}")
print("="*50)

if not df.empty:
    print("\n[Sample Matches (First 10 rows)]")
    # 가독성을 위해 컬럼 선택 및 출력 옵션 조정
    pd.set_option('display.max_colwidth', 50)
    display_cols = ["Task Group", "Input Molecule (Snippet)", "Writer Target", "My Target", "Target Match"]
    print(df[display_cols].head(10).to_markdown(index=False))

    # Target 값이 서로 다른 경우가 있는지 확인
    mismatch_targets = df[df["Target Match"] == False]
    if not mismatch_targets.empty:
        print("\n[Warning] Input matched but Target value is different:")
        print(mismatch_targets[display_cols].head().to_markdown(index=False))
    else:
        print("\n[Success] All matched inputs have identical target values.")
        
    # 결과를 CSV로 저장 (선택 사항)
    # df.to_csv("dataset_comparison_result.csv", index=False)
    # print("\nDetailed result saved to 'dataset_comparison_result.csv'")
else:
    print("No matches found for the specified tasks.")

Loading Datasets...
Writer Dataset Size: 58757
My Dataset Size: 3010

Indexing Writer Dataset...


Indexing Writer: 100%|██████████| 58757/58757 [00:32<00:00, 1834.87it/s]



Comparing My Dataset against Writer Index...


Scanning My Dataset: 100%|██████████| 3010/3010 [00:01<00:00, 1780.94it/s]


 Comparison Summary
Total Target Items in My Dataset: 3010
Matched with Writer Dataset:      0
Missing in Writer Dataset:        3010
No matches found for the specified tasks.





In [18]:
import pandas as pd
from datasets import load_from_disk
from tqdm import tqdm
import re
import difflib

# 1. 설정 및 데이터 로드
writer_path = "Mol-LLM_Custom/dataset/real_train(download_v1)/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_writer"
my_path = "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415"

print("Loading Datasets...")
writer_dataset = load_from_disk(writer_path)
my_dataset = load_from_disk(my_path)

# 2. 타겟 Task 정의
target_keywords = [
    "forward_reaction_prediction",
    "qm9_homo",
    "qm9_homo_lumo_gap",
    "qm9_lumo"
]

# 3. 연결고리가 될 Normalize 함수 (이걸 거치면 같아진다고 가정)
def normalize_input(text):
    if text is None: return ""
    # 태그 제거 및 공백 정규화
    text_clean = text.replace("<SELFIES>", "").replace("</SELFIES>", "")
    return re.sub(r'\s+', ' ', text_clean).strip()

def get_task_key(task_name):
    for keyword in target_keywords:
        if keyword in task_name:
            if keyword == "qm9_homo" and "gap" in task_name: continue
            return keyword
    return None

# 4. Writer 데이터셋 Indexing (Key: Normalized String -> Value: Raw String)
print("\nIndexing Writer Dataset...")
writer_map = {}

for i in tqdm(range(len(writer_dataset)), desc="Indexing"):
    item = writer_dataset[i]
    raw_task = item['task']
    task_key = get_task_key(raw_task)
    
    if task_key:
        norm_key = normalize_input(item['input_mol_string'])
        # Raw String을 저장해둠
        writer_map[(task_key, norm_key)] = item['input_mol_string']

# 5. 불일치 원인 상세 분석
print("\nAnalyzing Mismatches...")
analysis_results = []
inspect_count = 0

for i in tqdm(range(len(my_dataset)), desc="Scanning"):
    item = my_dataset[i]
    raw_task = item['task']
    task_key = get_task_key(raw_task)
    
    if task_key:
        my_raw = item['input_mol_string']
        norm_key = normalize_input(my_raw)
        
        # 1. Normalize로는 매칭이 되는 경우 (Recovered Match)
        if (task_key, norm_key) in writer_map:
            writer_raw = writer_map[(task_key, norm_key)]
            
            # 2. 하지만 Raw String은 서로 다른 경우 -> 분석 대상
            if my_raw != writer_raw:
                inspect_count += 1
                
                # 어디서부터 달라지는지 인덱스 찾기
                diff_idx = 0
                min_len = min(len(my_raw), len(writer_raw))
                while diff_idx < min_len and my_raw[diff_idx] == writer_raw[diff_idx]:
                    diff_idx += 1
                
                # 차이점 기록
                analysis_results.append({
                    "Task": raw_task,
                    "Index": i,
                    "Diff Index": diff_idx,
                    "My Raw (Snippet)": repr(my_raw[diff_idx-10:diff_idx+20]), # repr로 공백 문자 확인
                    "Writer Raw (Snippet)": repr(writer_raw[diff_idx-10:diff_idx+20]),
                    "Full My Raw": my_raw,
                    "Full Writer Raw": writer_raw
                })
        else:
            # 아예 Normalize로도 매칭 안되는 경우 (데이터 누락 등)
            pass

# 6. 결과 리포트
print("\n" + "="*60)
print(f" Mismatch Analysis Report")
print("="*60)
print(f"Total Mismatches found (Recovered via Norm): {len(analysis_results)}")

if len(analysis_results) > 0:
    print("\n[Top 5 Mismatch Examples - Character Level]")
    print("Tip: Look at 'repr()' output. '\\n' is newline, ' ' is space.")
    
    for idx, row in enumerate(analysis_results[:30]):
        print(f"\n--- Example {idx+1} (Task: {row['Task']}) ---")
        print(f"Difference starts at index: {row['Diff Index']}")
        
        # repr()을 사용하여 눈에 안보이는 문자까지 출력
        print(f"My Raw Snippet    : {row['My Raw (Snippet)']}")
        print(f"Writer Raw Snippet: {row['Writer Raw (Snippet)']}")
        
        # 전체 문자열 길이 비교
        print(f"Length -> My: {len(row['Full My Raw'])}, Writer: {len(row['Full Writer Raw'])}")
        
        # 간단한 설명 추론
        m_snip = row['My Raw (Snippet)']
        w_snip = row['Writer Raw (Snippet)']
        if "<SELFIES> " in m_snip and "<SELFIES>[" in w_snip:
            print(">> DIAGNOSIS: Spacing after tag (<SELFIES> vs <SELFIES>[)")
        elif " " in m_snip and "  " in w_snip:
            print(">> DIAGNOSIS: Double space issue")
        elif "\\n" in m_snip or "\\n" in w_snip:
            print(">> DIAGNOSIS: Newline character difference")
            
else:
    print("Great! No discrepancies found between normalized matches.")

Loading Datasets...

Indexing Writer Dataset...


Indexing: 100%|██████████| 58757/58757 [00:32<00:00, 1805.32it/s]



Analyzing Mismatches...


Scanning: 100%|██████████| 3010/3010 [00:01<00:00, 1731.76it/s]


 Mismatch Analysis Report
Total Mismatches found (Recovered via Norm): 2328

[Top 5 Mismatch Examples - Character Level]
Tip: Look at 'repr()' output. '\n' is newline, ' ' is space.

--- Example 1 (Task: forward_reaction_prediction) ---
Difference starts at index: 9
My Raw Snippet    : ''
Writer Raw Snippet: ''
Length -> My: 161, Writer: 163

--- Example 2 (Task: forward_reaction_prediction) ---
Difference starts at index: 9
My Raw Snippet    : ''
Writer Raw Snippet: ''
Length -> My: 172, Writer: 174

--- Example 3 (Task: forward_reaction_prediction) ---
Difference starts at index: 9
My Raw Snippet    : ''
Writer Raw Snippet: ''
Length -> My: 336, Writer: 338

--- Example 4 (Task: forward_reaction_prediction) ---
Difference starts at index: 9
My Raw Snippet    : ''
Writer Raw Snippet: ''
Length -> My: 346, Writer: 348

--- Example 5 (Task: forward_reaction_prediction) ---
Difference starts at index: 9
My Raw Snippet    : ''
Writer Raw Snippet: ''
Length -> My: 1039, Writer: 1041

--- 




In [3]:
import pandas as pd
from datasets import load_from_disk
from tqdm import tqdm
import re

# 1. 경로 설정
writer_path = "Mol-LLM_Custom/dataset/real_train(download_v1)/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415_writer"
my_path = "Mol-LLM_Custom/dataset/real_train/mistralai-Mistral-7B-Instruct-v0.3_string+graph_q32_test_3.3M_0415"

print("Loading Datasets...")
writer_dataset = load_from_disk(writer_path)
my_dataset = load_from_disk(my_path)

# 2. 비교를 위한 타겟 Task 및 Normalize 함수
target_keywords = ["forward_reaction_prediction", "qm9_homo", "qm9_homo_lumo_gap", "qm9_lumo"]

# def normalize_input(text):
#     if text is None: return ""
#     # 태그, 공백, 줄바꿈 모두 제거하여 순수 텍스트만 남김
#     text = text.replace("<SELFIES>", "").replace("</SELFIES>", "")
#     return re.sub(r'\s+', '', text).strip()

def get_task_key(task_name):
    for keyword in target_keywords:
        if keyword in task_name:
            if keyword == "qm9_homo" and "gap" in task_name: continue
            return keyword
    return None

# 3. Writer 데이터셋 Indexing (매칭 짝을 찾기 위함)
print("Indexing Writer Dataset...")
writer_map = {}
# 너무 오래 걸리지 않게, 비교용으로 앞쪽 5000개만 스캔해서 Indexing (충분함)
for i in range(min(len(writer_dataset), 10000)):
    item = writer_dataset[i]
    task_key = get_task_key(item['task'])
    if task_key:
        norm_key = item['input_mol_string']
        writer_map[(task_key, norm_key)] = item['input_mol_string']

# 4. 내 데이터셋을 순회하며 불일치 샘플 전체 출력
print("\nSearching for mismatches...")
found_count = 0

for i in range(len(my_dataset)):
    item = my_dataset[i]
    task_key = get_task_key(item['task'])
    
    if task_key:
        my_raw = item['input_mol_string']
        norm_key = my_raw
        
        # Normalize하면 같은데, Raw가 다른 경우 발견!
        if (task_key, norm_key) in writer_map:
            writer_raw = writer_map[(task_key, norm_key)]
            
            if my_raw != writer_raw:
                found_count += 1
                
                print("\n" + "="*80)
                print(f" MISMATCH FOUND (Sample Index: {i}) | Task: {item['task']}")
                print("="*80)
                
                # 차이나는 지점 찾기
                diff_idx = 0
                min_len = min(len(my_raw), len(writer_raw))
                while diff_idx < min_len and my_raw[diff_idx] == writer_raw[diff_idx]:
                    diff_idx += 1
                
                print(f">> Difference starts at index: {diff_idx}")
                print(f">> Character at diff index (Writer): {repr(writer_raw[diff_idx]) if diff_idx < len(writer_raw) else 'End of String'}")
                print(f">> Character at diff index (My)    : {repr(my_raw[diff_idx]) if diff_idx < len(my_raw) else 'End of String'}")
                
                print("-" * 80)
                print(" [WRITER DATASET SAMPLE - FULL CONTENT]")
                print("-" * 80)
                # repr()을 사용하여 공백, 줄바꿈 등을 명시적으로 출력
                print(repr(writer_raw))
                
                print("\n" + "-" * 80)
                print(" [MY DATASET SAMPLE - FULL CONTENT]")
                print("-" * 80)
                print(repr(my_raw))
                print("="*80 + "\n")
                
                # 하나만 보고 싶으면 break, 몇 개 더 보고 싶으면 숫자 조절
                if found_count >= 1: 
                    print("Stopped after printing 1 example.")
                    break

if found_count == 0:
    print("No mismatches found in the scanned range.")

Loading Datasets...
Indexing Writer Dataset...

Searching for mismatches...
No mismatches found in the scanned range.


In [4]:
my_dataset[0]

{'x': [[5, 0, 4, 5, 3, 0, 2, 0, 0],
  [5, 0, 4, 5, 2, 0, 2, 0, 0],
  [52, 0, 1, 5, 0, 0, 2, 0, 0],
  [7, 0, 1, 5, 0, 0, 1, 0, 0],
  [5, 0, 3, 5, 0, 0, 1, 0, 0],
  [7, 0, 2, 5, 1, 0, 1, 0, 0],
  [5, 0, 3, 5, 0, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 1, 0, 1, 1, 1],
  [5, 0, 3, 5, 0, 0, 1, 1, 1],
  [34, 0, 1, 5, 0, 0, 2, 0, 0],
  [5, 0, 4, 5, 3, 0, 2, 0, 0],
  [6, 0, 3, 5, 0, 0, 1, 0, 0],
  [5, 0, 4, 5, 3, 0, 2, 0, 0],
  [5, 0, 3, 5, 1, 0, 1, 0, 0],
  [7, 0, 1, 5, 0, 0, 1, 0, 0],
  [7, 0, 2, 5, 2, 0, 2, 0, 0],
  [7, 0, 1, 5, 0, 0, 1, 0, 0],
  [5, 0, 3, 5, 0, 0, 1, 0, 0],
  [7, 0, 1, 4, 0, 0, 1, 0, 0],
  [7, 0, 2, 5, 1, 0, 1, 0, 0],
  [10, 0, 0, 6, 0, 0, 5, 0, 0]],
 'edge_index': [[0,
   1,
   1,
   2,
   3,
   4,
   4,
   5,
   4,
   6,
   6,
   7,
   7,
   8,
   8,
   9,
   9,
   10,
   10,
   11,
   11,
   12,
   13,
   14,
   14,
   15,
   14,
   16,
   16,
   17,
   19,
   20,
   20,
   21,
   20,
   22

In [None]:
from dataset import load_from_disk