# Mol-LLM 데이터셋 검증 스크립트

In [None]:
import os
import re
import pandas as pd
from collections import defaultdict, Counter
from tqdm import tqdm
import selfies as sf
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from datasets import load_from_disk, Dataset, enable_progress_bar
from tabulate import tabulate
import multiprocessing

# Turn on the Hugging Face `datasets` progress bars.
enable_progress_bar()

# =============================================================================
# [Configuration]
# =============================================================================
# Tasks that additionally get a scaffold-level leak check during decontamination.
SCAFFOLD_SPLIT_TASKS = {
    "bace", "bbbp", "clintox", "tox21", "toxcast", "sider",
    "hiv", "muv", "esol", "freesolv", "lipo", "hopv",
}

# Row order of the statistics table.
CATEGORY_ORDER = [
    "Property Prediction (Regression)",
    "Property Prediction (Classification)",
    "Forward Reaction Prediction",
    "Retrosynthesis",
    "Reagent Prediction",
    "Molecule Captioning",
    "Description-Guided Molecule Generation",
    "Name Conversion",
]

# Category name -> task names (values of the `task` column) that belong to it.
TASK_MAPPING = {
    "Property Prediction (Regression)": [
        "qm9_homo", "qm9_lumo", "qm9_homo_lumo_gap", "qm9_additional_label",
        "smol-property_prediction-esol", "smol-property_prediction-lipo",
        "smol-property_prediction-freesolv",
        "esol", "lipo", "freesolv",
    ],
    "Property Prediction (Classification)": [
        "bace", "tox21", "toxcast", "clintox", "bbbp", "hiv", "sider", "muv", "hopv",
        "smol-property_prediction-bbbp", "smol-property_prediction-clintox",
        "smol-property_prediction-hiv", "smol-property_prediction-sider",
        "smol-property_prediction-tox21", "smol-property_prediction-toxcast",
        "smol-property_prediction-muv",
    ],
    "Forward Reaction Prediction": ["forward_reaction_prediction", "smol-forward_synthesis"],
    "Retrosynthesis": ["retrosynthesis", "smol-retrosynthesis"],
    "Reagent Prediction": ["reagent_prediction"],
    "Molecule Captioning": ["chebi-20-mol2text", "smol-molecule_captioning"],
    "Description-Guided Molecule Generation": ["chebi-20-text2mol", "smol-molecule_generation"],
    "Name Conversion": [
        "smol-name_conversion-i2s", "smol-name_conversion-s2i",
        "smol-name_conversion-i2f", "smol-name_conversion-s2f",
    ],
}

# Category name -> data source label shown in the statistics table.
DATA_SOURCES = {
    "Property Prediction (Regression)": "MoleculeNet",
    "Property Prediction (Classification)": "MoleculeNet",
    "Forward Reaction Prediction": "USPTO",
    "Retrosynthesis": "USPTO 500MT",
    "Reagent Prediction": "USPTO 500K",
    "Molecule Captioning": "ChEBI-20",
    "Description-Guided Molecule Generation": "ChEBI-20",
    "Name Conversion": "PubChem",
}

def decode_and_get_info(batch):
    """Parse a batch of molecule strings into canonical SMILES and scaffolds.

    Written for ``Dataset.map(..., batched=True)``: it receives a dict of
    columns and returns a dict of new columns of the same length.

    Args:
        batch: Mapping with an ``"input_mol_string"`` entry holding a list of
            molecule strings. Each string is stripped of ``<...>`` tags and
            then passed to ``sf.decoder``. Falsy entries (``""``, ``None``)
            are reported as invalid without being parsed.

    Returns:
        dict with three lists aligned with the input:
        ``"canon_smiles"`` (canonical SMILES or ``""``),
        ``"scaffold"`` (Murcko scaffold SMILES or ``""``) and
        ``"valid"`` (``True`` only when every parsing step succeeded).
    """
    canon_smiles_list, scaffold_list, valid_list = [], [], []
    for input_mol in batch["input_mol_string"]:
        res_smiles, res_scaffold, is_valid = "", "", False
        if input_mol:
            try:
                # Remove every "<...>" tag before handing the string to the decoder.
                clean_str = re.sub(r"<[^>]+>", "", str(input_mol)).strip()
                smiles = sf.decoder(clean_str)
                mol = Chem.MolFromSmiles(smiles) if smiles else None
                if mol:
                    canonical = Chem.MolToSmiles(mol, canonical=True)
                    scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol) or ""
                    # Assign all three together so a failure half-way through
                    # never leaves a SMILES paired with valid=False.
                    res_smiles, res_scaffold, is_valid = canonical, scaffold, True
            except Exception:
                # Best effort: an unparsable molecule is reported as invalid.
                # Only Exception is caught so KeyboardInterrupt / SystemExit
                # can still stop a long multiprocessing run.
                pass
        canon_smiles_list.append(res_smiles)
        scaffold_list.append(res_scaffold)
        valid_list.append(is_valid)
    return {"canon_smiles": canon_smiles_list, "scaffold": scaffold_list, "valid": valid_list}

def map_task_to_category(task_name):
    """Return the first ``TASK_MAPPING`` category that lists ``task_name``.

    Categories are checked in ``TASK_MAPPING`` order; ``"Others"`` is returned
    when no category contains the task.
    """
    matching = (
        category
        for category, members in TASK_MAPPING.items()
        if task_name in members
    )
    return next(matching, "Others")

def final_cleanup_and_stats_fast(train_path, val_path, test_path, base_save_dir, num_cores=32):
    """Deduplicate, decontaminate, summarise and save the dataset splits.

    Phases:
      1. Load each split and drop rows that repeat the same
         (task, input_mol_string, answer column) within that split.
      2. Run ``decode_and_get_info`` over every row to add ``canon_smiles``,
         ``scaffold`` and ``valid`` columns.
      3. Drop train/val rows whose molecule also occurs in the test split of
         the same task: an exact canonical-SMILES match for every task, plus a
         scaffold match for tasks listed in ``SCAFFOLD_SPLIT_TASKS``. Rows
         whose molecule could not be parsed are always kept.
      4. Print a per-category statistics table and save each cleaned split
         (without the helper columns) under ``base_save_dir``.

    A split whose path does not exist is skipped in every phase. When the test
    split is missing, train/val are kept without decontamination.

    Args:
        train_path: Directory of the train split (``load_from_disk`` format).
        val_path: Directory of the validation split.
        test_path: Directory of the test split.
        base_save_dir: Directory under which the cleaned splits are written.
        num_cores: ``num_proc`` passed to ``Dataset.map`` for parsing.

    Raises:
        KeyError: If a loaded split has none of the candidate answer columns.
    """
    print("=== [Phase 1] Loading & Internal Deduplication (Pandas) ===")
    splits = {"train": train_path, "val": val_path, "test": test_path}
    possible_output_cols = ['label', 'output_string', 'output', 'target', 'answers']
    dfs = {}

    for name, path in splits.items():
        if not os.path.exists(path):
            print(f"[Skip] Path not found: {path}")
            continue
        df = load_from_disk(path).to_pandas()

        # Use the first candidate answer column that exists in this split.
        actual_out_col = next((c for c in possible_output_cols if c in df.columns), None)
        if actual_out_col is None:
            raise KeyError(f"[{name}] 정답 컬럼을 찾을 수 없습니다. 목록: {df.columns.tolist()}")

        initial_len = len(df)
        df = df.drop_duplicates(subset=['task', 'input_mol_string', actual_out_col], keep='first').copy()
        dfs[name] = df
        print(f" -> {name.upper()}: {initial_len:,} -> {len(df):,} (Removed: {initial_len - len(df):,})")

    print("\n=== [Phase 2] Molecular Parsing (Multiprocessing) ===")
    for name in ["train", "val", "test"]:
        if name not in dfs:
            continue
        print(f" -> Parsing {name}...")
        temp_ds = Dataset.from_pandas(dfs[name][['task', 'input_mol_string']], preserve_index=False)
        # batch_size is tuned for stability when num_proc is used from a notebook.
        parsed = temp_ds.map(decode_and_get_info, batched=True, batch_size=1000, num_proc=num_cores, desc=f"Mapping {name}")
        parsed_df = parsed.to_pandas()

        # Both frames are re-indexed 0..n-1 so the column-wise concat aligns row by row.
        info_only_df = parsed_df[['canon_smiles', 'scaffold', 'valid']].reset_index(drop=True)
        dfs[name] = pd.concat([dfs[name].reset_index(drop=True), info_only_df], axis=1).copy()

    print("\n=== [Phase 3] Fast Decontamination (Pandas Masking) ===")
    final_dfs = {}
    test_black_smiles, test_black_scaf = {}, {}
    if "test" in dfs:
        test_df = dfs["test"]
        valid_test = test_df[test_df['valid']]
        # Per-task sets give O(1) membership checks in the loop below.
        test_black_smiles = valid_test.groupby('task')['canon_smiles'].apply(set).to_dict()
        test_black_scaf = valid_test.groupby('task')['scaffold'].apply(set).to_dict()
        final_dfs["test"] = test_df
    else:
        # Phase 1 may have skipped the test split; the other phases tolerate
        # missing splits, so this one does too instead of raising KeyError.
        print("[Warn] Test split not loaded; train/val are kept without decontamination.")

    nothing = frozenset()
    for name in ["train", "val"]:
        if name not in dfs:
            continue
        df = dfs[name]

        # A single pass over the zipped columns avoids building one Series per
        # row (as DataFrame.apply(axis=1) does) and also works on an empty frame.
        # NOTE(review): acyclic molecules have the scaffold "", so for scaffold
        # tasks they all match once the test split holds one - confirm intended.
        rows = zip(df['task'], df['canon_smiles'], df['scaffold'], df['valid'])
        leak_flags = [
            bool(valid) and (
                smiles in test_black_smiles.get(task, nothing)
                or (task in SCAFFOLD_SPLIT_TASKS and scaffold in test_black_scaf.get(task, nothing))
            )
            for task, smiles, scaffold, valid in tqdm(rows, total=len(df), desc=f"Filtering {name}")
        ]
        leak_mask = pd.Series(leak_flags, index=df.index, dtype=bool)
        final_dfs[name] = df[~leak_mask].copy()
        print(f" -> {name.upper()} Decontaminated: {len(df):,} -> {len(final_dfs[name]):,}")

    print("\n=== [Phase 4] Statistics & Final Saving ===")
    table_data = []
    totals = {"Train": 0, "Val": 0, "Test": 0, "All": 0}

    for cat in CATEGORY_ORDER:
        cat_counts = {"Train": 0, "Val": 0, "Test": 0}
        tasks_in_cat = TASK_MAPPING[cat]
        for split in ["train", "val", "test"]:
            if split not in final_dfs:
                continue
            split_df = final_dfs[split]
            cat_counts[split.capitalize()] = split_df[split_df['task'].isin(tasks_in_cat)].shape[0]

        # NOTE(review): "# All" is Train + Test and excludes Val - confirm intended.
        n_all = cat_counts["Train"] + cat_counts["Test"]
        table_data.append({
            "Task Category": cat,
            "Data Sources": DATA_SOURCES.get(cat, "-"),
            "# Train": f"{cat_counts['Train']:,}",
            "# Val": f"{cat_counts['Val']:,}",
            "# Test": f"{cat_counts['Test']:,}",
            "# All": f"{n_all:,}"
        })
        for k in ["Train", "Val", "Test"]:
            totals[k] += cat_counts[k]
        totals["All"] += n_all

    print("\n" + "="*120)
    print(tabulate(pd.DataFrame(table_data), headers="keys", tablefmt="github", stralign="right"))
    print("-" * 120)
    print(f"{'Overall':<40} {'':<15} {totals['Train']:>10,} {totals['Val']:>10,} {totals['Test']:>10,} {totals['All']:>10,}")
    print("="*120)

    for name in ["train", "val", "test"]:
        if name not in final_dfs:
            continue
        # The helper columns exist only for decontamination and are not saved.
        final_df = final_dfs[name].drop(columns=["canon_smiles", "scaffold", "valid"])
        save_path = os.path.join(base_save_dir, f"GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_{name}_FINAL_CLEANED")
        Dataset.from_pandas(final_df, preserve_index=False).save_to_disk(save_path)
        print(f"[Saved] {name.upper()} -> {save_path}")

if __name__ == "__main__":
    # Select the multiprocessing start method (the original note recommends
    # this for Unix-like systems). A RuntimeError from the call is ignored.
    try:
        multiprocessing.set_start_method('spawn', force=True)
    except RuntimeError:
        pass

    # Raw input splits and the directory that receives the cleaned splits.
    train_in = "Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_train_3.3M_0415_raw"
    val_in = "Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_validation_3.3M_0415_raw"
    test_in = "Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_test_3.3M_0415_raw"
    save_dir = "Mol-LLM_Custom/dataset/train_official/"
    
    # To avoid hangs, set num_cores to roughly half of the machine's physical cores (e.g. 16-32).
    final_cleanup_and_stats_fast(train_in, val_in, test_in, save_dir, num_cores=16)

=== [Phase 1] Loading & Internal Deduplication (Pandas) ===


Loading dataset from disk:   0%|          | 0/45 [00:00<?, ?it/s]

In [3]:
from datasets import load_from_disk
# Locations of the "deduplicate_CLEANED" versions of the three splits.
train_path = "Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_train_3.3M_0415_deduplicate_CLEANED"
test_path = "Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_test_3.3M_0415_deduplicate_CLEANED"
val_path = "Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_validation_3.3M_0415_deduplicate_CLEANED"

# Load each split from disk.
train_ds = load_from_disk(train_path)
test_ds = load_from_disk(test_path)
val_ds = load_from_disk(val_path)

# Row counts, in (train, test, val) order.
len(train_ds), len(test_ds), len(val_ds)

Loading dataset from disk:   0%|          | 0/45 [00:00<?, ?it/s]

(3464783, 32822, 36016)

In [None]:
import os
import re
import pandas as pd
from collections import defaultdict, Counter
from tqdm import tqdm
import selfies as sf
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from datasets import load_from_disk, Dataset, enable_progress_bar
from tabulate import tabulate
import multiprocessing

# Turn on the Hugging Face `datasets` progress bars.
enable_progress_bar()

# =============================================================================
# [Configuration]
# =============================================================================
# Tasks that additionally get a scaffold-level leak check during decontamination.
SCAFFOLD_SPLIT_TASKS = {
    "bace", "bbbp", "clintox", "tox21", "toxcast", "sider",
    "hiv", "muv", "esol", "freesolv", "lipo", "hopv",
}

# Row order of the final statistics table.
CATEGORY_ORDER = [
    "Property Prediction (Regression)",
    "Property Prediction (Classification)",
    "Forward Reaction Prediction",
    "Retrosynthesis",
    "Reagent Prediction",
    "Molecule Captioning",
    "Description-Guided Molecule Generation",
    "Name Conversion",
]

# Category name -> task names (values of the `task` column) that belong to it.
TASK_MAPPING = {
    "Property Prediction (Regression)": [
        "qm9_homo", "qm9_lumo", "qm9_homo_lumo_gap", "qm9_additional_label",
        "smol-property_prediction-esol", "smol-property_prediction-lipo",
        "smol-property_prediction-freesolv",
        "esol", "lipo", "freesolv",
    ],
    "Property Prediction (Classification)": [
        "bace", "tox21", "toxcast", "clintox", "bbbp", "hiv", "sider", "muv", "hopv",
        "smol-property_prediction-bbbp", "smol-property_prediction-clintox",
        "smol-property_prediction-hiv", "smol-property_prediction-sider",
        "smol-property_prediction-tox21", "smol-property_prediction-toxcast",
        "smol-property_prediction-muv",
    ],
    "Forward Reaction Prediction": ["forward_reaction_prediction", "smol-forward_synthesis"],
    "Retrosynthesis": ["retrosynthesis", "smol-retrosynthesis"],
    "Reagent Prediction": ["reagent_prediction"],
    "Molecule Captioning": ["chebi-20-mol2text", "smol-molecule_captioning"],
    "Description-Guided Molecule Generation": ["chebi-20-text2mol", "smol-molecule_generation"],
    "Name Conversion": [
        "smol-name_conversion-i2s", "smol-name_conversion-s2i",
        "smol-name_conversion-i2f", "smol-name_conversion-s2f",
    ],
}

# =============================================================================
# [Step 2: Molecular Parsing Logic]
# =============================================================================
def decode_and_get_info(batch):
    """Parse a batch of molecule strings; meant for batched ``Dataset.map``.

    Args:
        batch: Mapping with an ``"input_mol_string"`` entry holding a list of
            molecule strings. Each string is stripped of ``<...>`` tags and
            then passed to ``sf.decoder``. Falsy entries (``""``, ``None``)
            are reported as invalid without being parsed.

    Returns:
        dict with three lists aligned with the input:
        ``"canon_smiles"`` (canonical SMILES or ``""``),
        ``"scaffold"`` (Murcko scaffold SMILES or ``""``) and
        ``"valid"`` (``True`` only when every parsing step succeeded).
    """
    canon_smiles_list, scaffold_list, valid_list = [], [], []
    for input_mol in batch["input_mol_string"]:
        res_smiles, res_scaffold, is_valid = "", "", False
        if input_mol:
            try:
                # Remove every "<...>" tag before handing the string to the decoder.
                clean_str = re.sub(r"<[^>]+>", "", str(input_mol)).strip()
                smiles = sf.decoder(clean_str)
                mol = Chem.MolFromSmiles(smiles) if smiles else None
                if mol:
                    canonical = Chem.MolToSmiles(mol, canonical=True)
                    scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol) or ""
                    # Assign all three together so a failure half-way through
                    # never leaves a SMILES paired with valid=False.
                    res_smiles, res_scaffold, is_valid = canonical, scaffold, True
            except Exception:
                # Best effort: an unparsable molecule is reported as invalid.
                # Only Exception is caught so KeyboardInterrupt / SystemExit
                # can still stop a long multiprocessing run.
                pass
        canon_smiles_list.append(res_smiles)
        scaffold_list.append(res_scaffold)
        valid_list.append(is_valid)
    return {"canon_smiles": canon_smiles_list, "scaffold": scaffold_list, "valid": valid_list}

# =============================================================================
# [Main Pipeline]
# =============================================================================
def final_integrated_cleanup(train_path, val_path, test_path, base_save_dir, num_cores=24):
    """Parse, decontaminate, cross-dataset deduplicate and save the splits.

    Steps:
      2. Load every split and run ``decode_and_get_info`` over it to add
         ``canon_smiles``, ``scaffold`` and ``valid`` columns.
      3. For train and val, mark rows whose molecule also occurs in the test
         split of the same task (exact canonical-SMILES match for every task,
         scaffold match for tasks in ``SCAFFOLD_SPLIT_TASKS``), drop them, then
         drop rows that repeat the same (input_mol_string, answer) pair
         regardless of task.
      4. Print the per-task drop reasons and the per-category statistics, then
         save each split (without the helper columns) under ``base_save_dir``.

    Args:
        train_path: Directory of the train split (``load_from_disk`` format).
        val_path: Directory of the validation split.
        test_path: Directory of the test split.
        base_save_dir: Directory under which the cleaned splits are written.
        num_cores: ``num_proc`` passed to ``Dataset.map`` for parsing.

    Raises:
        KeyError: If the train or val split has none of the candidate answer
            columns.
    """
    print("=== [Step 2] Molecular Parsing with HF Multiprocessing ===")
    splits = {"train": train_path, "val": val_path, "test": test_path}
    dfs = {}

    for name, path in splits.items():
        print(f" -> Processing {name} split...")
        ds = load_from_disk(path)

        # 1. Parse in parallel using only the columns the parser needs.
        parsed_info = ds.select_columns(['task', 'input_mol_string']).map(
            decode_and_get_info, batched=True, batch_size=2000, num_proc=num_cores, desc=f"Parsing {name}"
        )

        # 2. Join the parsed columns onto the original rows by position.
        df_raw = ds.to_pandas()
        df_info = parsed_info.to_pandas()[['canon_smiles', 'scaffold', 'valid']]
        dfs[name] = pd.concat([df_raw.reset_index(drop=True), df_info.reset_index(drop=True)], axis=1)

    print("\n=== [Step 3] Fast Decontamination & Cross-Dataset Dedup ===")
    # task -> Counter of "<SPLIT> - <drop reason>", used for the drop report.
    drop_log = defaultdict(Counter)

    # Per-task sets built from the valid test rows give O(1) membership checks.
    test_df = dfs["test"]
    valid_test = test_df[test_df['valid']]
    test_black_smiles = valid_test.groupby('task')['canon_smiles'].apply(set).to_dict()
    test_black_scaf = valid_test.groupby('task')['scaffold'].apply(set).to_dict()

    final_dfs = {"test": dfs["test"]}
    nothing = frozenset()
    answer_candidates = ['label', 'output_string', 'output', 'target', 'answers']

    for name in ["train", "val"]:
        df = dfs[name]
        # Fail early and explicitly instead of falling back to a column name
        # that may not exist and surfacing only later inside drop_duplicates.
        out_col = next((c for c in answer_candidates if c in df.columns), None)
        if out_col is None:
            raise KeyError(f"[{name}] no answer column found; columns: {df.columns.tolist()}")

        # A. Leak check against the test split. One pass over the zipped
        # columns avoids building a Series per row and works on an empty frame.
        statuses = []
        rows = zip(df['task'], df['canon_smiles'], df['scaffold'], df['valid'])
        for task, smiles, scaffold, valid in tqdm(rows, total=len(df), desc=f"Checking leaks in {name}"):
            if not valid:
                status = "Keep"  # unparsable molecules are never dropped
            elif smiles in test_black_smiles.get(task, nothing):
                status = "Drop: Exact Match"
            elif task in SCAFFOLD_SPLIT_TASKS and scaffold in test_black_scaf.get(task, nothing):
                status = "Drop: Scaffold Match"
            else:
                status = "Keep"
            statuses.append(status)
            if status != "Keep":
                drop_log[task][f"{name.upper()} - {status}"] += 1
        df['status'] = statuses

        # B. Cross-dataset dedup (e.g. retrosynthesis vs smol-retrosynthesis):
        # rows with the same input and answer count as duplicates even when
        # their task differs; only the first occurrence is kept.
        df_clean = df[df['status'] == "Keep"].copy()
        initial_sub_count = len(df_clean)
        df_unique = df_clean.drop_duplicates(subset=['input_mol_string', out_col], keep='first')

        removed_dups = initial_sub_count - len(df_unique)
        print(f" -> {name.upper()}: Removed {removed_dups:,} cross-dataset duplicates.")

        final_dfs[name] = df_unique

    print("\n=== [Step 4] Final Report & Saving ===")
    # 1. Detailed drop reasons per task.
    if drop_log:
        print("\n[Detailed Drop Reasons by Task]")
        print(tabulate(pd.DataFrame(drop_log).T.fillna(0), headers="keys", tablefmt="grid"))

    # 2. Per-category statistics table.
    table_data = []
    for cat in CATEGORY_ORDER:
        tasks_in_cat = TASK_MAPPING[cat]
        counts = {
            s: final_dfs[s][final_dfs[s]['task'].isin(tasks_in_cat)].shape[0]
            for s in ["train", "val", "test"]
        }
        # NOTE(review): "# All" is Train + Test and excludes Val - confirm intended.
        table_data.append({
            "Task": cat, "# Train": f"{counts['train']:,}", "# Val": f"{counts['val']:,}",
            "# Test": f"{counts['test']:,}", "# All": f"{counts['train']+counts['test']:,}"
        })

    print("\n" + tabulate(table_data, headers="keys", tablefmt="github", stralign="right"))

    for name, df in final_dfs.items():
        save_path = os.path.join(base_save_dir, f"GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_{name}_FINAL_CLEANED")
        # Drop the analysis-only columns before saving.
        final_ds = Dataset.from_pandas(df.drop(columns=['canon_smiles', 'scaffold', 'valid', 'status'], errors='ignore'), preserve_index=False)
        final_ds.save_to_disk(save_path)
        print(f"[Saved] {save_path}")

if __name__ == "__main__":
    # Raw input splits and the directory that receives the cleaned splits.
    train_in = "Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_train_3.3M_0415_raw"
    val_in = "Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_validation_3.3M_0415_raw"
    test_in = "Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_test_3.3M_0415_raw"
    save_dir = "Mol-LLM_Custom/dataset/train_official/"
    
    final_integrated_cleanup(train_in, val_in, test_in, save_dir, num_cores=32)

In [None]:
import pandas as pd
from collections import Counter
from tabulate import tabulate

# =============================================================================
# [1. Configuration: task-group mapping]
# =============================================================================
# Group name -> individual task names that belong to the group.
TASK_GROUP_MAPPING = {
    "Property Prediction (Regression)": [
        "qm9_homo",
        "qm9_lumo",
        "qm9_homo_lumo_gap",
        "qm9_additional_label",
        "qm9_dipole_moment",
        "qm9_isotropic_polarizability",
        "qm9_electronic_spatial_extent",
        "qm9_zero_point_vibrational_energy",
        "qm9_heat_capacity_298K",
        "qm9_internal_energy_298K",
        "qm9_enthalpy_298K",
        "qm9_free_energy_298K",
        "esol",
        "lipo",
        "freesolv",
        "smol-property_prediction-esol",
        "smol-property_prediction-lipo",
        "smol-property_prediction-freesolv",
    ],
    "Property Prediction (Classification)": [
        "bace",
        "bbbp",
        "clintox",
        "tox21",
        "toxcast",
        "sider",
        "hiv",
        "muv",
        "hopv",
        "smol-property_prediction-bbbp",
        "smol-property_prediction-clintox",
        "smol-property_prediction-hiv",
        "smol-property_prediction-sider",
        "smol-property_prediction-tox21",
        "smol-property_prediction-toxcast",
        "smol-property_prediction-muv",
    ],
    "Forward Reaction Prediction": ["forward_reaction_prediction", "smol-forward_synthesis"],
    "Retrosynthesis": ["retrosynthesis", "smol-retrosynthesis"],
    "Reagent Prediction": ["reagent_prediction"],
    "Molecule Captioning": ["chebi-20-mol2text", "smol-molecule_captioning"],
    "Description-Guided Molecule Generation": ["chebi-20-text2mol", "smol-molecule_generation"],
    "Name Conversion": [
        "smol-name_conversion-i2s",
        "smol-name_conversion-s2i",
        "smol-name_conversion-i2f",
        "smol-name_conversion-s2f",
    ],
}

# Reverse lookup: task name -> group name.
TASK_TO_GROUP = {
    member: group_name
    for group_name, members in TASK_GROUP_MAPPING.items()
    for member in members
}

# =============================================================================
# [2. Input data]
# =============================================================================
# Load the raw train/test/validation splits from disk.
# NOTE: this cell does not import `load_from_disk` itself; the name must
# already be in scope (earlier cells import it from `datasets`).
train_ds = load_from_disk('Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_train_3.3M_0415_raw')
test_ds = load_from_disk('Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_test_3.3M_0415_raw')
val_ds = load_from_disk('Mol-LLM_Custom/dataset/train_official/GSAI-ML-LLaDA-8B-Instruct_string+graph_q32_validation_3.3M_0415_raw')

# Number of rows per task name in each split.
train_counts = Counter(train_ds['task'])
val_counts = Counter(val_ds['task']) 
test_counts = Counter(test_ds['task'])

# =============================================================================
# [3. Aggregation logic]
# =============================================================================
def generate_group_statistics(train_cnt, val_cnt, test_cnt, group_mapping=None):
    """Aggregate per-task sample counts into one row per task group.

    Args:
        train_cnt: Mapping of task name -> number of train samples
            (e.g. a ``collections.Counter``).
        val_cnt: Same, for the validation split.
        test_cnt: Same, for the test split.
        group_mapping: Optional mapping of group name -> iterable of task
            names. Defaults to the module-level ``TASK_GROUP_MAPPING``.

    Returns:
        pandas.DataFrame with one row per group, in ``group_mapping`` order,
        and the columns ``Task Group``, ``Included Tasks``, ``Train``, ``Val``,
        ``Test`` and ``Train + Test``. ``Included Tasks`` lists, sorted and
        comma-separated, the group's tasks that occur in at least one split
        (``"-"`` when none do). Count columns are strings with thousands
        separators. Tasks that belong to no group are not reported.
    """
    if group_mapping is None:
        group_mapping = TASK_GROUP_MAPPING

    columns = ["Task Group", "Included Tasks", "Train", "Val", "Test", "Train + Test"]

    # Reverse lookup task -> group. If a task is listed under several groups
    # the last group wins, which is how the module-level reverse map is built.
    task_to_group = {
        task: group_name
        for group_name, tasks in group_mapping.items()
        for task in tasks
    }

    # Every task seen in any split; computed once rather than once per group.
    all_keys = set(train_cnt) | set(val_cnt) | set(test_cnt)

    # Bucket the observed tasks by group in a single pass.
    tasks_by_group = {group_name: [] for group_name in group_mapping}
    for task in all_keys:
        group_name = task_to_group.get(task)
        if group_name is not None:
            tasks_by_group[group_name].append(task)

    table_data = []
    for group_name in group_mapping:
        included_tasks = sorted(tasks_by_group[group_name])
        g_train = sum(train_cnt.get(task, 0) for task in included_tasks)
        g_val = sum(val_cnt.get(task, 0) for task in included_tasks)
        g_test = sum(test_cnt.get(task, 0) for task in included_tasks)
        table_data.append({
            "Task Group": group_name,
            "Included Tasks": ", ".join(included_tasks) if included_tasks else "-",
            # Counts are formatted with thousands separators for display.
            "Train": f"{g_train:,}",
            "Val": f"{g_val:,}",
            "Test": f"{g_test:,}",
            "Train + Test": f"{g_train + g_test:,}",
        })

    # Passing the columns explicitly keeps the header even with zero groups.
    return pd.DataFrame(table_data, columns=columns)

# =============================================================================
# [4. Run and print]
# =============================================================================
# Build the per-group table from the three task counters.
result_df = generate_group_statistics(train_counts, val_counts, test_counts)

# Print the table between 120-character rules.
print("\n" + "="*120)
print("Dataset Statistics by Task Group")
print("="*120)
print(tabulate(result_df, headers="keys", tablefmt="grid", showindex=False))
print("="*120)