In [5]:
import glob
import os
import re
from collections import defaultdict

import pandas as pd

# Define mapping to match acronyms from your LaTeX table
model_name_map = {
    "Linear Regression": "Linear",
    "Ridge Regression": "Ridge",
    "Lasso Regression": "Lasso",
    "ElasticNet Regression": "ElasticNet",
    "Bayesian Ridge Regression": "Bayesian Ridge",
    "Stochastic Gradient Descent": "SGD",
    "Support Vector Regression": "SVR",
    "Gaussian Process": "GP",
    "Decision Tree": "Decision Tree",
    "Random Forest": "Random Forest",
    "Extra Trees": "Extra Trees",
    "AdaBoost": "AdaBoost",
    "Gradient Boosting": "Gradient Boosting",
    "XGBRegressor": "XGB",
}

# Use this to enforce model row order
model_order = [
    "Linear", "Ridge", "Lasso", "ElasticNet", "Bayesian Ridge",
    "SGD", "SVR", "GP", "Decision Tree", "Random Forest",
    "Extra Trees", "AdaBoost", "Gradient Boosting", "XGB"
]

# Strategy column order
strategies = ["No FP", "Zero-512", "Zero-1024", "Zero-2048", "Drop-512", "Drop-1024", "Drop-2048"]

# Match file names
pattern = re.compile(r"OneHotEncoder_(?P<fp>no_fp|morgan_fp_(?P<imp>zeros|nan)_(?P<size>\d+))_seed_\d+\.csv")

# Read all data
data = defaultdict(lambda: defaultdict(list))  # data[model][strategy] = list of LB WPCs

for filepath in glob.glob("./*.csv"):
    filename = filepath.split("/")[-1]
    match = pattern.match(filename)
    if not match:
        continue

    fp = match.group("fp")
    imp = match.group("imp")
    size = match.group("size")

    if fp == "no_fp":
        strategy = "No FP"
    elif imp == "zeros":
        strategy = f"Zero-{size}"
    elif imp == "nan":
        strategy = f"Drop-{size}"
    else:
        continue

    df = pd.read_csv(filepath)
    for _, row in df.iterrows():
        model = model_name_map[row["Model"]]
        lb_wpc = row["LB WPC"]
        data[model][strategy].append(lb_wpc)

# Create LaTeX table
header = r"""\begin{table}[ht]
\centering
\footnotesize
\caption{\textbf{LB WPC comparison with and without Morgan fingerprints using different imputation strategies.} Results are reported as mean (standard deviation) over 10 random seeds.}
\begin{tabular}{lccccccc}
\toprule
Model & No FP & Zero-512 & Zero-1024 & Zero-2048 & Drop-512 & Drop-1024 & Drop-2048 \\
\midrule
"""

body = ""
for model in model_order:
    row_vals = []
    for strat in strategies:
        vals = data[model].get(strat, [])
        if vals:
            mean = pd.Series(vals).mean()
            std = pd.Series(vals).std()
            row_vals.append((mean, std))
        else:
            row_vals.append(None)

    # Identify best mean
    valid_means = [(i, val[0]) for i, val in enumerate(row_vals) if val is not None]
    best_idx = max(valid_means, key=lambda x: x[1])[0] if valid_means else None

    # Build row
    row = [model]
    for i, val in enumerate(row_vals):
        if val is None:
            row.append("-")
        else:
            mean, std = val
            formatted = f"{mean:.3f} ({std:.3f})"
            if i == best_idx:
                formatted = r"\textbf{" + formatted + "}"
            row.append(formatted)
    body += " & ".join(row) + r" \\" + "\n"

footer = r"""\bottomrule
\end{tabular}
\end{table}"""

# Output LaTeX table
print(header + body + footer)


\begin{table}[ht]
\centering
\footnotesize
\caption{\textbf{LB WPC comparison with and without Morgan fingerprints using different imputation strategies.} Results are reported as mean (standard deviation) over 10 random seeds.}
\begin{tabular}{lccccccc}
\toprule
Model & No FP & Zero-512 & Zero-1024 & Zero-2048 & Drop-512 & Drop-1024 & Drop-2048 \\
\midrule
Linear & \textbf{0.268 (0.000)} & 0.254 (0.000) & 0.258 (0.000) & 0.225 (0.000) & 0.259 (0.000) & 0.220 (0.000) & 0.244 (0.000) \\
Ridge & 0.257 (0.000) & 0.266 (0.000) & 0.266 (0.000) & 0.266 (0.000) & 0.281 (0.000) & 0.281 (0.000) & \textbf{0.281 (0.000)} \\
Lasso & 0.099 (0.000) & 0.121 (0.000) & 0.130 (0.000) & 0.127 (0.000) & 0.133 (0.000) & \textbf{0.156 (0.000)} & 0.156 (0.000) \\
ElasticNet & 0.113 (0.000) & 0.113 (0.000) & 0.117 (0.000) & 0.117 (0.000) & 0.170 (0.000) & \textbf{0.183 (0.000)} & 0.182 (0.000) \\
Bayesian Ridge & \textbf{0.268 (0.000)} & 0.133 (0.000) & 0.121 (0.000) & 0.116 (0.000) & 0.189 (0.000) & 0.189 (0.