In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from ms_pred.common.plot_utils import *
set_style()

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
dataset_names = ["nist20", "canopus_train_public"]
outfolder = "../results/figs_scarf/coverage/"
outfolder = Path(outfolder)
outfolder.mkdir(parents=True, exist_ok=True)

In [11]:
names = [
    "SCARF", 
    "SCARF-F",
    "SCARF-R",
    "Autoregressive",
    "CFM-ID",
    "Random",
    "Frequency"
]
sort_order = {"CFM-ID": 3, "Frequency":2, "Random": 1, "Autoregressive": 3.05, "SCARF": 4, "SCARF-R": 3.1, "SCARF-F": 3.5}

dataset_to_res = {}
for dataset_name in dataset_names:
    
    results_files = [
        f"../results/scarf_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
        f"../results/scarf_{dataset_name}_ablate/forward/inten_thresh_sweep/summary.tsv",
        f"../results/scarf_{dataset_name}_ablate/reverse/inten_thresh_sweep/summary.tsv",
        f"../results/autoregr_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
        f"../results/cfm_id_{dataset_name}/inten_thresh_sweep/summary.tsv",
        f"../results/rand_baseline_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
        f"../results/freq_baseline_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
    ]
    cov_dfs = {i: pd.read_csv(j, sep="\t") for i, j in zip(names, results_files)}
    dataset_to_res[dataset_name] = cov_dfs


In [12]:
dataset_to_res["canopus_train_public"];

In [13]:
combined_df = []
# max_preds = [10, 20, 30, 40, 50, 100, 200, 300, 500, 1000]
max_preds = [10, 20, 30, 50, 100, 300, 1000]
for dataset_name in dataset_names:
    cov_dfs = dataset_to_res[dataset_name]
    for name, sub_df in cov_dfs.items():
        for _, row in sub_df.iterrows():
            num_nodes = row['nm_nodes']
            if num_nodes not in max_preds: continue
            coverage = row['avg_coverage']
            digitized_coverage = row['avg_digitized_coverage']
            sem_coverage = row['sem_coverage']
            avg_num_pred = row['avg_num_pred']
            new_entry = {
                "Coverage": coverage,
                "SEM Coverage": sem_coverage,
                "Method": name,
                "Coverage (disc.)": digitized_coverage,
                "Num pred.": avg_num_pred,
                "Nodes": num_nodes,
                "Dataset": dataset_name
            }
            combined_df.append(new_entry)

new_df = pd.DataFrame(combined_df)

In [14]:
# Round coverage
new_df["Coverage"] = new_df["Coverage"].round(3)
new_df['SEM Coverage'] = new_df['SEM Coverage'].round(3)

# Create a single column that just has $Coverage \pm SEM Coverage$ using list comprehension 
new_df['Coverage'] = [rf"${i} \pm {j}$" for i, j in zip(new_df['Coverage'], new_df['SEM Coverage'])]


In [15]:
for dataset_name, temp_df in new_df.groupby("Dataset"): 
    new_df_round = temp_df#.round(3)

    # Filter df round to only have rows where Coverage is in [10, 30, 300, 10000]
    new_df_round = new_df_round[new_df_round["Nodes"].isin([10, 30, 300, 1000])]
    
    round_df_pivot = new_df_round.pivot_table(index="Method", columns=["Nodes"], values=["Coverage"], aggfunc=lambda x: x)
    display(round_df_pivot)
    round_df_pivot.columns = [f"{int(i[1])}" for i in round_df_pivot.columns]
    round_df_pivot.index.name = None
    round_df_pivot.columns.name = "Coverage @"
    round_df_pivot  = round_df_pivot.sort_index(key=lambda x: [sort_order[i] for i in x])
    display(round_df_pivot)
    data_str = {"canopus_train_public": r"\gnpsData", "nist20": r"\nistData"}[dataset_name]

    tex_table = round_df_pivot.to_latex(
        na_rep="--", 
        label=f"tab:coverage_{dataset_name}", 
        caption=rf"Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the {data_str} dataset.",
        escape=False,
    )
    print(tex_table)
    with open(outfolder / f"tab_coverage_{dataset_name}.tex", "w") as f:
        f.write(tex_table)


Unnamed: 0_level_0,Coverage,Coverage,Coverage,Coverage
Nodes,10.0,30.0,300.0,1000.0
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Autoregressive,$0.073 \pm 0.003$,$0.084 \pm 0.003$,$0.099 \pm 0.004$,$0.106 \pm 0.004$
CFM-ID,$0.169 \pm 0.005$,$0.267 \pm 0.006$,,
Frequency,$0.092 \pm 0.004$,$0.152 \pm 0.005$,$0.476 \pm 0.009$,$0.695 \pm 0.009$
Random,$0.005 \pm 0.001$,$0.013 \pm 0.001$,$0.129 \pm 0.006$,$0.343 \pm 0.011$
SCARF,$0.181 \pm 0.005$,$0.333 \pm 0.007$,$0.745 \pm 0.008$,$0.891 \pm 0.006$
SCARF-F,$0.163 \pm 0.005$,$0.319 \pm 0.007$,$0.721 \pm 0.009$,$0.868 \pm 0.007$
SCARF-R,$0.165 \pm 0.005$,$0.289 \pm 0.007$,$0.691 \pm 0.009$,$0.866 \pm 0.007$


Coverage @,10,30,300,1000
Random,$0.005 \pm 0.001$,$0.013 \pm 0.001$,$0.129 \pm 0.006$,$0.343 \pm 0.011$
Frequency,$0.092 \pm 0.004$,$0.152 \pm 0.005$,$0.476 \pm 0.009$,$0.695 \pm 0.009$
CFM-ID,$0.169 \pm 0.005$,$0.267 \pm 0.006$,,
Autoregressive,$0.073 \pm 0.003$,$0.084 \pm 0.003$,$0.099 \pm 0.004$,$0.106 \pm 0.004$
SCARF-R,$0.165 \pm 0.005$,$0.289 \pm 0.007$,$0.691 \pm 0.009$,$0.866 \pm 0.007$
SCARF-F,$0.163 \pm 0.005$,$0.319 \pm 0.007$,$0.721 \pm 0.009$,$0.868 \pm 0.007$
SCARF,$0.181 \pm 0.005$,$0.333 \pm 0.007$,$0.745 \pm 0.008$,$0.891 \pm 0.006$


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \gnpsData dataset.}
\label{tab:coverage_canopus_train_public}
\begin{tabular}{lllll}
\toprule
Coverage @ &                 10 &                 30 &                300 &               1000 \\
\midrule
Random         &  $0.005 \pm 0.001$ &  $0.013 \pm 0.001$ &  $0.129 \pm 0.006$ &  $0.343 \pm 0.011$ \\
Frequency      &  $0.092 \pm 0.004$ &  $0.152 \pm 0.005$ &  $0.476 \pm 0.009$ &  $0.695 \pm 0.009$ \\
CFM-ID         &  $0.169 \pm 0.005$ &  $0.267 \pm 0.006$ &                 -- &                 -- \\
Autoregressive &  $0.073 \pm 0.003$ &  $0.084 \pm 0.003$ &  $0.099 \pm 0.004$ &  $0.106 \pm 0.004$ \\
SCARF-R        &  $0.165 \pm 0.005$ &  $0.289 \pm 0.007$ &  $0.691 \pm 0.009$ &  $0.866 \pm 0.007$ \\
SCARF-F        &  $0.163 \pm 0.005$ &  $0.319 \pm 0.007$ &  $0.721 \pm 0.009$ &  $0.868 \pm 0.007$ \\
SCARF          &  $0.181 \pm 0.005$ &  $0.333 \pm 0.0

  tex_table = round_df_pivot.to_latex(


Unnamed: 0_level_0,Coverage,Coverage,Coverage,Coverage
Nodes,10.0,30.0,300.0,1000.0
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Autoregressive,$0.214 \pm 0.003$,$0.276 \pm 0.003$,$0.324 \pm 0.004$,$0.33 \pm 0.004$
CFM-ID,$0.193 \pm 0.003$,$0.281 \pm 0.004$,,
Frequency,$0.164 \pm 0.003$,$0.268 \pm 0.003$,$0.659 \pm 0.004$,$0.831 \pm 0.004$
Random,$0.008 \pm 0.0$,$0.024 \pm 0.001$,$0.232 \pm 0.004$,$0.533 \pm 0.006$
SCARF,$0.316 \pm 0.003$,$0.559 \pm 0.004$,$0.911 \pm 0.003$,$0.97 \pm 0.002$
SCARF-F,$0.263 \pm 0.003$,$0.491 \pm 0.004$,$0.859 \pm 0.004$,$0.943 \pm 0.002$
SCARF-R,$0.252 \pm 0.003$,$0.431 \pm 0.004$,$0.843 \pm 0.004$,$0.942 \pm 0.003$


Coverage @,10,30,300,1000
Random,$0.008 \pm 0.0$,$0.024 \pm 0.001$,$0.232 \pm 0.004$,$0.533 \pm 0.006$
Frequency,$0.164 \pm 0.003$,$0.268 \pm 0.003$,$0.659 \pm 0.004$,$0.831 \pm 0.004$
CFM-ID,$0.193 \pm 0.003$,$0.281 \pm 0.004$,,
Autoregressive,$0.214 \pm 0.003$,$0.276 \pm 0.003$,$0.324 \pm 0.004$,$0.33 \pm 0.004$
SCARF-R,$0.252 \pm 0.003$,$0.431 \pm 0.004$,$0.843 \pm 0.004$,$0.942 \pm 0.003$
SCARF-F,$0.263 \pm 0.003$,$0.491 \pm 0.004$,$0.859 \pm 0.004$,$0.943 \pm 0.002$
SCARF,$0.316 \pm 0.003$,$0.559 \pm 0.004$,$0.911 \pm 0.003$,$0.97 \pm 0.002$


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \nistData dataset.}
\label{tab:coverage_nist20}
\begin{tabular}{lllll}
\toprule
Coverage @ &                 10 &                 30 &                300 &               1000 \\
\midrule
Random         &    $0.008 \pm 0.0$ &  $0.024 \pm 0.001$ &  $0.232 \pm 0.004$ &  $0.533 \pm 0.006$ \\
Frequency      &  $0.164 \pm 0.003$ &  $0.268 \pm 0.003$ &  $0.659 \pm 0.004$ &  $0.831 \pm 0.004$ \\
CFM-ID         &  $0.193 \pm 0.003$ &  $0.281 \pm 0.004$ &                 -- &                 -- \\
Autoregressive &  $0.214 \pm 0.003$ &  $0.276 \pm 0.003$ &  $0.324 \pm 0.004$ &   $0.33 \pm 0.004$ \\
SCARF-R        &  $0.252 \pm 0.003$ &  $0.431 \pm 0.004$ &  $0.843 \pm 0.004$ &  $0.942 \pm 0.003$ \\
SCARF-F        &  $0.263 \pm 0.003$ &  $0.491 \pm 0.004$ &  $0.859 \pm 0.004$ &  $0.943 \pm 0.002$ \\
SCARF          &  $0.316 \pm 0.003$ &  $0.559 \pm 0.004$ &  $0.911 

  tex_table = round_df_pivot.to_latex(


NIST Output:

```
\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \nistData dataset.}
\label{tbl:coverage}
\begin{tabular}{lrrrrrrr}
\toprule
Coverage @ &     10 &     20 &     30 &     50 &    100 &    300 &   1000 \\
\midrule
Random    &  0.008 &  0.017 &  0.024 &  0.042 &  0.085 &  0.232 &  0.533 \\
Frequency &  0.164 &  0.224 &  0.268 &  0.336 &  0.462 &  0.659 &  0.831 \\
CFM-ID    &  0.198 &  0.254 &  0.281 &  0.302 &  0.305 &     -- &     -- \\
SCARF-R   &  0.252 &  0.356 &  0.431 &  0.536 &  0.675 &  0.843 &  0.942 \\
SCARF-F   &  0.263 &  0.404 &  0.491 &  0.598 &  0.719 &  0.859 &  0.943 \\
SCARF     &  0.316 &  0.465 &  0.559 &  0.674 &  0.796 &  0.911 &  0.970 \\
\bottomrule
\end{tabular}
\end{table}
```


Canopus output: 
```
\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \nistData dataset.}
\label{tab:coverage}
\begin{tabular}{lrrrrrrr}
\toprule
Coverage @ &     10 &     20 &     30 &     50 &    100 &    300 &   1000 \\
\midrule
Random    &  0.003 &  0.008 &  0.015 &  0.021 &  0.046 &  0.130 &  0.337 \\
Frequency &  0.092 &  0.124 &  0.152 &  0.202 &  0.294 &  0.476 &  0.695 \\
CFM-ID    &  0.169 &  0.229 &  0.267 &  0.298 &  0.303 &     -- &     -- \\
SCARF-R   &  0.165 &  0.234 &  0.289 &  0.371 &  0.498 &  0.691 &  0.866 \\
SCARF-F   &  0.163 &  0.251 &  0.319 &  0.409 &  0.538 &  0.721 &  0.868 \\
SCARF     &  0.181 &  0.270 &  0.333 &  0.420 &  0.549 &  0.745 &  0.891 \\
\bottomrule
\end{tabular}
\end{table}
```