In [14]:
import pandas as pd
import numpy as np

# Read in the CSV file
df = pd.read_csv("evaluation_results.csv")

# Rename columns
df.rename(columns={'Corrected Facts': 'LLM-generated Corrections', 'Removed Facts': 'LLM-generated Removals', 'Score Corrected': 'Correction Rate'}, inplace=True)



# Mapping for LLM names
llm_mapping = {
    'llama': 'Llama-8B-Instruct',
    'llama_70B': 'Llama-70B-Instruct',
    'openai': 'GPT-4o-mini',
    'llama_finetuned': 'Llama-8B-Instruct-\\textbf{Finetuning}'
}
df['LLM'] = df['LLM'].map(llm_mapping)

# Pivot the DataFrame
pivot_df = df.melt(id_vars=['Dataset', 'LLM'], var_name='Metric', value_name='Value')
pivot_df = pivot_df.pivot_table(index=['Dataset', 'Metric'], columns='LLM', values='Value')

# Generate LaTeX table from the pivoted DataFrame with bold max values
def generate_latex_table(pivot_df):
    llms = pivot_df.columns  # Get unique LLM names for columns
    latex_table = "\\begin{table}[h]\n" \
                  "\\centering\n" \
                  "\\small\n" \
                  "\\begin{tabular}{p{1.5cm} | p{4.5cm}| " + " | ".join(['p{1.5cm}' for _ in llms]) + "}\n" \
                  "\\toprule\n" \
                  "Test Set & Metric & " + " & ".join([f'{llm}' for llm in llms]) + " \\\\\n" \
                  "\\midrule\n" \
                  "\\addlinespace[10pt]\n"

    for dataset in pivot_df.index.levels[0]:  # Loop through datasets (fever and hover)
        dataset_name = 'FEVER' if dataset == 'fever' else 'HoVer'
        len_facts = int(pivot_df.loc[(dataset, "Facts")].values[0])
        print(len_facts)
        if dataset == 'fever':
            latex_table += f"\\multirow{{4}}{{*}}{{\\vspace{{-8pt}}\\parbox{{2cm}}{{{dataset_name}: \\\ {len_facts} \\\ samples}}}}\n"
        else:
            latex_table += "\\addlinespace[10pt]\n"
            latex_table += f"\\multirow{{4}}{{*}}{{\\vspace{{-10pt}}\\parbox{{2cm}}{{{dataset_name}: \\\ {len_facts} \\\ samples}}}}\n"
        
        for idx, metric in enumerate(['LLM-generated Removals', 'LLM-generated Corrections', 'True Corrected Facts', 'Correction Rate']):
            if idx == 0:
                latex_table += f"& {metric} & "
            else:
                latex_table += f"& {metric} & "

            # Determine the maximum value in the row for bolding (only for specific metrics)
            if metric in ['LLM-generated Corrections', 'Correction Rate']:
                max_value = pivot_df.loc[(dataset, metric)].max()

            metric_values = []
            for llm in llms:
                value = pivot_df.loc[(dataset, metric), llm]
                if pd.notna(value):
                    if metric in ['LLM-generated Removals', 'LLM-generated Corrections', 'True Corrected Facts']:
                        formatted_value = f"{value:.0f}"  # Integer format
                    else:
                        formatted_value = f"{value:.3f}"  # Float format for "Correction Rate"

                    # Apply bold formatting only if the value is the maximum and metric is not 'LLM-generated Removals'
                    if metric in ['LLM-generated Corrections', 'Correction Rate'] and np.isclose(value, max_value):
                        formatted_value = f"\\textbf{{{formatted_value}}}"
                    
                    metric_values.append(formatted_value)
                else:
                    metric_values.append('-')

            latex_table += " & ".join(metric_values) + " \\\\\n"
            #latex_table += "\\addlinespace[5pt]\n"

        latex_table += "\\addlinespace[10pt]\n"
        latex_table += "\\midrule\n"

    latex_table = latex_table.rstrip("\\midrule\n")  # Remove the last midrule
    latex_table += "\\bottomrule\n" \
                   "\\end{tabular}\n" \
                   "\\caption{Comparison of LLM-generated Corrections across LLMs for FEVER and HoVer test sets}\n" \
                   "\\label{tab:correction_comparison}\n" \
                   "\\end{table}"

    return latex_table

# Print the generated LaTeX table
latex_table = generate_latex_table(pivot_df)
print(latex_table)



1355
1543
\begin{table}[h]
\centering
\small
\begin{tabular}{p{1.5cm} | p{4.5cm}| p{1.5cm} | p{1.5cm} | p{1.5cm} | p{1.5cm}}
\toprule
Test Set & Metric & GPT-4o-mini & Llama-70B-Instruct & Llama-8B-Instruct & Llama-8B-Instruct-\textbf{Finetuning} \\
\midrule
\addlinespace[10pt]
\multirow{4}{*}{\vspace{-8pt}\parbox{2cm}{FEVER: \\ 1355 \\ samples}}
& LLM-generated Removals & 521 & 542 & 866 & 606 \\
& LLM-generated Corrections & \textbf{821} & 763 & 429 & 742 \\
& True Corrected Facts & 648 & 631 & 170 & 657 \\
& Correction Rate & 0.789 & 0.827 & 0.396 & \textbf{0.885} \\
\addlinespace[10pt]
\midrule
\addlinespace[10pt]
\multirow{4}{*}{\vspace{-10pt}\parbox{2cm}{HoVer: \\ 1543 \\ samples}}
& LLM-generated Removals & 621 & 604 & 1127 & 660 \\
& LLM-generated Corrections & \textbf{910} & 891 & 354 & 867 \\
& True Corrected Facts & 697 & 734 & 110 & 724 \\
& Correction Rate & 0.766 & 0.824 & 0.311 & \textbf{0.835} \\
\addlinespace[10pt]\bottomrule
\end{tabular}
\caption{Comparison of LLM-ge

In [3]:
pivot_df

Unnamed: 0_level_0,LLM,GPT-4o-mini,Llama-70B-Instruct,Llama-8B-Instruct,Llama-8B-Instruct-\textbf{Finetuning}
Dataset,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
fever,Correction Rate,0.789281,0.826999,0.39627,0.885445
fever,Facts,1355.0,1355.0,1355.0,1355.0
fever,LLM-generated Corrections,821.0,763.0,429.0,742.0
fever,LLM-generated Removals,521.0,542.0,866.0,606.0
fever,Score Removed,0.953935,0.948339,0.967667,0.945545
fever,Total Score,0.845018,0.845018,0.743911,0.907749
fever,True Corrected Facts,648.0,631.0,170.0,657.0
fever,True Removed Facts,497.0,514.0,838.0,573.0
fever,Unnamed: 0,2.0,0.0,3.0,1.0
hover,Correction Rate,0.765934,0.823793,0.310734,0.835063
