In [1]:
import json
import os

In [2]:
def print_Logic_LM_table(logic_lm_results, dataset_names, backup='CoT'):
    """Print one Markdown table per dataset with the Logic-LM metrics of every model.

    Each table has one column per key of ``logic_lm_results`` and one row for
    each of ``Overall_Accuracy``, ``Executable_Rate`` and
    ``Executable_Accuracy``. Values are multiplied by 100 and shown with two
    decimals. A cell whose value cannot be looked up or formatted is rendered
    as ``---``.

    Args:
        logic_lm_results: Nested mapping indexed as
            ``[model_name][dataset][backup][metric]``.
        dataset_names: Iterable of dataset names; one table is printed for each.
        backup: Key selecting which backup entry of each dataset is reported.

    Returns:
        None. The tables are written to standard output, each followed by a
        blank line.
    """
    model_names = list(logic_lm_results.keys())
    # One separator cell for the metric-name column plus one per model, so the
    # table stays well-formed for any number of models (not just four).
    separator = "|" + "---|" * (len(model_names) + 1)

    for dataset in dataset_names:

        table_header = f"|{dataset}|"
        table_header += "|".join(model_names)
        table_header += f"|\n {separator} \n"

        table_rows = []
        for metric in ['Overall_Accuracy', 'Executable_Rate', 'Executable_Accuracy']:
            row = f"|{metric}|"
            for model_name in model_names:
                try:
                    row += f"{100 * logic_lm_results[model_name][dataset][backup][metric]:.2f}"
                except (LookupError, TypeError, ValueError):
                    # Missing or non-numeric entry: show a placeholder instead of
                    # aborting the table. Unrelated exceptions (e.g.
                    # KeyboardInterrupt) are no longer swallowed.
                    row += "---"
                row += "|"
            table_rows.append(row)

        table_string = table_header + "\n".join(table_rows)
        print(table_string)
        print()


def print_best_vs_best_table(baseline_results, logic_lm_results, dataset_names):
    """Print, per dataset, the best baseline scores next to the best Logic-LM score.

    For every dataset two lines are written to standard output:

    1. the best baseline 'Direct' score, the best baseline 'CoT' score and the
       best Logic-LM 'Overall_Accuracy' (each multiplied by 100, two decimals),
       joined with " | ";
    2. the labels of the winners, in the same order. The Logic-LM label is the
       model name followed by the backup key that produced the score.

    Models are taken from the keys of ``baseline_results``; each of them is
    also looked up in ``logic_lm_results``. A later candidate replaces the
    current winner only when its score is strictly greater, so the first one
    seen wins ties.

    Args:
        baseline_results: Mapping indexed as
            ``[model_name][dataset]['Direct' | 'CoT']['Average_EM_score']``.
        logic_lm_results: Mapping indexed as
            ``[model_name][dataset][backup]['Overall_Accuracy']`` for the
            backups 'random', 'Direct' and 'CoT'.
        dataset_names: Iterable of dataset names to report.

    Returns:
        None.
    """
    for dataset in dataset_names:
        # Running winners; every score starts at -1 with an empty label.
        top_logic_acc, top_logic_label = -1, ""
        top_direct_acc, top_direct_model = -1, ""
        top_cot_acc, top_cot_model = -1, ""

        for model_name in baseline_results.keys():
            # Logic-LM: consider every backup strategy of this model.
            logic_entry = logic_lm_results[model_name][dataset]
            for backup in ("random", "Direct", "CoT"):
                candidate = logic_entry[backup]['Overall_Accuracy']
                if candidate > top_logic_acc:
                    top_logic_acc = candidate
                    top_logic_label = f"{model_name}   {backup}"

            # Baselines: Direct and CoT are tracked independently.
            baseline_entry = baseline_results[model_name][dataset]
            direct_score = baseline_entry['Direct']['Average_EM_score']
            cot_score = baseline_entry['CoT']['Average_EM_score']
            if direct_score > top_direct_acc:
                top_direct_acc, top_direct_model = direct_score, model_name
            if cot_score > top_cot_acc:
                top_cot_acc, top_cot_model = cot_score, model_name

        # Build both lines before printing anything for this dataset.
        score_cells = [f"{100*value:.2f}" for value in (top_direct_acc, top_cot_acc, top_logic_acc)]
        scores_line = " | ".join(score_cells)
        winners_line = " | ".join([top_direct_model, top_cot_model, top_logic_label])

        print(scores_line)
        print(winners_line)

In [None]:
# Load the baseline scores and the Logic-LM scores from their JSON reports.
# The files are decoded as UTF-8 explicitly so the result does not depend on
# the platform's default text encoding.
with open("./baselines/evaluation/evaluation_baselines.json", "r", encoding="utf-8") as f:
    baseline_results = json.load(f)

with open("./outputs/evaluation/evaluation_logic_programs_gemini.json", "r", encoding="utf-8") as f:
    logic_lm_results = json.load(f)


# Datasets to report on, in the order they are printed.
dataset_names = ("ProntoQA", "ProofWriter", "FOLIO", "LogicalDeduction", "AR-LSAT")


In [None]:
# Print the per-dataset Logic-LM tables for the 'CoT' backup entries.
print_Logic_LM_table(logic_lm_results, dataset_names, backup='CoT')

In [None]:
# Print the best baseline scores next to the best Logic-LM score for each dataset.
print_best_vs_best_table(
    baseline_results=baseline_results,
    logic_lm_results=logic_lm_results,
    dataset_names=dataset_names,
)

In [None]:
def print_all_gemini_results(baseline_results, logic_lm_results, dataset_names, backup=None):
    """Print, per dataset, the best baseline scores and the best Logic-LM accuracy.

    For every dataset two lines are written to standard output: the best
    baseline 'Direct' score, the best baseline 'CoT' score and the best
    Logic-LM 'Overall_Accuracy' (each multiplied by 100, two decimals, joined
    with " | "), followed by the names of the models that achieved them.

    Models are taken from the keys of ``baseline_results``. A candidate
    replaces the current winner only when its score is strictly greater, so
    the first model seen wins ties. Scores start at -1 with an empty model
    name, which is what gets printed when there are no models.

    Args:
        baseline_results: Mapping indexed as
            ``[model_name][dataset]['Direct' | 'CoT']['Average_EM_score']``.
        logic_lm_results: Mapping indexed as
            ``[model_name][dataset]['Overall_Accuracy']`` when ``backup`` is
            None, or ``[model_name][dataset][backup]['Overall_Accuracy']``
            when ``backup`` is given.
        dataset_names: Iterable of dataset names to report.
        backup: Optional key of the backup entry to read the accuracy from.
            NOTE(review): the other table helpers in this notebook index the
            Logic-LM results by backup strategy before the metric; if this
            function is fed the same data, pass e.g. ``backup='CoT'`` — confirm
            against the JSON layout.

    Returns:
        None.
    """
    for dataset in dataset_names:
        # Reset the running winners for each dataset. Previously these names
        # were read before ever being assigned, which raised UnboundLocalError.
        best_acc, best_model = -1, ""
        base_direct, base_direct_model = -1, ""
        base_cot, base_cot_model = -1, ""

        for model_name in baseline_results.keys():
            logic_entry = logic_lm_results[model_name][dataset]
            if backup is not None:
                logic_entry = logic_entry[backup]
            acc = logic_entry['Overall_Accuracy']
            direct_acc = baseline_results[model_name][dataset]['Direct']['Average_EM_score']
            cot_acc = baseline_results[model_name][dataset]['CoT']['Average_EM_score']
            if acc > best_acc:
                best_acc = acc
                best_model = model_name
            if direct_acc > base_direct:
                base_direct = direct_acc
                base_direct_model = model_name
            if cot_acc > base_cot:
                base_cot = cot_acc
                base_cot_model = model_name

        row_acc = " | ".join([f"{100*base_direct:.2f}", f"{100*base_cot:.2f}", f"{100*best_acc:.2f}"])
        row_model = " | ".join([base_direct_model, base_cot_model, best_model])

        print(row_acc)
        print(row_model)