In [1]:
import json
import os

In [2]:
def print_Logic_LM_table(logic_lm_results, dataset_names, backup='CoT'):
    """Print one Markdown table of Logic-LM metrics per dataset.

    Each table has one column per model (the keys of ``logic_lm_results``)
    and one row per metric: ``Overall_Accuracy``, ``Executable_Rate`` and
    ``Executable_Accuracy``. Values are printed as percentages with two
    decimals; a cell whose value is missing or not numeric is printed as
    ``---``.

    Args:
        logic_lm_results: Nested mapping indexed as
            ``logic_lm_results[model][dataset][backup][metric]``.
        dataset_names: Iterable of dataset names; one table is printed for
            each, followed by a blank line.
        backup: Key of the backup strategy whose metrics are reported.

    Returns:
        None. The tables are written to standard output.
    """
    model_names = list(logic_lm_results.keys())
    metrics = ('Overall_Accuracy', 'Executable_Rate', 'Executable_Accuracy')

    # One separator cell for the label column plus one per model, so the
    # table stays well-formed for any number of models. The surrounding
    # spaces are kept so the output for four models is unchanged.
    separator = " |" + "---|" * (len(model_names) + 1) + " "

    for dataset in dataset_names:
        header = f"|{dataset}|" + "|".join(model_names) + "|"

        table_rows = []
        for metric in metrics:
            cells = []
            for model_name in model_names:
                try:
                    value = logic_lm_results[model_name][dataset][backup][metric]
                    cells.append(f"{100 * value:.2f}")
                except (LookupError, TypeError, ValueError):
                    # Missing model/dataset/backup/metric, or a value that
                    # cannot be formatted as a number: show a placeholder.
                    cells.append("---")
            table_rows.append(f"|{metric}|" + "".join(cell + "|" for cell in cells))

        print("\n".join([header, separator] + table_rows))
        print()


def print_best_vs_best_table(baseline_results, logic_lm_results, dataset_names):
    """Print, per dataset, the best baseline scores next to the best Logic-LM score.

    For every dataset two lines are printed:

    1. ``<best Direct> | <best CoT> | <best Logic-LM>`` as percentages with
       two decimals, or ``---`` where no score is available.
    2. The model names that achieved those scores. The Logic-LM entry is
       ``"<model>   <backup>"``; an entry is empty when no score was found.

    Only models that appear as keys of ``baseline_results`` are considered,
    for the baselines and for Logic-LM alike. Ties keep the first model (and
    backup strategy) encountered.

    Args:
        baseline_results: Mapping indexed as
            ``baseline_results[model][dataset][mode]['Average_EM_score']``
            with ``mode`` in ``'Direct'`` / ``'CoT'``.
        logic_lm_results: Mapping indexed as
            ``logic_lm_results[model][dataset][backup]['Overall_Accuracy']``
            with ``backup`` in ``'random'`` / ``'Direct'`` / ``'CoT'``.
        dataset_names: Iterable of dataset names to report.

    Returns:
        None. The rows are written to standard output.
    """

    def _score(results, *keys):
        """Follow ``keys`` into ``results``; return a number or None if unavailable."""
        node = results
        try:
            for key in keys:
                node = node[key]
        except (LookupError, TypeError):
            return None
        # ``node == node`` rejects NaN, which would otherwise win every
        # comparison by default and never be replaced.
        if isinstance(node, (int, float)) and node == node:
            return node
        return None

    def _fmt(score):
        """Format a fractional score as a percentage, or a placeholder."""
        return "---" if score is None else f"{100 * score:.2f}"

    for dataset in dataset_names:
        best_logic, best_logic_label = None, ""
        best_direct, best_direct_model = None, ""
        best_cot, best_cot_model = None, ""

        for model_name in baseline_results.keys():
            # Missing model/dataset/backup entries are skipped rather than
            # raising, since not every model was evaluated everywhere.
            for backup in ("random", "Direct", "CoT"):
                acc = _score(logic_lm_results, model_name, dataset, backup, 'Overall_Accuracy')
                if acc is not None and (best_logic is None or acc > best_logic):
                    best_logic = acc
                    best_logic_label = f"{model_name}   {backup}"

            direct_acc = _score(baseline_results, model_name, dataset, 'Direct', 'Average_EM_score')
            if direct_acc is not None and (best_direct is None or direct_acc > best_direct):
                best_direct = direct_acc
                best_direct_model = model_name

            cot_acc = _score(baseline_results, model_name, dataset, 'CoT', 'Average_EM_score')
            if cot_acc is not None and (best_cot is None or cot_acc > best_cot):
                best_cot = cot_acc
                best_cot_model = model_name

        row_acc = " | ".join([_fmt(best_direct), _fmt(best_cot), _fmt(best_logic)])
        row_model = " | ".join([best_direct_model, best_cot_model, best_logic_label])

        print(row_acc)
        print(row_model)

In [3]:
# Load the evaluation summaries produced earlier in the pipeline.
# JSON text is UTF-8, so the encoding is given explicitly instead of relying
# on the platform's locale default.
with open("./baselines/evaluation/evaluation_baselines.json", "r", encoding="utf-8") as f:
    baseline_results = json.load(f)

with open("./outputs/evaluation/evaluation_logic_programs_gemini.json", "r", encoding="utf-8") as f:
    logic_lm_results = json.load(f)

# Datasets reported on by the table helpers in this notebook.
dataset_names = ("ProntoQA", "ProofWriter", "FOLIO", "LogicalDeduction", "AR-LSAT")


In [6]:
# Inspect the baseline entry of a single model to see which datasets and
# prompting modes ('Direct' / 'CoT') have an 'Average_EM_score'.
baseline_results['gemini-1.0-pro-vision-001']

{'ProntoQA': {'Direct': {'Average_EM_score': 0.0},
  'CoT': {'Average_EM_score': 0.0}},
 'ProofWriter': {'Direct': {'Average_EM_score': 0.0}},
 'FOLIO': {'Direct': {'Average_EM_score': 0.0}},
 'LogicalDeduction': {'Direct': {'Average_EM_score': 0.0}},
 'AR-LSAT': {'Direct': {'Average_EM_score': 0.0}}}

In [11]:
# Inspect the Logic-LM entry of a single model: per dataset and backup
# strategy it holds Overall_Accuracy, Executable_Rate and Executable_Accuracy.
logic_lm_results["gemini-1.5-pro-preview-0514"]

{'ProntoQA': {'random': {'Overall_Accuracy': 0.464,
   'Executable_Rate': 0.0,
   'Executable_Accuracy': 0},
  'Direct': {'Overall_Accuracy': 0,
   'Executable_Rate': 0.0,
   'Executable_Accuracy': 0},
  'CoT': {'Overall_Accuracy': 0,
   'Executable_Rate': 0.0,
   'Executable_Accuracy': 0}},
 'ProofWriter': {'random': {'Overall_Accuracy': 0.345,
   'Executable_Rate': 0.0,
   'Executable_Accuracy': 0},
  'Direct': {'Overall_Accuracy': 0,
   'Executable_Rate': 0.0,
   'Executable_Accuracy': 0},
  'CoT': {'Overall_Accuracy': 0,
   'Executable_Rate': 0.0,
   'Executable_Accuracy': 0}},
 'FOLIO': {'random': {'Overall_Accuracy': 0.7114427860696517,
   'Executable_Rate': 0.7711442786069652,
   'Executable_Accuracy': 0.8516129032258064},
  'Direct': {'Overall_Accuracy': 0.6666666666666666,
   'Executable_Rate': 0.7810945273631841,
   'Executable_Accuracy': 0.8535031847133758},
  'CoT': {'Overall_Accuracy': 0.6666666666666666,
   'Executable_Rate': 0.7810945273631841,
   'Executable_Accuracy': 

In [9]:
# Print one Markdown table per dataset using the default backup strategy ('CoT').
print_Logic_LM_table(logic_lm_results, dataset_names)

|ProntoQA|gemini-1.0-pro-vision-001|gemini-1.5-pro-preview-0409|gemini-1.5-pro-preview-0514|gemini-1.5-flash-preview-0514|
 |---|---|---|---|---| 
|Overall_Accuracy|77.40|93.80|0.00|0.00|
|Executable_Rate|100.00|96.40|0.00|0.00|
|Executable_Accuracy|77.40|97.30|0.00|0.00|

|ProofWriter|gemini-1.0-pro-vision-001|gemini-1.5-pro-preview-0409|gemini-1.5-pro-preview-0514|gemini-1.5-flash-preview-0514|
 |---|---|---|---|---| 
|Overall_Accuracy|---|71.96|0.00|2.50|
|Executable_Rate|---|89.36|0.00|4.67|
|Executable_Accuracy|---|80.53|0.00|53.57|

|FOLIO|gemini-1.0-pro-vision-001|gemini-1.5-pro-preview-0409|gemini-1.5-pro-preview-0514|gemini-1.5-flash-preview-0514|
 |---|---|---|---|---| 
|Overall_Accuracy|---|48.51|66.67|4.41|
|Executable_Rate|---|58.91|78.11|4.41|
|Executable_Accuracy|---|82.35|85.35|100.00|

|LogicalDeduction|gemini-1.0-pro-vision-001|gemini-1.5-pro-preview-0409|gemini-1.5-pro-preview-0514|gemini-1.5-flash-preview-0514|
 |---|---|---|---|---| 
|Overall_Accuracy|---|52.33|84.

In [None]:
# Compare the best baseline scores with the best Logic-LM score per dataset.
# NOTE(review): the baseline output recorded in this notebook shows a model
# without a 'CoT' entry for several datasets — confirm the helper tolerates
# missing entries before relying on this cell.
print_best_vs_best_table(baseline_results, logic_lm_results, dataset_names)

In [None]:
def print_all_gemini_results(baseline_results, logic_lm_results, dataset_names, backup='CoT'):
    """Print, per dataset, the best Direct, CoT and Logic-LM scores and their models.

    For every dataset two lines are printed:

    1. ``<best Direct> | <best CoT> | <best Logic-LM>`` as percentages with
       two decimals, or ``---`` where no score is available.
    2. The names of the models that achieved those scores (empty when no
       score was found).

    Only models that appear as keys of ``baseline_results`` are considered.
    Ties keep the first model encountered.

    Args:
        baseline_results: Mapping indexed as
            ``baseline_results[model][dataset][mode]['Average_EM_score']``
            with ``mode`` in ``'Direct'`` / ``'CoT'``.
        logic_lm_results: Mapping indexed as
            ``logic_lm_results[model][dataset][backup]['Overall_Accuracy']``.
            A flat ``logic_lm_results[model][dataset]['Overall_Accuracy']``
            layout is accepted as a fallback.
        dataset_names: Iterable of dataset names to report.
        backup: Backup strategy whose Logic-LM accuracy is compared.

    Returns:
        None. The rows are written to standard output.
    """

    def _score(results, *keys):
        """Follow ``keys`` into ``results``; return a number or None if unavailable."""
        node = results
        try:
            for key in keys:
                node = node[key]
        except (LookupError, TypeError):
            return None
        # ``node == node`` rejects NaN, which can never be beaten by ``>``.
        if isinstance(node, (int, float)) and node == node:
            return node
        return None

    def _fmt(score):
        """Format a fractional score as a percentage, or a placeholder."""
        return "---" if score is None else f"{100 * score:.2f}"

    for dataset in dataset_names:
        # Trackers are reset for every dataset; they were previously read
        # before being assigned, which raised UnboundLocalError.
        best_logic, best_logic_model = None, ""
        best_direct, best_direct_model = None, ""
        best_cot, best_cot_model = None, ""

        for model_name in baseline_results.keys():
            acc = _score(logic_lm_results, model_name, dataset, backup, 'Overall_Accuracy')
            if acc is None:
                # Fallback to the lookup the function originally used.
                acc = _score(logic_lm_results, model_name, dataset, 'Overall_Accuracy')
            direct_acc = _score(baseline_results, model_name, dataset, 'Direct', 'Average_EM_score')
            cot_acc = _score(baseline_results, model_name, dataset, 'CoT', 'Average_EM_score')

            if acc is not None and (best_logic is None or acc > best_logic):
                best_logic = acc
                best_logic_model = model_name
            if direct_acc is not None and (best_direct is None or direct_acc > best_direct):
                best_direct = direct_acc
                best_direct_model = model_name
            if cot_acc is not None and (best_cot is None or cot_acc > best_cot):
                best_cot = cot_acc
                best_cot_model = model_name

        row_acc = " | ".join([_fmt(best_direct), _fmt(best_cot), _fmt(best_logic)])
        row_model = " | ".join([best_direct_model, best_cot_model, best_logic_model])

        print(row_acc)
        print(row_model)