In [1]:
import json
import os

In [2]:
# Baseline scores; later cells index this as
# baseline_results[model_name][dataset][mode]['Average_EM_score'].
with open("./baselines/evaluation/evaluation_baselines.json", "r") as f:
    baseline_results = json.loads(f.read())

# Logic-LM scores; later cells index this as
# logic_lm_results[model_name][dataset][backup][metric].
with open("./outputs/evaluation/evaluation_logic_programs_gemini.json", "r") as f:
    logic_lm_results = json.loads(f.read())


# Datasets reported on, in the order the tables below are printed.
dataset_names = (
    "ProntoQA",
    "ProofWriter",
    "FOLIO",
    "LogicalDeduction",
    "AR-LSAT",
)


In [None]:
def print_Logic_LM_table(logic_lm_results, dataset_names, backup='CoT'):
    """Print one markdown table per dataset with Logic-LM metrics for every model.

    Each table has a header row (dataset name followed by the model names),
    a separator row, and one row per metric ('Overall_Accuracy',
    'Executable_Rate', 'Executable_Accuracy'). Values are multiplied by 100
    and shown with two decimals; a cell whose value cannot be looked up or
    formatted is rendered as '---'.

    Args:
        logic_lm_results: Nested mapping indexed as
            ``[model_name][dataset][backup][metric]``.
        dataset_names: Iterable of dataset keys; one table is printed for each.
        backup: Key of the backup strategy to report (defaults to 'CoT').

    Returns:
        None. Everything is written to stdout. A line
        ``<model_name> <dataset> <backup>`` is printed once for every model
        whose 'Overall_Accuracy' is exactly 0, so suspicious runs stand out.
    """
    model_names = list(logic_lm_results.keys())
    metrics = ['Overall_Accuracy', 'Executable_Rate', 'Executable_Accuracy']

    for dataset in dataset_names:

        table_header = f"|{dataset}|"
        table_header += "|".join(model_names)
        # One separator cell for the label column plus one per model, so the
        # table stays well-formed for any number of models.
        table_header += "|\n |" + "---|" * (len(model_names) + 1) + " \n"

        table_rows = []
        for metric in metrics:
            row = f"|{metric}|"
            for model_name in model_names:
                try:
                    value = logic_lm_results[model_name][dataset][backup][metric]
                    cell = f"{100 * value:.2f}"
                except (KeyError, TypeError, ValueError):
                    # Missing or non-numeric entry: show a placeholder instead
                    # of aborting the whole table.
                    value = None
                    cell = "---"
                row += cell + "|"
                # The diagnostic is kept outside the try block so that it can
                # never corrupt an already formatted cell, and is tied to the
                # accuracy row so it is emitted once rather than per metric.
                if metric == 'Overall_Accuracy' and value == 0:
                    print(model_name, dataset, backup)
            table_rows.append(row)

        table_string = table_header + "\n".join(table_rows)
        print(table_string)
        print()

# Print the per-dataset Logic-LM tables using the default backup ('CoT').
print_Logic_LM_table(logic_lm_results, dataset_names)

In [4]:
def _best_entry(results, dataset, settings, metric):
    """Return (score, model_name, setting) with the highest `metric` for `dataset`.

    Models are scanned in mapping order and settings in the given order; on a
    tie the first entry encountered wins. Entries that are missing from
    `results` are skipped. Returns ``(-1, None, None)`` when nothing is found.
    """
    best_score, best_model, best_setting = -1, None, None
    for model_name, per_dataset in results.items():
        for setting in settings:
            try:
                score = per_dataset[dataset][setting][metric]
            except (KeyError, TypeError):
                # This model has no result for this dataset/setting.
                continue
            if score > best_score:
                best_score, best_model, best_setting = score, model_name, setting
    return best_score, best_model, best_setting


def print_best_vs_best_table(baseline_results, logic_lm_results, dataset_names):
    """Print, per dataset, the best baseline scores next to the best Logic-LM score.

    For every dataset two lines are printed:
      1. ``<best Direct> | <best CoT> | <best Logic-LM>`` as percentages with
         two decimals;
      2. the model that achieved each of them; the Logic-LM entry is followed
         by three spaces and the backup strategy that produced it.

    The baseline and Logic-LM mappings are searched independently, so they do
    not need to contain the same set of models.

    Args:
        baseline_results: Nested mapping indexed as
            ``[model_name][dataset]['Direct' | 'CoT']['Average_EM_score']``.
        logic_lm_results: Nested mapping indexed as
            ``[model_name][dataset]['random' | 'Direct' | 'CoT']['Overall_Accuracy']``.
        dataset_names: Iterable of dataset keys to report on.

    Returns:
        None. Output goes to stdout. A column with no available result keeps
        the sentinel score -1 (printed as -100.00) and an empty model name.
    """
    for dataset in dataset_names:
        base_direct, base_direct_model, _ = _best_entry(
            baseline_results, dataset, ("Direct",), 'Average_EM_score'
        )
        base_cot, base_cot_model, _ = _best_entry(
            baseline_results, dataset, ("CoT",), 'Average_EM_score'
        )
        best_acc, best_model, best_backup = _best_entry(
            logic_lm_results, dataset, ("random", "Direct", "CoT"), 'Overall_Accuracy'
        )

        best_label = "" if best_model is None else f"{best_model}   {best_backup}"

        row_acc = " | ".join(
            [f"{100*base_direct:.2f}", f"{100*base_cot:.2f}", f"{100*best_acc:.2f}"]
        )
        row_model = " | ".join(
            [base_direct_model or "", base_cot_model or "", best_label]
        )

        print(row_acc)
        print(row_model)
        
# For each dataset, print the best Direct / CoT baseline scores and the best
# Logic-LM score, followed by the models that achieved them.
print_best_vs_best_table(baseline_results, logic_lm_results, dataset_names)

79.96 | 99.34 | 97.40
gemini-1.5-pro-preview-0514 | gemini-1.5-pro-preview-0514 | gemini-1.5-pro-preview-0409   CoT
56.67 | 74.11 | 79.73
gemini-1.5-pro-preview-0514 | gemini-1.5-pro-preview-0409 | gemini-1.5-pro-preview-0409   CoT
66.67 | 77.61 | 82.67
gemini-1.5-flash-preview-0514 | gemini-1.5-pro-preview-0409 | gemini-1.5-pro-preview-0409   CoT
59.00 | 69.00 | 84.67
gemini-1.5-pro-preview-0514 | gemini-1.5-pro-preview-0409 | gemini-1.5-pro-preview-0514   random
28.14 | 25.54 | 38.53
gemini-1.5-pro-preview-0514 | gemini-1.0-pro-vision-001 | gemini-1.5-pro-preview-0514   Direct


|dataset | info | Direct | CoT| Logic-LM (backup) |
| --- | --- | --- | ---  | --- | 
|  ProntoQA | best acc |  79.96 | 99.34 | 97.40|
|  ProntoQA | best model | gemini-1.5-pro-preview-0514 | gemini-1.5-pro-preview-0514 | gemini-1.5-pro-preview-0409   (CoT)|
|  ProofWriter | best acc |  56.67 | 74.11 | 79.73|
|  ProofWriter | best model |gemini-1.5-pro-preview-0514 | gemini-1.5-pro-preview-0409 | gemini-1.5-pro-preview-0409   (CoT)|
|  FOLIO | best acc |  66.67 | 77.61 | 82.67|
|  FOLIO | best model |gemini-1.5-flash-preview-0514 | gemini-1.5-pro-preview-0409 | gemini-1.5-pro-preview-0409   (CoT)|
|  LogicalDeduction | best acc |  59.00 | 69.00 | 84.67|
|  LogicalDeduction | best model |gemini-1.5-pro-preview-0514 | gemini-1.5-pro-preview-0409 | gemini-1.5-pro-preview-0514   (random)|
|  AR-LSAT | best acc |  28.14 | 25.54 | 38.53|
|  AR-LSAT | best model |gemini-1.5-pro-preview-0514 | gemini-1.0-pro-vision-001 | gemini-1.5-pro-preview-0514   (Direct)|

In [32]:
# Dump the raw numbers behind the results table: for every dataset, one
# " | "-separated row per baseline prompting mode, then one row per Logic-LM
# backup strategy, with one column per model and a blank line between datasets.
for dataset in dataset_names:
    for mode in ("Direct", "CoT"):
        baseline_cells = [
            f"{100 * per_model[dataset][mode]['Average_EM_score']:.2f}"
            for per_model in baseline_results.values()
        ]
        print(" | ".join(baseline_cells) + " |")

    for backup in ("random", "Direct", "CoT"):
        logic_lm_cells = [
            f"{100 * per_model[dataset][backup]['Overall_Accuracy']:.2f}"
            for per_model in logic_lm_results.values()
        ]
        print(" | ".join(logic_lm_cells) + " |")

    print()


59.60 | 47.40 | 79.96 | 63.80 |
84.47 | 98.51 | 99.34 | 92.55 |
77.40 | 95.60 | 46.40 | 46.94 |
77.40 | 96.20 | 80.00 | 62.50 |
77.40 | 97.40 | 95.00 | 94.39 |

34.83 | 15.17 | 56.67 | 53.83 |
56.67 | 74.11 | 59.83 | 66.84 |
61.27 | 74.66 | 34.50 | 32.17 |
55.43 | 76.01 | 56.67 | 53.67 |
69.28 | 79.73 | 64.17 | 67.17 |

59.80 | 39.71 | 40.20 | 66.67 |
63.78 | 77.61 | 48.51 | 67.34 |
53.00 | 62.38 | 71.14 | 36.27 |
65.50 | 72.77 | 81.59 | 68.14 |
67.50 | 82.67 | 81.59 | 68.63 |

45.67 | 53.33 | 59.00 | 54.67 |
57.67 | 69.00 | 60.20 | 55.33 |
61.33 | 60.33 | 84.67 | 56.33 |
70.00 | 71.33 | 84.67 | 59.67 |
72.00 | 75.00 | 84.67 | 61.00 |

20.35 | 23.81 | 28.14 | 27.95 |
25.54 | 22.94 | 19.91 | 20.35 |
22.17 | 19.05 | 31.60 | 32.47 |
20.87 | 30.30 | 38.53 | 38.10 |
26.09 | 24.68 | 31.60 | 34.63 |



| Dataset          | Prompting                                               | Accuracy (%) for Meta-Llama-3-8B-Instruct | Accuracy (%) for gemini-1.0-pro-vision-001 | Accuracy (%) for gemini-1.5-pro-preview-0409 | Accuracy (%) for gemini-1.5-pro-preview-0514 | Accuracy (%) for gemini-1.5-flash-preview-0514 |
| ---------------- | ------------------------------------------------------- | ----------------------------------------- | ------------------------------------------ | -------------------------------------------- | -------------------------------------------- | ----------------------------- |
| ProntoQA         | Direct                               | 43                                        |                                            |                                              |                                              |                               |
|                  | CoT                               | 76.6                                      |                                            |                                              |                                              |                               |
|                  | Logic-LM (random) | 55                                     |                                            |                                              |                                              |                               |
|                  | Logic-LM (Direct) | 42.46                                     |                                            |                                              |                                              |                               |
|                  | Logic-LM (CoT)    | 8                                         | 92.60                                      | **97.40**                                    | 93.00                                        | 92.60                         |
| ProofWriter      | Direct                               | 33                                        |                                            |                                              |                                              |                               |
|                  | CoT                               | 28.54                                     |                                            |                                              |                                              |                               |
|                  | Logic-LM (random)                       | 28.7                                      |                                            |                                              |                                              |                               |
|                  | Logic-LM (Direct) | 28.69                                     |                                            |                                              |                                              |                               |
|                  | Logic-LM (CoT)    | 28.695                                    | 74.12                                      | **78.04**                                    | 66.17                                        | 66.17                         |
| FOLIO            | Direct                             | 46.5                                      |                                            |                                              |                                              |                               |
|                  | CoT                               | 36                                        |                                            |                                              |                                              |                               |
|                  | Logic-LM (random)                       | 43                                        |                                            |                                              |                                              |                               |
|                  | Logic-LM (Direct) | 53                                        |                                            |                                              |                                              |                               |
|                  | Logic-LM (CoT)    | 44.285                                    | 63.50                                      | 75.25                                        | **80.60**                                    | 59.80                         |
| LogicalDeduction | Direct                               | 32.33                                     |                                            |                                              |                                              |                               |
|                  | CoT                                | 22                                        |                                            |                                              |                                              |                               |
|                  | Logic-LM (random)                       | 24.27                                     |                                            |                                              |                                              |                               |
|                  | Logic-LM (Direct) | 31                                        |                                            |                                              |                                              |                               |
|                  | Logic-LM (CoT)    | 20.38                                     | 64.67                                      | 64.67                                        | **84.67**                                    | 57.67                         |
| AR-LSAT          | Direct                              | 7.36                                      |                                            |                                              |                                              |                               |
|                  | CoT                               | 8.225                                     |                                            |                                              |                                              |                               |
|                  | Logic-LM (random)                       | 22                                        |                                            |                                              |                                              |                               |
|                  | Logic-LM (Direct) | 12                                        |                                            |                                              |                                              |                               |
|                  | Logic-LM (CoT)    | 6                                         | 24.35                                      | 23.81                                        | 34.20                                        | **35.50**                     |