In [3]:
import json
from models.symbolic_solvers.clingo_solver import ClingoSolver

In [23]:
# Name of the model whose generated logic programs are evaluated. It is
# interpolated into both the input path
# (outputs/logic_programs/{problem}_data_{MODEL_NAME}.json) and the output
# path (outputs/evaluation/evaluation_{MODEL_NAME}.json).
# Alternative models kept for quick switching:
#MODEL_NAME = "gemini-1.5-pro-preview-0409"
#MODEL_NAME = "gemini-1.5-flash-preview-0514"
MODEL_NAME = "gpt-4"
# Problem sets to evaluate. A name containing "validity" is scored as a
# yes/no validity task by full_evaluation; every other name is scored as a
# fill-in task.
PROBLEMS = [
    "graph_validity",
    "graph_fill_in",
    "sudoku_validity",
    "sudoku_fill_in",
    "set_validity",
]
# Single-problem override for debugging:
#PROBLEMS = ["graph_validity"]

In [20]:
def _parse_asp(asp_raw: str) -> str:
    if "###ASP_START###" in asp_raw:
        return _parse_asp_start_end(asp_raw)

    possible_ends = ["\n\n#", "\n\n`", "\n`", "\n#"]

    # find first % symbol index
    start = asp_raw.find("%")
    asp_raw = asp_raw[start:]
    for end in possible_ends:
        if end in asp_raw:
            asp_raw = asp_raw.split(end)[0]
            break
    asp_raw = asp_raw.replace("`", "")

    return asp_raw.strip()

def _parse_asp_start_end(asp_raw: str) -> str:
    START = "###ASP_START###"
    END = "###ASP_END###"

    return asp_raw.split(START)[1].split(END)[0]


def get_answer_letter(answer, options):
    """Return the first character of the first option that contains ``answer``.

    ``answer`` is matched with the ``in`` operator against each option in
    order. ``None`` is returned when no option contains it.
    """
    matching_letters = (candidate[0] for candidate in options if answer in candidate)
    return next(matching_letters, None)

def full_evaluation(result_file):
    """Score the generated ASP programs stored in ``result_file``.

    The file must contain a JSON list of samples. For each sample the first
    entry of ``"raw_logic_programs"`` is parsed with ``_parse_asp``, run through
    ``ClingoSolver.solve`` and the resulting answer is mapped to an option
    letter, which is compared with ``sample["answer"]``. Mismatches are printed
    as ``id predicted expected``.

    The task type is inferred from the path: if it contains ``"validity"`` the
    solver result is interpreted as a boolean and mapped to ``"Yes"``/``"No"``;
    otherwise the solver result is used as the answer directly.

    Args:
        result_file: Path to the JSON file with the model outputs.

    Returns:
        A tuple ``(accuracy, n_failed)`` where ``accuracy`` is the fraction of
        samples answered correctly and ``n_failed`` is the number of programs
        for which the solver raised. An empty sample list yields ``(0.0, 0)``.

    Raises:
        Exception: If ``_parse_asp`` fails for a sample (the sample id is
            printed first and the original error is chained as the cause).
    """
    with open(result_file, "r") as f:
        all_samples = json.load(f)

    is_validity = "validity" in result_file
    answer_is_correct = []
    failed_programs = []

    for sample in all_samples:
        asp_raw = sample["raw_logic_programs"][0].strip()
        try:
            asp = _parse_asp(asp_raw)
        except Exception as exc:
            print(sample["id"])
            # Keep the original failure attached so the traceback stays useful.
            raise Exception("Error parsing ASP") from exc

        try:
            # NOTE(review): the second argument is presumably a "validity mode"
            # switch of ClingoSolver.solve - confirm against the solver.
            solver_result = ClingoSolver.solve(asp, is_validity)
            if is_validity:
                answer = "Yes" if solver_result else "No"
            else:
                answer = solver_result
        except Exception:
            # Solver failures count as wrong answers, but Ctrl-C and
            # SystemExit must still be able to stop the evaluation.
            failed_programs.append(sample["id"])
            answer = None

        correct_answer = sample["answer"]
        options = sample["options"]
        if answer:
            choice_answer = get_answer_letter(answer, options)
            answer_is_correct.append(choice_answer == correct_answer)
            if choice_answer != correct_answer:
                print(sample["id"], choice_answer, correct_answer)
        else:
            # No usable answer (solver failure or empty result) is a miss.
            answer_is_correct.append(False)

    if not answer_is_correct:
        # Avoid ZeroDivisionError on an empty result file.
        return 0.0, 0
    return sum(answer_is_correct) / len(answer_is_correct), len(failed_programs)


# Run the solver-based evaluation for every configured problem set and
# persist the collected metrics as JSON.
out = {"model": MODEL_NAME, "results": []}

for problem in PROBLEMS:
    file = f"outputs/logic_programs/{problem}_data_{MODEL_NAME}.json"

    try:
        accuracy, failed_programs = full_evaluation(file)
    except FileNotFoundError:
        # A missing result file is reported and the problem is skipped.
        print(f"File {file} not found")
    else:
        out["results"].append({
            "problem": problem,
            "accuracy": accuracy,
            "failed_programs": failed_programs
        })

with open(f"outputs/evaluation/evaluation_{MODEL_NAME}.json", "w") as f:
    json.dump(out, f, indent=4)

problem_48 A B
problem_62 A B
problem_81 B A
problem_86 A B
problem_88 A B
problem_99 B A
problem_139 B A
problem_186 A B
problem_211 A B
problem_214 A B
problem_238 A B
problem_257 B A
problem_267 A B
problem_277 A B
problem_285 A B
problem_288 B A
problem_295 B A
problem_319 B A
problem_353 B A
problem_357 B A
problem_362 B A
problem_388 A B
problem_0 C B
problem_4 C A
problem_21 A D
problem_25 A D
problem_27 C B
problem_52 B D
problem_53 None D
problem_65 D C
problem_66 B A
problem_83 C A
problem_84 D B
problem_128 B D
problem_149 C A
problem_151 C B
problem_163 A B
problem_169 C A
problem_181 D C
problem_189 None D
problem_3 B A
problem_8 B A
problem_27 B A
problem_37 B A
problem_39 A B
problem_50 A B
problem_68 A B


<block>:1:1-2: error: lexer error, unexpected #



problem_88 A B
problem_93 B A
problem_156 A B
problem_180 A B
problem_184 A B
problem_191 B A
problem_224 A B
problem_229 A B
problem_238 A B
problem_240 A B
problem_246 A B
problem_274 A B
problem_285 B A
problem_292 A B
problem_311 B A
problem_321 B A
problem_322 B A
problem_327 A B
problem_328 B A
problem_342 A B
problem_345 A B
problem_369 A B
problem_390 B A


<block>:15:83-84: error: syntax error, unexpected ), expecting : or "," or . or ;



problem_9 C D
problem_10 B A
problem_11 None C
problem_31 None C
problem_35 B C
problem_36 None D
problem_47 None D
problem_50 None A
problem_58 C A
problem_59 B D
problem_61 B A
problem_67 D B
problem_71 None A
problem_73 C D
problem_81 None C
problem_86 A D
problem_89 None B
problem_92 None D
problem_100 A B
problem_105 None B
problem_107 None A
problem_108 None A
problem_131 None C
problem_132 C A
problem_133 None C
problem_145 None C
problem_151 None D
problem_169 C B
problem_178 None A
problem_181 B D
problem_182 None B
problem_184 C A
problem_190 None B
problem_192 None D
problem_4 A B
problem_7 B A
problem_9 B A
problem_11 B A
problem_13 B A
problem_16 B A
problem_17 A B
problem_22 A B
problem_23 B A
problem_30 A B
problem_31 B A
problem_32 A B
problem_34 A B
problem_35 A B
problem_38 A B
problem_39 A B
problem_49 B A
problem_50 A B
problem_53 B A
problem_56 B A
problem_58 A B
problem_60 A B
problem_64 B A
problem_69 B A
problem_74 B A
problem_79 A B
problem_81 B A
problem_82 A 

In [24]:
# Problem sets for the direct-answer runs (no solver involved).
# NOTE: this rebinds the global PROBLEMS used by the solver-based evaluation
# loop as well; re-running that loop after this cell would iterate over these
# "_direct" names instead.
PROBLEMS = [
    "graph_validity_direct",
    "graph_fill_in_direct",
    "sudoku_validity_direct",
    "sudoku_fill_in_direct",
    "set_validity_direct",
]

In [25]:
import re


def extract_answer_letter(answer: str) -> str:
    """Return the first capital letter of a free-text model answer.

    The substrings ``"Correct"`` and ``"The"`` are removed first so that their
    initial capitals are not mistaken for an option letter.

    NOTE(review): any other capitalised word preceding the option letter
    (e.g. "Answer: B") still wins; tighten the pattern if such outputs occur.

    Args:
        answer: Raw text produced by the model.

    Returns:
        The first character in ``A``-``Z`` found in the cleaned text, or an
        empty string when the text contains no capital letter.
    """
    cleaned = answer.replace("Correct", "").replace("The", "")
    # re.search returns None when nothing matches; report "no letter" instead
    # of raising AttributeError so one malformed output cannot abort a run.
    match = re.search(r"[A-Z]", cleaned)
    if match is None:
        return ""
    return match.group(0)


def full_evaluation_direct(result_file):
    """Compute the accuracy of direct (solver-free) model answers.

    The file must contain a JSON list of samples. For each sample the first
    entry of ``"raw_logic_programs"`` is treated as the model's free-text
    answer; its option letter is extracted with ``extract_answer_letter`` and
    compared with ``sample["answer"]``.

    Args:
        result_file: Path to the JSON file with the model outputs.

    Returns:
        The fraction of samples answered correctly, or ``0.0`` when the file
        contains no samples.
    """
    with open(result_file, "r") as f:
        all_samples = json.load(f)

    answer_is_correct = []
    for sample in all_samples:
        output = sample["raw_logic_programs"][0].strip()
        choice_answer = extract_answer_letter(output)
        answer_is_correct.append(choice_answer == sample["answer"])

    if not answer_is_correct:
        # Avoid ZeroDivisionError on an empty result file.
        return 0.0
    return sum(answer_is_correct) / len(answer_is_correct)

# Run the direct-answer evaluation for every configured problem set and
# persist the collected accuracies as JSON.
out = {
    "model": MODEL_NAME,
    "results": []
}

for problem in PROBLEMS:
    file = f"outputs/logic_programs/{problem}_data_{MODEL_NAME}.json"
    try:
        accuracy = full_evaluation_direct(file)
    except FileNotFoundError:
        # Same policy as the solver-based evaluation loop: report the missing
        # file and keep going so the remaining problems are still evaluated.
        print(f"File {file} not found")
        continue
    out["results"].append({
        "problem": problem,
        "accuracy": accuracy
    })

with open(f"outputs/evaluation/evaluation_{MODEL_NAME}_direct.json", "w") as f:
    json.dump(out, f, indent=4)