In [None]:
# Imports
import os
import sys
import json
import numpy as np
import openai
%load_ext autoreload
%autoreload 2

# Put the parent directory at the front of the import search path so the
# utility modules imported below can be found.
module_path = os.path.abspath(os.pardir)
sys.path[:0] = [module_path]

import answer_scoring as score
import answer_extraction as extract

In [None]:
# We will run the models using the OpenAI framework as an example, but it
# is easy to generalize to any generate() function that returns a string 
# given a string prompt.

# Add a key.env file in the base directory with your API key.
# The key.env file should be a plain text file with the line:
# OPENAI_API_KEY=<your API key>

from dotenv import load_dotenv
load_dotenv('../key.env')
import openai


# Build the API client from the key held in the OPENAI_API_KEY environment
# variable, and pick the chat model that generate() passes to the API.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
model = "gpt-4o"

def generate(prompt: str) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Relies on the module-level ``client`` and ``model``.

    Args:
        prompt: Text sent as the content of one ``user`` message.

    Returns:
        The ``content`` of the first returned choice. An empty string is
        returned when that content is ``None`` so that the declared ``str``
        return type always holds for downstream callers.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    content = response.choices[0].message.content
    # The SDK types `content` as optional (it can be None, e.g. when the
    # model returns no text); normalise it so callers always get a string.
    return content if content is not None else ""

# Load the simple reasoning benchmark

In [None]:
# Read the simple reasoning benchmark. The file is decoded as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
with open("../datasets/simple_reasoning.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)
print(f"loaded {len(json_data)} entries.")

# See what a single entry looks like:
print(json_data[0])


In [None]:
# Gather responses for all questions.
# One generate() call per entry, in dataset order; each reply is appended
# as soon as it is produced.
responses = []
responses.extend(generate(entry["input"]) for entry in json_data)

In [None]:
# Score and evaluate the responses
# `zip` stops at the shorter input, so a length mismatch (for example when
# `json_data` has been rebound to another dataset since `responses` was
# gathered) would silently score the wrong pairs. Fail loudly instead.
if len(json_data) != len(responses):
    raise ValueError(
        f"json_data has {len(json_data)} entries but responses has "
        f"{len(responses)}; reload the benchmark and gather responses again."
    )

is_correct = []
for entry, response in zip(json_data, responses):
    # Pull the model's answer out of the raw reply, then grade it against
    # the entry.
    answer = extract.extract_eval(entry, response)
    correct = score.score_eval(entry, answer)
    is_correct.append(correct)
print(f"Proportion correct: {np.sum(is_correct) / len(is_correct)}.")

# Load the Unpuzzles

In [None]:
# Load the unpuzzles dataset. This rebinds `json_data`, replacing any dataset
# previously bound to that name. The file is decoded as UTF-8 explicitly so
# the result does not depend on the platform's default encoding.
with open('../datasets/unpuzzles.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
print(f"loaded {len(json_data)} entries.")

In [None]:
# Run the puzzles and unpuzzles

puzzle_responses, unpuzzle_responses = [], []
for entry in json_data:
    # For each entry, query the puzzle text first and then its unpuzzle text.
    for field, collected in (
        ("puzzle", puzzle_responses),
        ("unpuzzle", unpuzzle_responses),
    ):
        collected.append(generate(entry[field]))

# Load the shifted unpuzzles

In [None]:
# Read the shifted unpuzzles dataset. This rebinds `json_data`. The file is
# decoded as UTF-8 explicitly so the result does not depend on the platform's
# default encoding.
with open('../datasets/shifted_unpuzzles.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
print(f"loaded {len(json_data)} entries.")

In [None]:
# Collect model replies for the three prompt fields read from each entry.
puzzle_responses, unpuzzle_responses, shifted_unpuzzle_responses = [], [], []
prompt_fields = (
    ("original_puzzle", puzzle_responses),
    ("unpuzzle", unpuzzle_responses),
    ("shifted_unpuzzle", shifted_unpuzzle_responses),
)
for entry in json_data:
    # Same per-entry order every time: original puzzle, unpuzzle, shifted.
    for field, collected in prompt_fields:
        collected.append(generate(entry[field]))

# Grade each of the three replies per entry against its reference answer.
puzzles_correct, unpuzzles_correct, shifted_unpuzzles_correct = [], [], []

# (reference-answer key, list receiving the grade), in the same order as the
# response lists passed to zip() below.
grading_plan = (
    ("original_answer", puzzles_correct),
    ("unpuzzle_answer", unpuzzles_correct),
    ("shifted_unpuzzle_answer", shifted_unpuzzles_correct),
)

for entry, *replies in zip(
    json_data, puzzle_responses, unpuzzle_responses, shifted_unpuzzle_responses
):
    for (answer_key, grades), reply in zip(grading_plan, replies):
        expected = entry[answer_key]
        extracted = extract.unpuzzle(reply)
        grades.append(score.unpuzzle(expected, extracted))