In [None]:
import openai
from datasets import load_dataset
from dotenv import load_dotenv
import json
import os
import datetime
load_dotenv()
import pandas as pd

# Shared OpenAI client; the API key is read from the OPENAI_KEY environment
# variable (load_dotenv() above has already been called at this point).
client = openai.OpenAI(api_key=os.environ.get("OPENAI_KEY"))


In [None]:
# Load the "test" split of the benchmark and convert it straight to a pandas
# DataFrame, which is the form every later cell works with.
dataset = load_dataset("dannkoh/invaR1ant-benchmark", split="test").to_pandas()

In [None]:
# Preview the benchmark DataFrame (displayed as the cell's last expression).
dataset

In [None]:
def format_question(str):
    """
    Build the prompt for one benchmark question.

    The fixed output-format instructions come first, followed by a newline and
    then the question text passed in ``str``.

    NOTE: the parameter name shadows the built-in ``str``; it is kept as-is so
    existing callers are unaffected.
    """
    instructions = (
        "All per-variable constraints must be combined using a top-level (assert (and ...)) clause.\n"
        "The output must be in exact, canonical SMT-LIB format without extra commentary in the constraint string.\n"
        "Show your work in <think> </think> tags. And return the final SMT-LIB constraint string in <answer> </answer> tags.\n"
        "For example: <answer>(assert (and  ( >=  in0 97)  ( <=  in0 122)))</answer>."
    )
    return "\n".join((instructions, str))


In [None]:
# Build one Batch API input file per model and upload it to OpenAI.
models = [
    "gpt-4.1-2025-04-14",
    "gpt-4.1-mini-2025-04-14",
    "gpt-4o-2024-11-20",
    "o3-mini-2025-01-31",
    "o4-mini-2025-04-16",
]
endpoint = "/v1/chat/completions"

# open(..., "w") below fails if the target directory is missing, so create it first.
os.makedirs("results/batch", exist_ok=True)

for model in models:
    # One request per dataset row; custom_id carries the row's index so the
    # responses can later be matched back to the dataset.
    batch = []
    for i, row in dataset.iterrows():
        batch.append(
            {
                "custom_id": str(i),
                "method": "POST",
                # Reuse `endpoint` so the per-request URL cannot drift from the
                # endpoint used when the batch job is created.
                "url": endpoint,
                "body": {"model": model, "messages": [{"role": "user", "content": format_question(row["question"])}]},
            }
        )

    # Save the batch as a JSON Lines file (one request object per line).
    batch_path = f"results/batch/{model}-batch.jsonl"
    with open(batch_path, "w") as f:
        for item in batch:
            json.dump(item, f)
            f.write("\n")

    # Upload the batch to OpenAI; the context manager closes the handle once
    # the upload call returns instead of leaking it.
    with open(batch_path, "rb") as f:
        client.files.create(
            file=f,
            purpose="batch"
        )


In [None]:
# Create one batch job per uploaded batch input file.
#
# client.files.list() is not restricted to the files uploaded by this run, so
# submitting every "batch" file would also re-submit input files left over from
# earlier uploads. Keep only the most recently created file for each filename.
latest_by_name = {}
for file in client.files.list():
    if file.purpose != "batch":
        continue
    known = latest_by_name.get(file.filename)
    if known is None or file.created_at > known.created_at:
        latest_by_name[file.filename] = file

for file in latest_by_name.values():
    client.batches.create(
        completion_window="24h",
        endpoint=endpoint,
        input_file_id=file.id
    )

In [None]:
# Print every file whose purpose is "batch"; files with any other purpose are skipped.
for file in client.files.list():
    if file.purpose != "batch":
        continue
    print(file)

In [None]:
# Print each batch created at or after the cutoff, prefixed with its creation
# time in local time. 1746474126 is 2025-05-05 19:42:06 UTC.
for batch in client.batches.list():
    if batch.created_at < 1746474126:
        continue
    print(datetime.datetime.fromtimestamp(batch.created_at), batch)

#
# 4o-mini trial 2 Batch(id='batch_67fe3cd614788190b108a5f4d0398928', completion_window='24h', created_at=1744714966, endpoint='/v1/chat/completions', input_file_id='file-EUgdDeKHeWtkUjsUwJxq6Y', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1744801366, failed_at=None, finalizing_at=None, in_progress_at=1744714967, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=671))
# print(f"{'#'* 20}")
# for batch in client.batches.list():
#         print(batch)

In [None]:
import subprocess
def load_z3_template(constants: str, original_assertions: str, generated_assertions: str) -> str:
    """
    Build the SMT-LIB script used to compare two sets of constraints.

    The script contains, in order:
      * the constant declarations,
      * the original constraints (A) inside their own push/pop scope,
      * the generated constraints (B) inside their own push/pop scope,
      * check 1: A together with the negated conjunction of B, then (check-sat),
      * check 2: B together with the negated conjunction of A, then (check-sat).

    Only the last two scopes issue (check-sat).
    """
    # Negated conjunctions of each side, built from the raw assertion text.
    negated_generated = f"(assert (not\n{_parse_constraints(generated_assertions)}\n))"
    negated_original = f"(assert (not\n{_parse_constraints(original_assertions)}\n))"

    preamble = (
        "; Combined SMT for checking equivalence\n"
        "; Original constants:\n"
        f"{constants}\n"
        "\n"
    )
    scope_original = (
        "; Original constraints (A):\n"
        "(push)\n"
        f"{original_assertions}\n"
        "(pop)\n"
        "\n"
    )
    scope_generated = (
        "; Generated constraints (B):\n"
        "(push)\n"
        f"{generated_assertions}\n"
        "(pop)\n"
        "\n"
    )
    check_a_implies_b = (
        "; Now do two checks:\n"
        "; 1) A => B fails means we push A and then (not B)\n"
        "(push)\n"
        f"{original_assertions}\n"
        f"{negated_generated}\n"
        "(check-sat)\n"
        "(pop)\n"
        "\n"
    )
    check_b_implies_a = (
        "; 2) B => A fails means we push B and then (not A)\n"
        "(push)\n"
        f"{generated_assertions}\n"
        f"{negated_original}\n"
        "(check-sat)\n"
        "(pop)\n"
    )
    return preamble + scope_original + scope_generated + check_a_implies_b + check_b_implies_a

def _parse_constraints(constraints: str) -> str:
    """
    Parse raw SMT-LIB2 constraints into a single conjunctive form.
    """
    assertions = [line.strip()[8:-1] for line in constraints.splitlines() if line.startswith("(assert")]
    return f"(and {' '.join(assertions)})"

def extract_response(response: str) -> "str | None":
    """
    Extract the final answer from a model response.

    Returns the text between the first ``<answer>`` tag and the next
    ``</answer>`` tag (or the end of the response if the closing tag is
    missing), stripped of surrounding whitespace and flattened to one line.

    Newlines inside the answer are replaced by a single space rather than
    deleted: deleting them would fuse tokens that were separated only by a
    line break (``in0\\n97`` would become ``in097``).

    Returns ``None`` when ``response`` is not a string (e.g. ``None`` or NaN)
    or contains no ``<answer>`` tag.
    """
    if not isinstance(response, str):
        return None

    _, open_tag, after_open = response.partition("<answer>")
    if not open_tag:
        return None

    answer = after_open.split("</answer>")[0].strip()
    return answer.replace("\n", " ")

def evaluate_with_z3(response: str, truth: str, constants: str, timeout: "float | None" = None) -> dict:
    """
    Evaluate the response using Z3.

    Builds the equivalence script with ``load_z3_template``, pipes it to
    ``z3 -in`` and reads the ``sat`` / ``unsat`` / ``unknown`` lines from stdout.
    The first two such lines are taken as the outcomes of the two checks in the
    script; the response is accepted only if both are ``unsat``.

    Parameters:
    - response: generated SMT-LIB assertions.
    - truth: reference SMT-LIB assertions.
    - constants: SMT-LIB declarations placed at the top of the script.
    - timeout: optional limit in seconds for the Z3 process. ``None`` (the
      default) waits indefinitely, as before.

    Returns:
    - A dict with ``"result"`` (bool) and ``"reason"`` (str). Failures to run
      Z3, including a timeout, are reported as ``result=False`` rather than
      raised.
    """
    result = {}
    smt2 = load_z3_template(constants=constants, original_assertions=truth, generated_assertions=response)

    try:
        proc = subprocess.run(
            ["z3", "-in"],
            input=smt2,
            capture_output=True,
            text=True,
            check=False,
            timeout=timeout,
        )
        output = proc.stdout.strip()

        # Keep only solver verdict lines; anything else Z3 prints is ignored.
        results = [line for line in output.splitlines() if line in ("sat", "unsat", "unknown")]

        if len(results) < 2:
            result["result"] = False
            result["reason"] = "Could not parse results correctly."
            return result

        if results[0] != "unsat":
            result["result"] = False
            result["reason"] = "Original does not imply generated."
            return result

        if results[1] != "unsat":
            result["result"] = False
            result["reason"] = "Generated does not imply original."
            return result

        result["result"] = True
        result["reason"] = "Constraints are logically equivalent."
    except subprocess.TimeoutExpired:
        # Only reachable when the caller supplied a timeout.
        result["result"] = False
        result["reason"] = f"Z3 timed out after {timeout} seconds."
    except Exception as e:
        result["result"] = False
        result["reason"] = f"Error running Z3: {e}"

    return result


In [None]:

# Download, score and summarise every finished batch created at or after the cutoff.
for batch in client.batches.list():
    # 1746474126 is 2025-05-05 19:42:06 UTC.
    # NOTE(review): presumably the start of the current run - confirm.
    if batch.created_at < 1746474126 or batch.output_file_id is None:
        continue

    # Look up the input file name and format the timestamp once; both are
    # reused for the download path, the model name and the summary file name.
    input_filename = client.files.retrieve(batch.input_file_id).filename
    created_stamp = datetime.datetime.fromtimestamp(batch.created_at).strftime('%Y-%m-%d-%H:%M:%S')
    output_path = f"{input_filename}.{created_stamp}"

    client.files.content(batch.output_file_id).stream_to_file(output_path)
    # Input files are named "<model>-batch.jsonl", so drop the last "-" segment.
    model = input_filename.rsplit("-", maxsplit=1)[0]

    # Load the batch response into a DataFrame (one JSON object per line).
    with open(output_path) as f:
        data = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(data)

    # Extract content from each response
    df["content"] = df["response"].apply(
        lambda r: (
            r["body"]["choices"][0]["message"]["content"]
            if (r and "body" in r and "choices" in r["body"] and len(r["body"]["choices"]) > 0)
            else None
        )
    )
    # .copy() so the assignment below targets a real frame, not a slice view.
    df = df[["custom_id", "content"]].copy()
    df["custom_id"] = df["custom_id"].astype(int)

    # Merge with original dataset
    merged_df = dataset.merge(df, left_index=True, right_on="custom_id", how="left")

    # Evaluate each response
    results = []
    for i, row in merged_df.iterrows():
        content = row["content"]
        # Rows without a usable answer carry None or NaN here; treat both as
        # "no response" so NaN is neither parsed nor written to the JSON file.
        response = content if isinstance(content, str) else None
        truth = row["answer"]
        constants = row["constants"]
        question = row["question"]

        extracted = extract_response(response) if response else None
        z3_result = (
            evaluate_with_z3(response=extracted, truth=truth, constants=constants)
            if extracted else {"result": False, "reason": "Failed to extract response."}
        )

        results.append({
            "custom_id": row["custom_id"],
            "question": question,
            "response": response,
            "extracted": extracted,
            "truth": truth,
            "constants": constants,
            "z3_result": z3_result["result"],
            "reason": z3_result["reason"]
        })

    # Compute final metrics and save
    correct = sum(1 for r in results if r["z3_result"])
    result_summary = {
        "model": model,
        "accuracy": correct / len(results) if len(results) else 0,
        "correct": correct,
        "incorrect": len(results) - correct,
        "results": results,
    }
    with open(f"{model}-{created_stamp}.json", "w") as f:
        json.dump(result_summary, f, indent=4)

In [None]:
import os
import json
import re

def _read_model_accuracy(file_path):
    """Return ``(model, accuracy)`` read from one result file, or ``None`` if the file must be skipped."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(f"Warning: Could not parse JSON from {file_path}")
        return None

    # A valid JSON file may still hold a list or scalar, which has no .get().
    if not isinstance(data, dict):
        print(f"Warning: Unexpected JSON structure in {file_path}")
        return None

    model_name = data.get("model")
    accuracy = data.get("accuracy")

    # Only use the file if both model name and accuracy are present.
    if model_name is None or accuracy is None:
        return None

    # Averages are computed with sum(), so the accuracy must be a real number.
    if isinstance(accuracy, bool) or not isinstance(accuracy, (int, float)):
        print(f"Warning: Non-numeric accuracy in {file_path}")
        return None

    return model_name, accuracy


def gather_and_save_accuracy_data(root_path, output_file):
    """
    Scans subfolders whose names start with 'trialN' (N being an integer) under
    'root_path', reads all .json files in each trial folder, and collects model
    accuracies. For each model, computes the average accuracy per trial and an
    overall average over every collected accuracy value. The final structure has
    one entry per model with keys for each trial and an 'average' entry
    summarizing overall performance. The resulting dictionary is saved as a JSON
    file.

    Files that cannot be parsed, whose top-level JSON value is not an object,
    that lack 'model' or 'accuracy', or whose accuracy is not a number are
    skipped (a warning is printed for all but the missing-key case).
    Directory entries are visited in sorted order so the result does not depend
    on filesystem listing order.

    Example output:
    {
        "ModelA": {
            "trials": {
                "trial_1": 0.85,
                "trial_2": 0.88,
                ...
            },
            "average": 0.86
        },
        "ModelB": {
            "trials": {
                "trial_1": 0.80,
                "trial_2": 0.82,
                ...
            },
            "average": 0.81
        }
    }

    Parameters:
    - root_path: str, path to the directory containing trial subfolders.
    - output_file: str, path to the output JSON file.

    Returns:
    - A dictionary with model names as keys and their corresponding trial and overall averages.
    """
    model_results = {}
    trial_pattern = re.compile(r"trial(\d+)")

    # Loop over items in the root directory
    for item in sorted(os.listdir(root_path)):
        match = trial_pattern.match(item)
        if not match:
            continue

        trial_number = int(match.group(1))
        trial_folder_path = os.path.join(root_path, item)
        if not os.path.isdir(trial_folder_path):
            continue

        # Scan for JSON files in each trial folder
        for filename in sorted(os.listdir(trial_folder_path)):
            if not filename.endswith(".json"):
                continue

            record = _read_model_accuracy(os.path.join(trial_folder_path, filename))
            if record is None:
                continue

            model_name, accuracy = record
            model_results.setdefault(model_name, {}).setdefault(trial_number, []).append(accuracy)

    # Compute average accuracies per trial and overall for each model
    output_dict = {}
    for model, trials in model_results.items():
        trial_averages = {}
        overall_total = 0.0
        overall_count = 0

        for trial_no in sorted(trials.keys()):
            accuracies = trials[trial_no]
            trial_averages[f"trial_{trial_no}"] = sum(accuracies) / len(accuracies)
            overall_total += sum(accuracies)
            overall_count += len(accuracies)

        overall_average = overall_total / overall_count if overall_count > 0 else None
        output_dict[model] = {"trials": trial_averages, "average": overall_average}

    # Save the resulting dictionary to a JSON file
    with open(output_file, 'w', encoding='utf-8') as out_f:
        json.dump(output_dict, out_f, indent=4)

    return output_dict


In [None]:
# Aggregate the per-trial result files under ./results into accuracy_summary.json;
# the returned summary dictionary is the cell's displayed value.
gather_and_save_accuracy_data("./results", "accuracy_summary.json")