In [12]:
import datasets
from typing import Optional, Dict, Any
from pprint import pprint
import re

In [13]:
def check_logical_equivalence(
    original_assertions: str,
    generated_assertions: str,
    constants: Optional[str] = None
) -> Dict[str, Any]:
    """
    Decide whether two SMT-LIB2 assertion scripts are logically equivalent.

    In-process Z3 two-step equivalence check:
      1) A ⇒ B  (A ∧ ¬B must be unsatisfiable)
      2) B ⇒ A  (B ∧ ¬A must be unsatisfiable)
    where A is the conjunction of the assertions in `original_assertions`
    and B the conjunction of those in `generated_assertions`.

    Args:
        original_assertions: SMT-LIB2 text of the reference assertions.
        generated_assertions: SMT-LIB2 text of the candidate assertions.
        constants: Optional SMT-LIB2 declarations prepended to both sides
            before parsing.

    Returns:
        ``{"result": True}`` when both implications are proven, otherwise
        ``{"result": False, "reason": <str>}``. The reason covers: one side
        being empty, a parse error, a failed implication, or the solver
        answering ``unknown`` (which is never reported as equivalence).
    """
    orig = original_assertions.strip()
    gen = generated_assertions.strip()

    # Trivial cases are decided on the assertion text itself, before the
    # declarations are prepended; otherwise a non-empty `constants` would make
    # both sides look non-empty and these guards could never fire.
    if not orig and not gen:
        return {"result": True}
    if not orig or not gen:
        return {"result": False, "reason": "Empty side"}

    # Imported lazily so the text-only guards above do not require z3.
    from z3 import (
        And, BoolVal, Not, Solver, Z3Exception, parse_smt2_string, unknown, unsat
    )

    if constants:
        decls = constants.strip()
        orig = f"{decls}\n{orig}"
        gen = f"{decls}\n{gen}"

    try:
        A = list(parse_smt2_string(orig))
        B = list(parse_smt2_string(gen))
    except Z3Exception as e:
        return {"result": False, "reason": f"Parse error: {e}"}

    def conjunction(formulas):
        # A script without any (assert ...) constrains nothing, i.e. is `true`;
        # this also avoids calling And() with zero arguments.
        return And(*formulas) if formulas else BoolVal(True)

    def implication_status(premises, conclusion):
        # premises ⇒ conclusion holds iff premises ∧ ¬conclusion is unsat.
        # A fresh solver per direction keeps the two checks independent.
        solver = Solver()
        if premises:
            solver.add(*premises)
        solver.add(Not(conjunction(conclusion)))
        return solver.check()

    directions = (
        (A, B, "A does not imply B", "A implies B"),
        (B, A, "B does not imply A", "B implies A"),
    )
    for premises, conclusion, failure_reason, description in directions:
        status = implication_status(premises, conclusion)
        if status == unsat:
            continue
        if status == unknown:
            # Inconclusive is not a proof of equivalence.
            return {
                "result": False,
                "reason": f"Solver returned unknown while checking whether {description}",
            }
        return {"result": False, "reason": failure_reason}

    return {"result": True}


In [None]:
# dataset = datasets.load_dataset("[REDACTED FOR ANONYMITY]", split="test")
# NOTE: the load call above is commented out (dataset id redacted), so `dataset`
# must already exist in the kernel before this cell runs.
# Convert only while the object still offers `to_pandas`, so re-running this
# cell after the conversion does not fail on the resulting DataFrame.
if hasattr(dataset, "to_pandas"):
    dataset = dataset.to_pandas()
instance = dataset.iloc[621]  # row inspected by the cells below

In [None]:
# Model response text kept verbatim: an explanation followed by a fenced block
# whose content starts with "Answer: " and holds the generated assertion.
# Presumably copied from (verify against the results directory):
# path WARP-evaluation/src/results_warp/warp-1.0/2025-05-05_20-25-50/stats/individual_stats.json
# The literal is not a raw string, so every "\n" in it is a real newline at runtime.
WARP_RESPONSE = """
To generate the constraint for N=22, we can extend the logic applied in the previous examples. We notice a pattern where the input values are permuted and checked against a set of constraints based on the current input and the previous ones. This pattern can be applied to larger inputs as well. Here is the derived constraint for N=22.\n\n```\nAnswer: (assert (and  (not ( = in0 65))  ( =  in0 66)  (not ( = in1 69))  ( =  in1 70)  (not ( = in2 73))  (not ( = in2 74))  (not ( = in3 65))  ( =  in3 66)  (not ( = in4 69))  ( =  in4 70)  (not ( = in5 73))  (not ( = in5 74))  (not ( = in6 65))  ( =  in6 66)  (not ( = in7 69))  ( =  in7 70)  (not ( = in8 73))  (not ( = in8 74))  (not ( = in9 65))  ( =  in9 66)  (not ( = in10 69))  ( =  in10 70)  (not ( = in11 73))  (not ( = in11 74))  (not ( = in12 65))  ( =  in12 66)  (not ( = in13 69))  ( =  in13 70)  (not ( = in14 73))  (not ( = in14 74))  (not ( = in15 65))  ( =  in15 66)  (not ( = in16 69))  ( =  in16 70)  (not ( = in17 73))  (not ( = in17 74))  (not ( = in18 65))  ( =  in18 66)  (not ( = in19 69))  ( =  in19 70)  (not ( = in20 73))  (not ( = in20 74))  (not ( = in21 65))  ( =  in21 66)))\n```
"""

In [16]:
# Pretty-print every field of the selected row (including the reference 'answer').
pprint(instance.to_dict())

{'answer': '(assert (and (and (and (and (and (and (and (and (and (and (and '
           '(and (and (and (and (and (and (and (and (and (and (and (and (and '
           '(and (and (and (and (and (and (and (and (and (and (and (and (and '
           '(and (and (and (and (and (and (not ( = in0 65))  ( =  in0 66)) '
           '(not ( = in1 69)))  ( =  in1 70)) (not ( = in2 73))) (not ( = in2 '
           '74))) (not ( = in3 65)))  ( =  in3 66)) (not ( = in4 69)))  ( =  '
           'in4 70)) (not ( = in5 73))) (not ( = in5 74))) (not ( = in6 65)))  '
           '( =  in6 66)) (not ( = in7 69)))  ( =  in7 70)) (not ( = in8 73))) '
           '(not ( = in8 74))) (not ( = in9 65)))  ( =  in9 66)) (not ( = in10 '
           '69)))  ( =  in10 70)) (not ( = in11 73))) (not ( = in11 74))) (not '
           '( = in12 65)))  ( =  in12 66)) (not ( = in13 69)))  ( =  in13 70)) '
           '(not ( = in14 73))) (not ( = in14 74))) (not ( = in15 65)))  ( =  '
           'in15 66)) (not ( = in16 69)))  (

In [17]:
# Pull the generated assertion out of the fenced "Answer: " block of the
# response: everything after the marker up to the closing fence (or the end of
# the text when no closing fence follows).
answer_match = re.search(r"```\nAnswer: (.*?)(?:\n```|\Z)", WARP_RESPONSE, re.DOTALL)
if answer_match is None:
    # Fail with a descriptive message instead of an opaque IndexError.
    raise ValueError("WARP_RESPONSE has no fenced 'Answer: ' block to extract")
manual_extraction = answer_match.group(1)

# Compare the extracted assertion with the dataset's reference answer, using
# the instance's declarations for both sides; the result dict is the cell output.
check_logical_equivalence(
    original_assertions=instance["answer"],
    generated_assertions=manual_extraction,
    constants=instance["constants"]
)

{'result': True}