In [1]:
import pandas
import glob
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the benchmark's "test" split and convert it to a pandas DataFrame.
dataframe = (
    datasets
    .load_dataset("dannkoh/warp-benchmark", split="test")
    .to_pandas()
)

In [3]:
# Preview the benchmark frame loaded in the previous cell.
dataframe

Unnamed: 0,question,answer,constants,tier
0,Given the following examples of constraints fo...,(assert (and (not ( = cell_0_0 0)) (not ( = ce...,(declare-const cell_0_0 Int),small
1,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in0 Int)\n(declare-const in2 In...,small
2,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in6 Int)\n(declare-const in5 In...,small
3,Given the following examples of constraints fo...,(assert (and (and ( >= n 1) ( <= n 5)) ( ...,(declare-const n Int),small
4,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in0 Int)\n(declare-const in2 In...,small
...,...,...,...,...
666,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const c Int)\n(declare-const w0 Int)\...,large
667,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in0 Int)\n(declare-const in2 In...,large
668,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in20 Int)\n(declare-const in22 ...,large
669,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in20 Int)\n(declare-const in22 ...,large


In [4]:
# Map each problem directory under ../spf-wca/custom/ to the set of distinct
# "(assert ..." lines found in any .smt2 file beneath it (searched recursively).
problems = {}
for problem_dir in glob.glob("../spf-wca/custom/*"):
    collected = set()
    problems[problem_dir] = collected
    for smt_path in glob.glob(f"{problem_dir}/**/*.smt2", recursive=True):
        with open(smt_path, "r") as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                # Blank lines strip to "" and therefore never match the prefix.
                if stripped.startswith("(assert"):
                    collected.add(stripped)

In [5]:
# Inspect the assertions collected per problem directory.
# NOTE(review): this displays every assertion string, so the output can be very large.
problems

{'../spf-wca/custom/BubbleSort': {'(assert  ( >  in0 in1))',
  '(assert (and (and  ( >  in0 in1)  ( >  in0 in2))  ( >  in1 in2)))',
  '(assert (and (and (and (and (and  ( >  in0 in1)  ( >  in0 in2))  ( >  in0 in3))  ( >  in1 in2))  ( >  in1 in3))  ( >  in2 in3)))',
  '(assert (and (and (and (and (and (and (and (and (and  ( >  in0 in1)  ( >  in0 in2))  ( >  in0 in3))  ( >  in0 in4))  ( >  in1 in2))  ( >  in1 in3))  ( >  in1 in4))  ( >  in2 in3))  ( >  in2 in4))  ( >  in3 in4)))',
  '(assert (and (and (and (and (and (and (and (and (and (and (and (and (and (and  ( >  in0 in1)  ( >  in0 in2))  ( >  in0 in3))  ( >  in0 in4))  ( >  in0 in5))  ( >  in1 in2))  ( >  in1 in3))  ( >  in1 in4))  ( >  in1 in5))  ( >  in2 in3))  ( >  in2 in4))  ( >  in2 in5))  ( >  in3 in4))  ( >  in3 in5))  ( >  in4 in5)))',
  '(assert (and (and (and (and (and (and (and (and (and (and (and (and (and (and (and (and (and (and (and (and  ( >  in0 in1)  ( >  in0 in2))  ( >  in0 in3))  ( >  in0 in4))  ( >  in0 in5))  ( 

In [6]:
# Label every benchmark row with the problem whose .smt2 files contain its answer.
#
# Invert the problem -> assertions mapping once so each row is a single dict
# lookup instead of a scan over every problem's assertion set.
assertion_to_problems = {}
for problem_path, assertions in problems.items():
    problem_name = problem_path.removeprefix("../spf-wca/custom/")
    for assertion in assertions:
        assertion_to_problems.setdefault(assertion, []).append(problem_name)

dataframe["problem"] = None
for index, answer in dataframe["answer"].items():
    matches = assertion_to_problems.get(answer, [])
    if len(matches) != 1:
        # The frame has no "id" column, so report the row by its index label;
        # also distinguish "nothing matched" from "several problems matched".
        kind = "Ambiguous" if matches else "No"
        raise ValueError(f"{kind} mapping for row {index}: {matches}")
    dataframe.at[index, "problem"] = matches[0]

In [7]:
# Preview the frame again, now including the "problem" column assigned in the previous cell.
dataframe

Unnamed: 0,question,answer,constants,tier,problem
0,Given the following examples of constraints fo...,(assert (and (not ( = cell_0_0 0)) (not ( = ce...,(declare-const cell_0_0 Int),small,MazeSolver
1,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in0 Int)\n(declare-const in2 In...,small,BubbleSort
2,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in6 Int)\n(declare-const in5 In...,small,BinaryTreeSearch
3,Given the following examples of constraints fo...,(assert (and (and ( >= n 1) ( <= n 5)) ( ...,(declare-const n Int),small,TowerOfHanoi
4,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in0 Int)\n(declare-const in2 In...,small,Collatz
...,...,...,...,...,...
666,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const c Int)\n(declare-const w0 Int)\...,large,KnapsackSolver
667,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in0 Int)\n(declare-const in2 In...,large,DizzyRamp
668,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in20 Int)\n(declare-const in22 ...,large,ComplexStateMachineParser
669,Given the following examples of constraints fo...,(assert (and (and (and (and (and (and (and (an...,(declare-const in20 Int)\n(declare-const in22 ...,large,BinarySearchTreeHeight


In [8]:
def analyze_models_by_problem(
    output_dir="../output",
    local_model_paths=["."],
    openai_model_paths=["../results"],
    problem_column="problem",
    problem_dataframe=None,  # Pass the dataframe with problem mappings
    verbose=False
):
    """
    Analyze model performance by problem.

    Scans result files, classifies every individual result into a category
    ("correct", "error_output_formatting", "error_syntax", "error_semantics"
    or "unknown") and counts categories per model and per problem.

    Files written:
    - <output_dir>/local_model_problem_stats.json
    - <output_dir>/openai_model_problem_stats.json
    - <output_dir>/problems/<problem>.json (one report per problem seen,
      except the placeholder problem "unknown")

    Args:
        output_dir: Directory the reports are written to (created if missing).
        local_model_paths: Directories searched recursively for
            "individual_stats.json" files. Only iterated, never mutated.
        openai_model_paths: Directories whose "trial*" sub-directories are
            searched for "*.json" result files (names containing "summary"
            are ignored). Only iterated, never mutated.
        problem_column: Column of ``problem_dataframe`` holding the problem
            name; also tried as a key inside individual result records.
        problem_dataframe: DataFrame whose index labels (as strings) map to
            problem names in ``problem_column``. Without it, problem IDs are
            reported unchanged.
        verbose: When True, print "[LOG] ..." progress lines.

    Returns:
        ``{"local_models": ..., "openai_models": ...}`` where each value maps
        model -> problem -> category -> count. Every problem entry also has
        a "total_attempts" count. The inner mappings are defaultdicts, so a
        category that never occurred reads as 0.

    Warns:
        RuntimeWarning: when a result file cannot be read or processed and
        ``verbose`` is False (with ``verbose`` the failure is logged instead).
        The file is skipped either way; counts already taken from it remain.
    """
    import json
    import os
    import warnings
    from pathlib import Path
    from collections import defaultdict
    import pandas as pd  # noqa: F401  (kept: callers may rely on pandas being importable here)

    # Constants (same as aggregate.py)
    PARSE_ERROR = "Parse error:"
    FORMAT_ERROR = "Failed to extract <answer> from response."
    FORMAT_ERROR_2 = "Empty side"
    REASON_CORRECT = "Constraints are logically equivalent."
    REASON_SEMANTICS_A = "Original does not imply generated."
    REASON_SEMANTICS_B = "Generated does not imply original."
    REASON_SYNTAX = "Could not parse results correctly."
    REASON_FORMAT = "Failed to extract response."

    def log(msg):
        if verbose:
            print(f"[LOG] {msg}")

    def report_failure(file_path, exc):
        """Make a skipped file visible: logged when verbose, warned otherwise."""
        message = f"Error processing {file_path}: {exc}"
        if verbose:
            log(message)
        else:
            # stacklevel=3 attributes the warning to the caller of
            # analyze_models_by_problem rather than to this helper.
            warnings.warn(message, RuntimeWarning, stacklevel=3)

    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True, parents=True)

    # Build ID to problem mapping if dataframe provided
    id_to_problem = {}
    if problem_dataframe is not None and problem_column in problem_dataframe.columns:
        # Index label (as string) -> problem name, skipping missing names.
        named_rows = problem_dataframe[problem_column].dropna()
        id_to_problem = {str(idx): name for idx, name in named_rows.items()}

        log(f"Created mapping for {len(id_to_problem)} problem IDs to problem names")

    # --- Helper functions ---
    def find_local_stats_files():
        all_files = []
        for path in local_model_paths:
            if os.path.exists(path):
                # Sorted so the key order of the written reports is reproducible.
                found = sorted(Path(path).rglob("individual_stats.json"))
                all_files.extend(found)
                log(f"Found {len(found)} local stats files in {path}")
        return all_files

    def find_openai_result_files():
        all_files = []
        for path in openai_model_paths:
            if os.path.exists(path):
                for trial_dir in sorted(Path(path).glob("trial*")):
                    if trial_dir.is_dir():
                        json_files = sorted(
                            f for f in trial_dir.glob("*.json") if "summary" not in f.name
                        )
                        all_files.extend(json_files)
                        log(f"Found {len(json_files)} OpenAI result files in {trial_dir}")
        return all_files

    def extract_model_name(path, is_openai=False):
        if is_openai:
            filename = path.stem
            if "-20" in filename:
                # Drop everything from the first "-20" on (e.g. a "-2024-..." date suffix).
                return filename.split("-20")[0].rstrip("-")
            if "gpt" in filename.lower() or "claude" in filename.lower():
                parts = filename.split("-")
                if len(parts) > 1 and parts[1].isdigit() and len(parts[1]) == 4:
                    return parts[0]
            return filename

        parts = path.parts
        # Preferred layout: ".../results_<org>/<model>/...".
        for i, part in enumerate(parts):
            if part.startswith("results_") and i + 1 < len(parts):
                return parts[i + 1]

        # Otherwise take the first path component that looks like a model name.
        indicators = ["results_", "model_", "claude", "gpt", "llama", "gemini"]
        for part in parts:
            lowered = part.lower()
            if any(indicator in lowered for indicator in indicators):
                return part

        # Path.name never raises; it is "" when there is no grandparent
        # directory, which is the case the "unknown_model" fallback is for.
        return path.parent.parent.name or "unknown_model"

    def categorize_result(result):
        """Safely categorize a result dictionary"""
        if not isinstance(result, dict):
            return "unknown"

        # "z3_result" is used by OpenAI files, "result" by local model files.
        if result.get("z3_result") is True or result.get("result") is True:
            return "correct"

        # If neither result field is True, categorize the error based on the reason
        reason = result.get("reason", "")
        if not isinstance(reason, str):
            return "unknown"

        if (
            reason.startswith((FORMAT_ERROR, FORMAT_ERROR_2))
            or reason == REASON_FORMAT
            or "Failed to extract" in reason
        ):
            return "error_output_formatting"
        if reason.startswith(PARSE_ERROR) or reason == REASON_SYNTAX or "Could not parse" in reason:
            return "error_syntax"
        if "does not imply" in reason or reason in (REASON_SEMANTICS_A, REASON_SEMANTICS_B):
            return "error_semantics"
        if reason == REASON_CORRECT:
            # Handle case where reason indicates correctness but result fields weren't set
            return "correct"
        return "unknown"

    def get_problem_name(problem_id):
        """Get problem name from ID using the mapping"""
        if problem_id is None:
            return "unknown"

        problem_id_str = str(problem_id)

        # If we have a mapping, use it
        if problem_id_str in id_to_problem:
            return id_to_problem[problem_id_str]

        # Retry with the canonical integer spelling (e.g. "007" or " 7" -> "7").
        try:
            canonical = str(int(problem_id_str))
        except (ValueError, TypeError):
            canonical = None
        if canonical is not None and canonical in id_to_problem:
            return id_to_problem[canonical]

        # No mapping found, return the ID
        return problem_id_str

    def pick_problem_id(record, keys, default=None):
        """Return the value of the first of `keys` present in `record`, else `default`."""
        for key in keys:
            if key in record:
                return record[key]
        return default

    def tally(counts, model_name, problem_id, result):
        """Count one result under counts[model][problem][category] and total_attempts."""
        problem_counts = counts[model_name][get_problem_name(problem_id)]
        problem_counts[categorize_result(result)] += 1
        problem_counts["total_attempts"] += 1

    def new_counts():
        # model -> problem -> category -> count
        return defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    # Initialize data structures for problem counts
    local_models = new_counts()
    openai_models = new_counts()

    local_id_keys = (problem_column, "id", "index")
    openai_id_keys = ("custom_id", problem_column, "index")

    # --- Process local model results ---
    local_files = find_local_stats_files()
    log(f"Processing {len(local_files)} local model result files")

    for file_path in local_files:
        try:
            with open(file_path, encoding="utf-8") as f:
                data = json.load(f)

            model_name = extract_model_name(file_path)
            log(f"Processing {file_path} for model {model_name}")

            # Handle the specific structure with "small", "medium", "large" categories
            if isinstance(data, dict) and any(k in data for k in ["small", "medium", "large"]):
                for difficulty, items in data.items():
                    if not isinstance(items, list):
                        continue
                    log(f"Processing {len(items)} items in {difficulty} category")
                    for item in items:
                        if not isinstance(item, dict):
                            continue
                        # Use the index field as problem_id; records without one are skipped.
                        problem_id = item.get("index")
                        if problem_id is None:
                            continue
                        tally(local_models, model_name, problem_id, item)
            # Handle regular list format
            elif isinstance(data, list):
                log(f"List format detected in {file_path}, length: {len(data)}")
                for item in data:
                    if not isinstance(item, dict):
                        log(f"Skipping non-dictionary item in list: {type(item)}")
                        continue
                    # Records without any identifier are counted under "unknown".
                    tally(local_models, model_name, pick_problem_id(item, local_id_keys), item)
            # Handle regular dictionary format
            elif isinstance(data, dict):
                log(f"Dictionary format detected in {file_path}, keys: {len(data)}")
                for key, problem_data in data.items():
                    if not isinstance(problem_data, dict):
                        log(f"Skipping non-dictionary value for key {key}: {type(problem_data)}")
                        continue
                    # An identifier inside the record wins over the dictionary key.
                    problem_id = pick_problem_id(problem_data, local_id_keys, default=key)
                    tally(local_models, model_name, problem_id, problem_data)
            else:
                log(f"Unrecognized data format in {file_path}: {type(data)}")

        except Exception as e:  # best effort: one bad file must not abort the run
            report_failure(file_path, e)

    # --- Process OpenAI model results ---
    openai_files = find_openai_result_files()
    log(f"Processing {len(openai_files)} OpenAI model result files")

    for file_path in openai_files:
        try:
            with open(file_path, encoding="utf-8") as f:
                data = json.load(f)

            if not isinstance(data, dict) or "results" not in data:
                log(f"No results in {file_path}")
                continue

            model_name = extract_model_name(file_path, is_openai=True)

            for result in data["results"]:
                if not isinstance(result, dict):
                    log(f"Skipping non-dictionary result in OpenAI file: {type(result)}")
                    continue
                tally(openai_models, model_name, pick_problem_id(result, openai_id_keys), result)

        except Exception as e:  # best effort: one bad file must not abort the run
            report_failure(file_path, e)

    # Write output files
    with open(output_path / "local_model_problem_stats.json", "w", encoding="utf-8") as f:
        json.dump(local_models, f, indent=2)

    with open(output_path / "openai_model_problem_stats.json", "w", encoding="utf-8") as f:
        json.dump(openai_models, f, indent=2)

    # Create per-problem reports
    problem_dir = output_path / "problems"
    problem_dir.mkdir(exist_ok=True)

    # Collect all problems seen
    all_problems = set()
    for model_data in local_models.values():
        all_problems.update(model_data.keys())
    for model_data in openai_models.values():
        all_problems.update(model_data.keys())

    for problem in all_problems:
        if problem == "unknown":
            continue

        problem_report = {
            "problem": problem,
            "local_models": {
                model: per_problem[problem]
                for model, per_problem in local_models.items()
                if problem in per_problem
            },
            "openai_models": {
                model: per_problem[problem]
                for model, per_problem in openai_models.items()
                if problem in per_problem
            },
        }

        # Write problem report ("/" and ":" are replaced to keep the file name usable)
        safe_name = problem.replace("/", "_").replace(":", "-")
        with open(problem_dir / f"{safe_name}.json", "w", encoding="utf-8") as f:
            json.dump(problem_report, f, indent=2)

    print("Analysis complete. Reports written to:")
    print(f"- {output_dir}/local_model_problem_stats.json")
    print(f"- {output_dir}/openai_model_problem_stats.json")
    print(f"- {output_dir}/problems/ (per-problem reports)")

    return {
        "local_models": dict(local_models),
        "openai_models": dict(openai_models)
    }

In [9]:
# Run the per-problem analysis with default input/output paths and progress logging enabled.
results = analyze_models_by_problem(
    problem_dataframe=dataframe,  # its index labels and "problem" column map result IDs to problem names
    verbose=True
)

[LOG] Created mapping for 671 problem IDs to problem names
[LOG] Found 27 local stats files in .
[LOG] Processing 27 local model result files
[LOG] Processing results_deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/2025-05-07_10-37-40/stats/individual_stats.json for model DeepSeek-R1-Distill-Qwen-7B
[LOG] Processing 333 items in small category
[LOG] Processing 333 items in medium category
[LOG] Processing 5 items in large category
[LOG] Processing results_deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/2025-05-06_17-50-14/stats/individual_stats.json for model DeepSeek-R1-Distill-Qwen-7B
[LOG] Processing 333 items in small category
[LOG] Processing 333 items in medium category
[LOG] Processing 5 items in large category
[LOG] Processing results_deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/2025-05-06_17-23-57/stats/individual_stats.json for model DeepSeek-R1-Distill-Qwen-7B
[LOG] Processing 333 items in small category
[LOG] Processing 333 items in medium category
[LOG] Processing 5 items in large category
[LO