In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import json
import os
import numpy as np
from load_and_format_datasets import load_and_format_dataset
import re

LOG_FILENAME = "analysis_log_logres_dg_gpqa.txt"

def log_output(message_string, print_to_console=False):
    with open(LOG_FILENAME, 'a', encoding='utf-8') as f:
        f.write(str(message_string) + "\n")
    if print_to_console:
        print(message_string)

LOG_METRICS_TO_EXTRACT = [
    "Delegation to teammate occurred",
    "Phase 1 self-accuracy (from completed results, total - phase2)",
    "Phase 2 self-accuracy",
    "Statistical test (P2 self vs P1)"
]

LOG_METRIC_PATTERNS = {
    "Delegation to teammate occurred": re.compile(r"^\s*Delegation to teammate occurred in (.*)$"),
    "Phase 1 self-accuracy (from completed results, total - phase2)": re.compile(r"^\s*Phase 1 self-accuracy \(from completed results, total - phase2\): (.*)$"),
    "Phase 2 self-accuracy": re.compile(r"^\s*Phase 2 self-accuracy: (.*)$"),
    "Statistical test (P2 self vs P1)": re.compile(r"^\s*Statistical test \(P2 self vs P1\): (.*)$")
}

def extract_log_file_metrics(log_filepath):
    """Reads a .log file and extracts specified metrics."""
    extracted_log_metrics = {key: "Not found" for key in LOG_METRICS_TO_EXTRACT}
    try:
        with open(log_filepath, 'r') as f:
            for line in f:
                for metric_name, pattern in LOG_METRIC_PATTERNS.items():
                    match = pattern.match(line)
                    if match:
                        extracted_log_metrics[metric_name] = match.group(1).strip()
                        # Optimization: if all log metrics found, can break early
                        # This requires checking if all "Not found" have been replaced
                        if all(val != "Not found" for val in extracted_log_metrics.values()):
                            return extracted_log_metrics
    except FileNotFoundError:
        print(f"Warning: Log file not found: {log_filepath}")
        # Return dict with "Not found" for all log metrics
    except Exception as e:
        print(f"An error occurred while reading log file {log_filepath}: {e}")
        # Return dict with "Not found" for all log metrics
    return extracted_log_metrics

def get_average_word_length(question_text):
    """Calculates the average word length in the question."""
    if not isinstance(question_text, str):
        return 0
    words = re.findall(r'\b\w+\b', question_text.lower()) # Find all words
    if not words:
        return 0
    total_word_length = sum(len(word) for word in words)
    return total_word_length / len(words)

def get_percent_non_alphabetic_whitespace(question_text):
    """
    Calculates the percentage of characters in the question text that are
    not alphabetic, not numeric, and not whitespace.
    """
    if not isinstance(question_text, str) or len(question_text) == 0:
        return 0
    
    non_alphabetic_whitespace_chars = re.findall(r'[^a-zA-Z\s]', question_text)
    return (len(non_alphabetic_whitespace_chars) / len(question_text)) * 100

def prepare_regression_data_for_model(game_file_path, 
                                      gpqa_feature_lookup, 
                                      capabilities_s_i_map_for_model):
    """
    Prepares a DataFrame for a single model's game file.
    
    Args:
        game_file_path (str): Path to the _game_data.json file.
        gpqa_feature_lookup (dict): Maps q_id to {'difficulty': score, 'domain': str, 'q_text': str}.
        capabilities_s_i_map_for_model (dict): Maps q_id to S_i (0 or 1) for THIS model.
                                            This map should be from the model's specific 
                                            _phase1_completed.json (capabilities) file.
    Returns:
        pandas.DataFrame or None
    """
    try:
        with open(game_file_path, 'r', encoding='utf-8') as f:
            game_data = json.load(f)
    except Exception as e:
        print(f"Error loading game file {game_file_path}: {e}")
        return None

    phase1_subject_feedback = game_data["feedback_config"]["phase1_subject_feedback"]

    phase2_trials = [t for t in game_data.get("results", []) if t.get('phase') == 2]
    if not phase2_trials:
        return None

    regression_data = []
    for trial in phase2_trials:
        q_id = trial.get("question_id")
        delegation_choice_str = trial.get("delegation_choice")

        if not q_id or not delegation_choice_str:
            continue

        gpqa_features = gpqa_feature_lookup.get(q_id)
        s_i_capability = capabilities_s_i_map_for_model.get(q_id) # S_i specific to this model
        domain = gpqa_features.get('domain', 'unknown').replace(' ', '_').lower()

        if gpqa_features and gpqa_features.get('difficulty') is not None and s_i_capability is not None:
            delegate_choice_numeric = 1 if delegation_choice_str == "Teammate" else 0
            regression_data.append({
                'delegate_choice': delegate_choice_numeric,
                's_i_capability': s_i_capability,
                'human_difficulty': gpqa_features['difficulty'],
                'q_length': len(gpqa_features.get('q_text', '')),
                'domain': ("Biology" if domain == "biology" else "NonBiology"),
                'overlap_ratio': gpqa_features.get('overlap_ratio', 0),
                'avg_word_length': get_average_word_length(gpqa_features.get('q_text', '')),
                'percent_non_alphabetic_whitespace': get_percent_non_alphabetic_whitespace(gpqa_features.get('q_text', '')),
                # Add other surface features here if you want
            })
    
    if not regression_data:
        return None, None
    return pd.DataFrame(regression_data), phase1_subject_feedback

# --- Main Analysis Logic ---

# 1. Load GPQA data once for features (difficulty, domain, question text for length)
print("Loading main GPQA dataset for features...")
gpqa_all_questions = load_and_format_dataset("GPQA") # This should have id, Question, high_level_domain, difficulty_score

gpqa_feature_lookup = {
    item['id']: {
        'overlap_ratio': item.get('overlap_ratio', 0),
        'difficulty': item['difficulty_score'],
        'domain': item['high_level_domain'],
        'q_text': item['question']
    } for item in gpqa_all_questions
}
print(f"GPQA feature lookup created with {len(gpqa_feature_lookup)} entries.")


# 2. Specify directories
game_logs_dir = "./delegate_game_logs/"       # Where your _game_data.json files are
capabilities_dir = "./completed_results_gpqa/" # Where your _phase1_completed.json files are

if not os.path.isdir(game_logs_dir) or not os.path.isdir(capabilities_dir):
    print(f"Error: Ensure directories exist: {game_logs_dir}, {capabilities_dir}")
    exit()

# 3. Iterate through game log files
for game_filename in sorted(os.listdir(game_logs_dir)):
    if game_filename.endswith("_game_data.json") and "_GPQA_" in game_filename:
        
        # Derive capabilities filename (assuming a consistent naming pattern)
        # E.g., "modelname_GPQA_params_game_data.json" -> "modelname_phase1_completed.json"
        # This needs to match your actual naming convention.
        # Example: if game_filename is "claude-3-opus..._GPQA_100_100_team0.6_12345_game_data.json"
        # We need to extract "claude-3-opus..." part.
        model_name_part = game_filename.split("_GPQA_")[0]
        capabilities_filename = f"{model_name_part}_phase1_completed.json"
        capabilities_file_path = os.path.join(capabilities_dir, capabilities_filename)

        if not os.path.exists(capabilities_file_path):
            print(f"  Corresponding capabilities file not found: {capabilities_file_path}. Skipping model.")
            continue

        # Load S_i data for this specific model from its capabilities file
        s_i_map_for_this_model = {}
        try:
            with open(capabilities_file_path, 'r', encoding='utf-8') as f_cap:
                cap_data = json.load(f_cap)
            for q_id, res_info in cap_data.get("results", {}).items():
                if res_info.get("is_correct") is not None:
                    s_i_map_for_this_model[q_id] = 1 if res_info["is_correct"] else 0
        except Exception as e:
            print(f"  Error loading capabilities file {capabilities_file_path}: {e}. Skipping model.")
            continue
        
        if not s_i_map_for_this_model:
            print(f"  No S_i data loaded from {capabilities_file_path}. Skipping model.")
            continue

        # Prepare data for this model's game
        game_file_path = os.path.join(game_logs_dir, game_filename)
        df_model, phase1_subject_feedback = prepare_regression_data_for_model(game_file_path, 
                                                     gpqa_feature_lookup, 
                                                     s_i_map_for_this_model)

        if df_model is None or df_model.empty:
            print("  No data for regression analysis for this file.")
            continue
        
        log_output(f"\n--- Analyzing Model from Game File: {game_filename} (feedback={phase1_subject_feedback}) ---", print_to_console=True)
        log_metrics_dict = extract_log_file_metrics(game_file_path.replace("_game_data.json", ".log"))
        for metric, value in log_metrics_dict.items():
            log_output(f"  {metric}: {value}")

        # Run Logistic Regressions
        try:
            log_output("\n  Model 1: Delegate_Choice ~ S_i_capability")
            logit_model1 = smf.logit('delegate_choice ~ s_i_capability', data=df_model).fit(disp=0)
            log_output(logit_model1.summary())

            log_output("\n  Model 2: Delegate_Choice ~ human_difficulty")
            logit_model2 = smf.logit('delegate_choice ~ human_difficulty', data=df_model).fit(disp=0)
            log_output(logit_model2.summary())

            log_output("\n  Model 3: Delegate_Choice ~ S_i_capability + human_difficulty")
            logit_model3 = smf.logit('delegate_choice ~ s_i_capability + human_difficulty', data=df_model).fit(disp=0)
            log_output(logit_model3.summary())
            
            # Optional: Full model with controls like q_length and domain
            # Ensure domain has enough categories and data points
            if df_model['domain'].nunique() > 1 and len(df_model) > 20 : # Heuristic checks
                 model_def_str = 'delegate_choice ~ s_i_capability + human_difficulty + q_length + C(domain) + overlap_ratio + avg_word_length + percent_non_alphabetic_whitespace'
                 log_output(f"\n  Model 4: {model_def_str.capitalize()}")
                 try:
                    logit_model4 = smf.logit(model_def_str, data=df_model).fit(disp=0)
                    log_output(logit_model4.summary())
                    coef_s_i = logit_model4.params.get('s_i_capability')
                    pval_s_i = logit_model4.pvalues.get('s_i_capability')
                    conf_int_s_i_log_odds = logit_model4.conf_int().loc['s_i_capability']
                    odds_ratio_delegate_Si0_vs_Si1 = np.exp(-coef_s_i)
                    ci_lower_or = np.exp(-conf_int_s_i_log_odds.iloc[1]) # Exponentiate the negative of the upper bound of original coef CI
                    ci_upper_or = np.exp(-conf_int_s_i_log_odds.iloc[0]) # Exponentiate the negative of the lower bound of original coef CI    
                    log_output(f"\n--- Odds Ratio for S_i_capability on Delegation (Adjusted) ---")
                    log_output(f"P-value for s_i_capability: {pval_s_i:.4g}")
                    log_output(f"Odds Ratio (Delegating when S_i=0 vs. S_i=1): {odds_ratio_delegate_Si0_vs_Si1:.4f}")
                    log_output(f"95% CI for this Odds Ratio: [{ci_lower_or:.4f}, {ci_upper_or:.4f}]")
                 except Exception as e_full:
                     log_output(f"    Could not fit full model: {e_full}") # E.g. perfect separation from domain
            else:
                 log_output("\n  Skipping Model 4 (full controls) due to insufficient domain variance or data points.", print_to_console=True)


        except Exception as e:
            print(f"  Error during logistic regression for {game_filename}: {e}")
        
        print("-" * 40)

Loading main GPQA dataset for features...
Attempting to load GPQA (train split)...


Using the latest cached version of the dataset since Idavidrein/gpqa couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'gpqa_main' at /Users/christopherackerman/.cache/huggingface/datasets/Idavidrein___gpqa/gpqa_main/0.0.0/90b8e5be2b1d3d2dbfe016cdab47981150600c4a (last modified on Tue May 20 13:25:45 2025).


GPQA Dataset loaded successfully.
Formatting 448 questions from GPQA...
Successfully formatted 447 unique questions from GPQA.
GPQA feature lookup created with 447 entries.

--- Analyzing Model from Game File: claude-3-5-sonnet-20241022_GPQA_100_100_team0.6_1747406864_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: claude-3-5-sonnet-20241022_GPQA_100_100_team0.6_1747407886_game_data.json (feedback=True) ---
----------------------------------------

--- Analyzing Model from Game File: claude-3-5-sonnet-20241022_GPQA_100_100_team0.6_temp0.0_nobio_1747770061_game_data.json (feedback=False) ---

  Skipping Model 4 (full controls) due to insufficient domain variance or data points.
----------------------------------------

--- Analyzing Model from Game File: claude-3-5-sonnet-20241022_GPQA_50_100_team0.65_1747405864_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game Fil




  Skipping Model 4 (full controls) due to insufficient domain variance or data points.
----------------------------------------

--- Analyzing Model from Game File: gemini-2.0-flash-001_GPQA_50_200_team0.5_temp0.0_noctr_noeasy_1747791410_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: gemini-2.0-flash-001_GPQA_50_300_team0.6_1747420101_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: gemini-2.0-flash-001_GPQA_50_300_team0.6_1747420378_game_data.json (feedback=True) ---
----------------------------------------

--- Analyzing Model from Game File: gemini-2.0-flash-001_GPQA_50_300_team0.6_temp0.0_nobio_1747769347_game_data.json (feedback=False) ---

  Skipping Model 4 (full controls) due to insufficient domain variance or data points.
----------------------------------------

--- Analyzing Model from Game File: gpt-4-turbo-2024-04-09_GPQA_50_200_team0.6_17474



----------------------------------------

--- Analyzing Model from Game File: grok-3-latest_GPQA_50_200_team0.8_1747443654_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: grok-3-latest_GPQA_50_200_team0.8_1747449213_game_data.json (feedback=True) ---
----------------------------------------

--- Analyzing Model from Game File: grok-3-latest_GPQA_50_250_team0.8_1747448729_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: meta-llama-Meta-Llama-3.1-405B-Instruct_GPQA_50_100_team0.7_1747490845_game_data.json (feedback=True) ---
----------------------------------------


  ci_upper_or = np.exp(-conf_int_s_i_log_odds.iloc[0]) # Exponentiate the negative of the lower bound of original coef CI


In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import json
import os
import numpy as np
from load_and_format_datasets import load_and_format_dataset
import re
import pymc as pm
from patsy import dmatrices
from io import StringIO
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import log_loss
import numpy as np
import pandas as pd

def analyze_introspection_vs_heuristics(df_model, depvar, keyvar, potential_heuristics):
    """
    Compare capability-based, heuristic-based, and combined models
    
    Args:
        df_model: DataFrame with 'delegate_choice', 's_i_capability', and heuristic columns
        potential_heuristics: List of column names to consider as heuristics
    
    Returns:
        Dictionary with AIC values and comparisons
    """
    y = df_model[depvar]
    
    # Helper function to calculate AIC
    def calculate_aic(y_true, y_pred_proba, n_params):
        ll = -len(y_true) * log_loss(y_true, y_pred_proba)
        aic = -2 * ll + 2 * n_params
        return aic
    
    # 1. Capability-only model
    X_cap = df_model[[keyvar]]
    lr_capability = LogisticRegression(penalty=None)
    lr_capability.fit(X_cap, y)
    aic_capability = calculate_aic(y, lr_capability.predict_proba(X_cap)[:, 1], 
                                  X_cap.shape[1] + 1)  # +1 for intercept
    
    # 2. Heuristics model with LASSO selection
    X_all_heuristics = pd.get_dummies(df_model[potential_heuristics])
    #lasso = LogisticRegressionCV(penalty='l1', solver='liblinear', cv=5, random_state=42)
    lasso = LogisticRegressionCV(penalty='l1', solver='liblinear', cv=5, random_state=42, Cs=np.logspace(-2, 1, 10))
    lasso.fit(X_all_heuristics, y)
    
    # Get selected features (non-zero coefficients)
    selected_features = X_all_heuristics.columns[lasso.coef_[0] != 0].tolist()


    # Check if answer_type was properly one-hot encoded
    print(f"potential_heuristics: {potential_heuristics}")
    print("Columns created from answer_type_grouped:")
    print([col for col in X_all_heuristics.columns if 'answer_type' in col])

    print("\nDelegation rates by answer_type:")
    print(df_model.groupby('answer_type_grouped')['delegate_choice'].agg(['mean', 'count']))

    # Correct attribute for LogisticRegressionCV
    print("\nLASSO regularization C:", lasso.C_) 
    print("\nAll LASSO coefficients:")
    for feat, coef in zip(X_all_heuristics.columns, lasso.coef_[0]):
        if coef != 0:  # Only show non-zero coefficients
            print(f"{feat}: {coef:.4f}")

    if selected_features:
        X_heur = X_all_heuristics[selected_features]
        lr_heuristics = LogisticRegression(penalty=None)
        lr_heuristics.fit(X_heur, y)
        aic_heuristics = calculate_aic(y, lr_heuristics.predict_proba(X_heur)[:, 1], 
                                      X_heur.shape[1] + 1)
    else:
        # No heuristics selected - use intercept only
        aic_heuristics = calculate_aic(y, [y.mean()]*len(y), 1)
        selected_features = []
    
    # 3. Combined model
    if selected_features:
        X_combined = pd.concat([df_model[[keyvar]], X_heur], axis=1)
    else:
        X_combined = df_model[[keyvar]]
    
    lr_combined = LogisticRegression(penalty=None)
    lr_combined.fit(X_combined, y)
    aic_combined = calculate_aic(y, lr_combined.predict_proba(X_combined)[:, 1], 
                                X_combined.shape[1] + 1)
    
    # Calculate comparisons
    delta_heur_vs_cap = aic_capability - aic_heuristics  # Positive = heuristics better
    delta_comb_vs_heur = aic_heuristics - aic_combined   # Positive = combined better
    delta_comb_vs_cap = aic_capability - aic_combined    # Positive = combined better
    
    # Interpret differences
    def interpret_delta_aic(delta):
        if abs(delta) < 2:
            return "no meaningful difference"
        elif abs(delta) < 4:
            return "weak evidence"
        elif abs(delta) < 7:
            return "moderate evidence"
        elif abs(delta) < 10:
            return "strong evidence"
        else:
            return "very strong evidence"
    
    retstr = ""
    retstr+=f"Selected heuristics: {selected_features if selected_features else 'None'}\n"
    retstr+=f"\nAIC values:\n"
    retstr+=f"  Capability only: {aic_capability:.1f}\n"
    retstr+=f"  Heuristics only: {aic_heuristics:.1f}\n"
    retstr+=f"  Combined: {aic_combined:.1f}\n"
    
    retstr+=f"\nModel comparisons:\n"
    retstr+=f"  Heuristics vs Capability: ΔAIC = {delta_heur_vs_cap:.1f}\n"
    retstr+=f"    → {interpret_delta_aic(delta_heur_vs_cap)} that heuristics are {'better' if delta_heur_vs_cap > 0 else 'worse'}\n"
    
    retstr+=f"  Combined vs Heuristics: ΔAIC = {delta_comb_vs_heur:.1f}\n"
    retstr+=f"    → {interpret_delta_aic(delta_comb_vs_heur)} that capability adds value\n"
    
    retstr+=f"  Combined vs Capability: ΔAIC = {delta_comb_vs_cap:.1f}\n"
    retstr+=f"    → {interpret_delta_aic(delta_comb_vs_cap)} that combined is {'better' if delta_comb_vs_cap > 0 else 'worse'}\n"
    
    return {
        'aic_capability': aic_capability,
        'aic_heuristics': aic_heuristics,
        'aic_combined': aic_combined,
        'delta_heur_vs_cap': delta_heur_vs_cap,
        'delta_comb_vs_heur': delta_comb_vs_heur,
        'delta_comb_vs_cap': delta_comb_vs_cap,
        'selected_heuristics': selected_features
    }, retstr


class BayesianLogitResults:
    def __init__(self, formula, data, trace, X_columns, n_obs):
        self.formula = formula
        self.n_obs = n_obs
        self.trace = trace
        
        # Calculate summary statistics
        posterior = trace.posterior.beta.values.reshape(-1, len(X_columns))
        
        self.params = pd.Series([np.mean(posterior[:, i]) for i in range(len(X_columns))], 
                               index=X_columns)
        self.bse = pd.Series([np.std(posterior[:, i]) for i in range(len(X_columns))], 
                            index=X_columns)
        self.tvalues = self.params / self.bse
        
        # P-value equivalents and confidence intervals
        pvalues = []
        conf_int_data = []
        for i in range(len(X_columns)):
            samples = posterior[:, i]
            # CI
            ci_low, ci_high = np.percentile(samples, [2.5, 97.5])
            conf_int_data.append([ci_low, ci_high])
            # P-value equivalent
            if self.params.iloc[i] > 0:
                p = np.mean(samples <= 0) * 2
            else:
                p = np.mean(samples >= 0) * 2
            pvalues.append(min(p, 1.0))
        
        self.pvalues = pd.Series(pvalues, index=X_columns)
        self.conf_int_df = pd.DataFrame(conf_int_data, index=X_columns, 
                                       columns=['[0.025', '0.975]'])
    
    def summary(self):
        # Create summary DataFrame with exact statsmodels column names
        summary_df = pd.DataFrame({
            'coef': self.params,
            'std err': self.bse,
            'z': self.tvalues,
            'P>|z|': self.pvalues,
            '[0.025': self.conf_int_df['[0.025'],
            '0.975]': self.conf_int_df['0.975]']
        })
        
        # Create a custom object that mimics statsmodels Summary
        class BayesianSummary:
            def __init__(self, df, formula, n_obs):
                self.df = df
                self.formula = formula
                self.n_obs = n_obs
                
            def __str__(self):
                # Format exactly like statsmodels
                buffer = StringIO()
                buffer.write("                        Bayesian Logit Results                          \n")
                buffer.write("========================================================================\n")
                buffer.write(f"Dep. Variable:        {self.formula.split('~')[0].strip():<20} No. Observations:    {self.n_obs:>6}\n")
                buffer.write("Model:                Bayesian Logit    Method:              MCMC (PyMC)\n")
                buffer.write("========================================================================\n")
                
                # Use pandas string formatting with proper width control
                with pd.option_context('display.max_columns', None, 
                                     'display.width', 120,
                                     'display.float_format', '{:>10.4f}'.format):
                    buffer.write(self.df.to_string())
                
                buffer.write("\n========================================================================\n")
                buffer.write("Note: 'P>|z|' represents Bayesian equivalent of p-value")
                return buffer.getvalue()
            
            def __repr__(self):
                return self.__str__()
        
        return BayesianSummary(summary_df, self.formula, self.n_obs)

def bayesian_logit(formula, data, random_seed=42):
    # Create design matrix
    y, X = dmatrices(formula, data, return_type='dataframe')
    
    with pm.Model() as model:
        # Weakly informative priors
        beta = pm.Normal('beta', mu=0, sigma=2.5, shape=X.shape[1])
        
        # Linear combination
        eta = pm.math.dot(X, beta)
        
        # Likelihood
        y_obs = pm.Bernoulli('y_obs', p=pm.math.sigmoid(eta), observed=y.values.ravel())
        
        # Sample
        trace = pm.sample(2000, tune=1000, chains=2, cores=1, 
                         progressbar=False, return_inferencedata=True, random_seed=random_seed)
    
    return BayesianLogitResults(formula, data, trace, X.columns, len(data))

LOG_FILENAME = "analysis_log_logres_dg_sqa.txt"

def log_output(message_string, print_to_console=False):
    with open(LOG_FILENAME, 'a', encoding='utf-8') as f:
        f.write(str(message_string) + "\n")
    if print_to_console:
        print(message_string)

LOG_METRICS_TO_EXTRACT = [
    "Delegation to teammate occurred",
    "Phase 1 self-accuracy (from completed results, total - phase2)",
    "Phase 2 self-accuracy",
    "Statistical test (P2 self vs P1)"
]

LOG_METRIC_PATTERNS = {
    "Delegation to teammate occurred": re.compile(r"^\s*Delegation to teammate occurred in (.*)$"),
    "Phase 1 self-accuracy (from completed results, total - phase2)": re.compile(r"^\s*Phase 1 self-accuracy \(from completed results, total - phase2\): (.*)$"),
    "Phase 2 self-accuracy": re.compile(r"^\s*Phase 2 self-accuracy: (.*)$"),
    "Statistical test (P2 self vs P1)": re.compile(r"^\s*Statistical test \(P2 self vs P1\): (.*)$")
}

def extract_log_file_metrics(log_filepath):
    """Reads a .log file and extracts specified metrics."""
    extracted_log_metrics = {key: "Not found" for key in LOG_METRICS_TO_EXTRACT}
    try:
        with open(log_filepath, 'r') as f:
            for line in f:
                for metric_name, pattern in LOG_METRIC_PATTERNS.items():
                    match = pattern.match(line)
                    if match:
                        extracted_log_metrics[metric_name] = match.group(1).strip()
                        # Optimization: if all log metrics found, can break early
                        # This requires checking if all "Not found" have been replaced
                        if all(val != "Not found" for val in extracted_log_metrics.values()):
                            return extracted_log_metrics
    except FileNotFoundError:
        print(f"Warning: Log file not found: {log_filepath}")
        # Return dict with "Not found" for all log metrics
    except Exception as e:
        print(f"An error occurred while reading log file {log_filepath}: {e}")
        # Return dict with "Not found" for all log metrics
    return extracted_log_metrics

def get_average_word_length(question_text):
    """Calculates the average word length in the question."""
    if not isinstance(question_text, str):
        return 0
    words = re.findall(r'\b\w+\b', question_text.lower()) # Find all words
    if not words:
        return 0
    total_word_length = sum(len(word) for word in words)
    return total_word_length / len(words)

def get_percent_non_alphabetic_whitespace(question_text):
    """
    Calculates the percentage of characters in the question text that are
    not alphabetic, not numeric, and not whitespace.
    """
    if not isinstance(question_text, str) or len(question_text) == 0:
        return 0
    
    non_alphabetic_whitespace_chars = re.findall(r'[^a-zA-Z\s]', question_text)
    return (len(non_alphabetic_whitespace_chars) / len(question_text)) * 100


def prepare_regression_data_for_model(game_file_path, 
                                      sqa_feature_lookup, 
                                      capabilities_s_i_map_for_model):
    """
    Prepares a DataFrame for a single model's game file.
    
    Args:
        game_file_path (str): Path to the _game_data.json file.
        sqa_feature_lookup (dict): Maps q_id to {'difficulty': score, 'domain': str, 'q_text': str}.
        capabilities_s_i_map_for_model (dict): Maps q_id to S_i (0 or 1) for THIS model.
                                            This map should be from the model's specific 
                                            _phase1_completed.json (capabilities) file.
    Returns:
        pandas.DataFrame or None
    """
    try:
        with open(game_file_path, 'r', encoding='utf-8') as f:
            game_data = json.load(f)
    except Exception as e:
        print(f"Error loading game file {game_file_path}: {e}")
        return None

    phase1_subject_feedback = game_data["feedback_config"]["phase1_subject_feedback"]

    phase2_trials = [t for t in game_data.get("results", []) if t.get('phase') == 2]
    if not phase2_trials:
        return None

    regression_data = []
    for trial in phase2_trials:
        q_id = trial.get("question_id")
        delegation_choice_str = trial.get("delegation_choice")

        if not q_id or not delegation_choice_str:
            continue

        try:
            sqa_features = sqa_feature_lookup.get(q_id)
        except KeyError as e:
            print(f"Error: {e} - q_id {q_id} not found in SQA feature lookup.")
            continue
        s_i_capability = capabilities_s_i_map_for_model.get(q_id)

        if sqa_features and s_i_capability is not None:
            delegate_choice_numeric = 1 if delegation_choice_str == "Teammate" else 0
            
            regression_data.append({
                'q_id': q_id, # Ensure q_id is always in the trial data
                'delegate_choice': delegate_choice_numeric,
                's_i_capability': s_i_capability,
                'answer_type': sqa_features['answer_type'],
                'q_length': len(sqa_features.get('q_text', '')),
                'topic': sqa_features.get('topic', ''),
            })
        else:
            print(f"Warning: Missing S_i capability or SQA features for q_id {q_id}. Skipping this trial.")
            continue
    
    if not regression_data:
        return None, None
    return pd.DataFrame(regression_data), phase1_subject_feedback

# --- Main Analysis Logic ---

# 1. Load SimpleQA data once for features (difficulty, domain, question text for length)
print("Loading main SimpleQA dataset for features...")
sqa_all_questions = load_and_format_dataset("SimpleQA") # This should have id, Question, high_level_domain, difficulty_score

sqa_feature_lookup = {
    item['id']: {
        'answer_type': item.get('answer_type', 0),
        'topic': item['topic'],
        'q_text': item['question']
    } for item in sqa_all_questions
}
print(f"sqa feature lookup created with {len(sqa_feature_lookup)} entries.")


# 2. Specify directories
game_logs_dir = "./delegate_game_logs/"       # Where your _game_data.json files are
capabilities_dir = "./compiled_results_sqa/" # Where your _phase1_completed.json files are

if not os.path.isdir(game_logs_dir) or not os.path.isdir(capabilities_dir):
    print(f"Error: Ensure directories exist: {game_logs_dir}, {capabilities_dir}")
    exit()

# 3. Iterate through game log files
for game_filename in sorted(os.listdir(game_logs_dir)):
    if game_filename.endswith("_game_data_evaluated.json") and "_SimpleQA_" in game_filename:
        
        # Derive capabilities filename (assuming a consistent naming pattern)
        # E.g., "modelname_GPQA_params_game_data.json" -> "modelname_phase1_completed.json"
        # This needs to match your actual naming convention.
        # Example: if game_filename is "claude-3-opus..._GPQA_100_100_team0.6_12345_game_data.json"
        # We need to extract "claude-3-opus..." part.
        model_name_part = game_filename.split("_SimpleQA_")[0]
        capabilities_filename = f"{model_name_part}_phase1_compiled.json"
        capabilities_file_path = os.path.join(capabilities_dir, capabilities_filename)

        if not os.path.exists(capabilities_file_path):
            print(f"  Corresponding capabilities file not found: {capabilities_file_path}. Skipping model.")
            continue

        # Load S_i data for this specific model from its capabilities file
        s_i_map_for_this_model = {}
        try:
            with open(capabilities_file_path, 'r', encoding='utf-8') as f_cap:
                cap_data = json.load(f_cap)
            for q_id, res_info in cap_data.get("results", {}).items():
                if res_info.get("is_correct") is not None:
                    s_i_map_for_this_model[q_id] = 1 if res_info["is_correct"] else 0
        except Exception as e:
            print(f"  Error loading capabilities file {capabilities_file_path}: {e}. Skipping model.")
            continue
        
        if not s_i_map_for_this_model:
            print(f"  No S_i data loaded from {capabilities_file_path}. Skipping model.")
            continue

        # Prepare data for this model's game
        game_file_path = os.path.join(game_logs_dir, game_filename)
        df_model, phase1_subject_feedback = prepare_regression_data_for_model(game_file_path, 
                                                     sqa_feature_lookup, 
                                                     s_i_map_for_this_model)

        if df_model is None or df_model.empty:
            print("  No data for regression analysis for this file.")
            continue
        
        log_output(f"\n--- Analyzing Model from Game File: {game_filename} (feedback={phase1_subject_feedback}) ---", print_to_console=True)
        log_metrics_dict = extract_log_file_metrics(game_file_path.replace("_game_data_evaluated.json", ".log"))
        for metric, value in log_metrics_dict.items():
            log_output(f"  {metric}: {value}")

        # Run Logistic Regressions
        try:
            model_def_str = 'delegate_choice ~ s_i_capability'
            log_output(f"\n  Model 1: {model_def_str.capitalize()}")
            logit_model1 = bayesian_logit(model_def_str, df_model)#smf.logit('delegate_choice ~ s_i_capability', data=df_model).fit(disp=0)
            log_output(logit_model1.summary())


            min_obs_per_category=10
            topic_counts = df_model['topic'].value_counts()
            rare_topics = topic_counts[topic_counts < min_obs_per_category].index.tolist()

            if rare_topics: # Only create new column if there are rare topics
                df_model['topic_grouped'] = df_model['topic'].apply(lambda x: 'Misc' if x in rare_topics else x)
                topic_column_for_formula = 'topic_grouped'
                log_output(f"Grouped rare topics into 'Misc': {rare_topics}")
            else:
                df_model['topic_grouped'] = df_model['topic'] # No grouping needed, use original
                topic_column_for_formula = 'topic'

            ans_type_counts = df_model['answer_type'].value_counts()
            rare_ans_types = ans_type_counts[ans_type_counts < min_obs_per_category].index.tolist()

            if rare_ans_types:
                df_model['answer_type_grouped'] = df_model['answer_type'].apply(lambda x: 'Misc' if x in rare_ans_types else x)
                ans_type_column_for_formula = 'answer_type_grouped'
                log_output(f"Grouped rare answer types into 'Misc': {rare_ans_types}")
            else:
                df_model['answer_type_grouped'] = df_model['answer_type'] # No grouping needed
                ans_type_column_for_formula = 'answer_type'

            #log_output(f"Topic Grouped Counts:\n {df_model['topic_grouped'].value_counts()}")
            #log_output(f"Answer Type Grouped Counts:\n {df_model['answer_type_grouped'].value_counts()}")
            #cross_tab = pd.crosstab(df_model['topic_grouped'], df_model['answer_type_grouped'])
            #log_output("\nCross-tabulation of Topic Grouped vs. Answer Type Grouped:")
            #log_output(cross_tab)
            # Check correlations
            log_output("Capability by topic:")
            log_output(df_model.groupby('topic_grouped')['s_i_capability'].agg(['mean', 'std', 'count']))

            log_output("\nCapability by answer type:")  
            log_output(df_model.groupby('answer_type_grouped')['s_i_capability'].agg(['mean', 'std', 'count']))

            log_output("\nOutcome by topic and answer type:")
            log_output(pd.crosstab([df_model['topic_grouped'], df_model['answer_type_grouped']], 
                            df_model['delegate_choice'], normalize='index'))

            model_def_str = f'delegate_choice ~ s_i_capability + q_length + C({topic_column_for_formula})'
            log_output(f"\n  Model 2: {model_def_str.capitalize()}")
            logit_model2 = bayesian_logit(model_def_str, df_model)#smf.logit(model_def_str, data=df_model).fit(disp=0)
            log_output(logit_model2.summary())

            model_def_str = f'delegate_choice ~ s_i_capability + q_length + C({ans_type_column_for_formula})'
            log_output(f"\n  Model 3: {model_def_str.capitalize()}")
#            logit_model3 = smf.logit(model_def_str, data=df_model).fit(disp=0)
#            log_output(logit_model3.summary())
            logit_model3 = bayesian_logit(model_def_str, df_model)
            log_output(logit_model3.summary())  # This will print the nice summary

            model_def_str = f'delegate_choice ~ s_i_capability + q_length + C({topic_column_for_formula}) + C({ans_type_column_for_formula})'
            log_output(f"\n  Model 4: {model_def_str.capitalize()}")

            potential_heuristics = ['answer_type_grouped', 'topic_grouped', 'q_length']
            results, retstr = analyze_introspection_vs_heuristics(df_model, 'delegate_choice', 's_i_capability', potential_heuristics)
            log_output(f"\n {retstr}\n{results}")

            try:
                logit_model4 = bayesian_logit(model_def_str, df_model)#smf.logit(model_def_str, data=df_model).fit(disp=0)
                log_output(logit_model4.summary())

                model_def_str = f'''delegate_choice ~ s_i_capability + 
                                    s_i_capability:C({ans_type_column_for_formula}) + 
                                    s_i_capability:C({topic_column_for_formula}) + 
                                    s_i_capability:q_length + 
                                    C({ans_type_column_for_formula}) + 
                                    C({topic_column_for_formula}) + 
                                    q_length'''
                log_output(f"\n  Model 5: {model_def_str.capitalize()}")
                logit_model5 = bayesian_logit(model_def_str, df_model)#smf.logit(model_def_str, data=df_model).fit(disp=0)
                log_output(logit_model5.summary())
            except Exception as e_full:
                log_output(f"    Could not fit full model: {e_full}") # E.g. perfect separation from domain


        except Exception as e:
            print(f"  Error during logistic regression for {game_filename}: {e}")
        
        print("-" * 40)

In [12]:
df_number_type = df_model[df_model['answer_type_grouped'] == 'Number']

print("\nDelegation choices for Answer Type 'Number':")
print(df_number_type['delegate_choice'].value_counts(dropna=False))


Delegation choices for Answer Type 'Number':
Series([], Name: count, dtype: int64)


In [16]:
df_model['answer_type'].value_counts(dropna=False)

answer_type
Date      30
Person    23
Other     19
Number     9
Place      7
Name: count, dtype: int64

In [34]:
print("Loading main SimpleQA dataset for features...")
sqa_all_questions = load_and_format_dataset("SimpleQA") # This should have id, Question, high_level_domain, difficulty_score

sqa_feature_lookup = {
    item['id']: {
        'answer_type': item.get('answer_type', 0),
        'topic': item['topic'],
        'q_text': item['question']
    } for item in sqa_all_questions
}
print(f"sqa feature lookup created with {len(sqa_feature_lookup)} entries.")

capabilities_file_path = "./compiled_results_sqa/claude-3-5-sonnet-20241022_phase1_compiled.json"
with open(capabilities_file_path, 'r', encoding='utf-8') as f_cap:
    cap_data = json.load(f_cap)
regression_data = []
for q_id, res_info in cap_data.get("results", {}).items():
    s_i_capability = 1 if res_info["is_correct"] else 0
    sqa_features = sqa_feature_lookup.get(q_id)

    regression_data.append({
        'q_id': q_id, 
        's_i_capability': s_i_capability,
        'answer_type': sqa_features['answer_type'],
        'q_length': len(sqa_features.get('q_text', '')),
        'topic': sqa_features.get('topic', ''),
    })
df_model = pd.DataFrame(regression_data)

print("                Capability by topic:")
print(df_model.groupby('topic')['s_i_capability'].agg(['mean', 'std', 'count']))

print("\n                Capability by answer type:")
print(df_model.groupby('answer_type')['s_i_capability'].agg(['mean', 'std', 'count']))

print("                Q length by capability:")
print(df_model.groupby('s_i_capability')['q_length'].agg(['mean', 'std', 'count']))


Loading main SimpleQA dataset for features...
Attempting to load SimpleQA (test split)...
Dataset loaded successfully.
Formatting 4326 questions...
Successfully formatted 4326 unique questions from SimpleQA.
sqa feature lookup created with 4326 entries.
                Capability by topic:
                            mean       std  count
topic                                            
Art                     0.380000  0.490314     50
Geography               0.294118  0.462497     34
History                 0.562500  0.512348     16
Music                   0.280000  0.458258     25
Other                   0.333333  0.480384     27
Politics                0.490909  0.504525     55
Science and technology  0.328947  0.472953     76
Sports                  0.260870  0.448978     23
TV shows                0.333333  0.485071     18
Video games             0.266667  0.457738     15

                Capability by answer type:
                 mean       std  count
answer_type               