In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import json
import os
import numpy as np
from load_and_format_datasets import load_and_format_dataset
import re

LOG_FILENAME = "analysis_log_logres_dg_gpqa.txt"


def log_output(message_string, print_to_console=False):
    """Append one message to the analysis log file, optionally echoing it.

    Args:
        message_string: Object to record. Its ``str()`` form is written to
            LOG_FILENAME followed by a newline.
        print_to_console: When true, the message is also printed to stdout.
    """
    # Opened in append mode on every call, so earlier entries are preserved.
    with open(LOG_FILENAME, mode='a', encoding='utf-8') as log_file:
        log_file.write("".join((str(message_string), "\n")))
    if not print_to_console:
        return
    print(message_string)

# One regex per reported metric, keyed by the metric's display name. Each
# pattern allows leading whitespace and captures the rest of the line that
# follows the metric's label.
LOG_METRIC_PATTERNS = {
    "Delegation to teammate occurred": re.compile(
        r"^\s*Delegation to teammate occurred in (.*)$"
    ),
    "Phase 1 self-accuracy (from completed results, total - phase2)": re.compile(
        r"^\s*Phase 1 self-accuracy \(from completed results, total - phase2\): (.*)$"
    ),
    "Phase 2 self-accuracy": re.compile(
        r"^\s*Phase 2 self-accuracy: (.*)$"
    ),
    "Statistical test (P2 self vs P1)": re.compile(
        r"^\s*Statistical test \(P2 self vs P1\): (.*)$"
    ),
}

# Metric names in reporting order; derived from the pattern table so the two
# can never drift apart (dicts keep insertion order).
LOG_METRICS_TO_EXTRACT = list(LOG_METRIC_PATTERNS)

def extract_log_file_metrics(log_filepath):
    """Scan a .log file for the metrics listed in LOG_METRICS_TO_EXTRACT.

    Every line is tried against each regex in LOG_METRIC_PATTERNS; on a hit
    the stripped capture is stored under that metric's name, so a later
    matching line replaces an earlier value. Reading stops as soon as no
    metric is left at its placeholder.

    Args:
        log_filepath: Path of the log file to read.

    Returns:
        dict mapping each metric name to the extracted text. Metrics with no
        matching line keep the placeholder "Not found". If the file is missing
        or reading fails, a message is printed and whatever was collected up
        to that point is returned.
    """
    placeholder = "Not found"
    found = dict.fromkeys(LOG_METRICS_TO_EXTRACT, placeholder)
    try:
        with open(log_filepath, 'r') as log_file:
            for raw_line in log_file:
                for name, regex in LOG_METRIC_PATTERNS.items():
                    hit = regex.match(raw_line)
                    if not hit:
                        continue
                    found[name] = hit.group(1).strip()
                    # Early exit once every metric has been filled in.
                    if placeholder not in found.values():
                        return found
    except FileNotFoundError:
        print(f"Warning: Log file not found: {log_filepath}")
    except Exception as e:
        print(f"An error occurred while reading log file {log_filepath}: {e}")
    return found

def get_average_word_length(question_text):
    """Return the mean number of characters per word in the question.

    The text is lower-cased and split into runs of word characters (letters,
    digits, underscore). Non-string input, or text containing no word at all,
    yields 0.
    """
    if isinstance(question_text, str):
        tokens = re.findall(r'\b\w+\b', question_text.lower())
        if tokens:
            return sum(map(len, tokens)) / len(tokens)
    return 0

def get_percent_non_alphabetic_whitespace(question_text):
    """
    Return the percentage (0-100) of characters in the question text that are
    neither ASCII letters (a-z, A-Z) nor whitespace.

    Digits, punctuation, symbols and non-ASCII letters all count toward the
    numerator; the denominator is the full length of the text. Non-string or
    empty input yields 0.
    """
    if isinstance(question_text, str) and question_text:
        flagged = sum(1 for _ in re.finditer(r'[^a-zA-Z\s]', question_text))
        return (flagged / len(question_text)) * 100
    return 0

def get_s_i_from_capabilities_map(q_id, capabilities_s_i_map):
    """
    Fetch the S_i entry stored for a question in the capabilities map.

    S_i is intended to be 1 if the model knew the answer in the capabilities
    test and 0 if not. Returns None when the map has no entry for ``q_id``.
    """
    s_i = capabilities_s_i_map.get(q_id, None)
    return s_i


def prepare_regression_data_for_model(game_file_path, 
                                      gpqa_feature_lookup, 
                                      capabilities_s_i_map_for_model):
    """
    Build the per-trial regression table for a single model's game file.

    Only phase-2 trials are used. A trial contributes a row when it has a
    question id and a delegation choice, the question is present in
    ``gpqa_feature_lookup`` with a non-None difficulty, and the model has an
    S_i entry for that question. All other trials are skipped.

    Args:
        game_file_path (str): Path to the _game_data.json file.
        gpqa_feature_lookup (dict): Maps q_id to a feature dict with
            'difficulty', 'domain', 'q_text' and optionally 'overlap_ratio'.
        capabilities_s_i_map_for_model (dict): Maps q_id to S_i (0 or 1) for
            THIS model. This map should be from the model's specific
            _phase1_completed.json (capabilities) file.

    Returns:
        tuple: ``(DataFrame, phase1_subject_feedback)`` when at least one
        trial is usable, otherwise ``(None, None)`` (file could not be
        loaded, no phase-2 trials, or no usable trial). A 2-tuple is returned
        on every path so callers can always unpack the result.
        ``phase1_subject_feedback`` is None when the game file carries no
        such setting.
    """
    try:
        with open(game_file_path, 'r', encoding='utf-8') as f:
            game_data = json.load(f)
    except Exception as e:
        print(f"Error loading game file {game_file_path}: {e}")
        return None, None

    # Only used for reporting, so a missing setting should not abort the run.
    feedback_config = game_data.get("feedback_config") or {}
    phase1_subject_feedback = feedback_config.get("phase1_subject_feedback")

    phase2_trials = [t for t in game_data.get("results", []) if t.get('phase') == 2]
    if not phase2_trials:
        return None, None

    regression_data = []
    for trial in phase2_trials:
        q_id = trial.get("question_id")
        delegation_choice_str = trial.get("delegation_choice")
        if not q_id or not delegation_choice_str:
            continue

        # Validate the lookups BEFORE reading any feature: the question may be
        # absent from the feature table or from this model's capability map.
        gpqa_features = gpqa_feature_lookup.get(q_id)
        if not gpqa_features or gpqa_features.get('difficulty') is None:
            continue
        s_i_capability = capabilities_s_i_map_for_model.get(q_id) # S_i specific to this model
        if s_i_capability is None:
            continue

        # `or` also covers keys that are present but hold None.
        q_text = gpqa_features.get('q_text') or ''
        domain = (gpqa_features.get('domain') or 'unknown').replace(' ', '_').lower()

        regression_data.append({
            'delegate_choice': 1 if delegation_choice_str == "Teammate" else 0,
            's_i_capability': s_i_capability,
            'human_difficulty': gpqa_features['difficulty'],
            'q_length': len(q_text),
            # Domain is collapsed to a two-level factor.
            'domain': ("Biology" if domain == "biology" else "NonBiology"),
            'overlap_ratio': gpqa_features.get('overlap_ratio', 0),
            'avg_word_length': get_average_word_length(q_text),
            'percent_non_alphabetic_whitespace': get_percent_non_alphabetic_whitespace(q_text),
            # Add other surface features here if you want
        })

    if not regression_data:
        return None, None
    return pd.DataFrame(regression_data), phase1_subject_feedback

# --- Main Analysis Logic ---

# 1. Load GPQA data once for features (difficulty, domain, question text for length)
print("Loading main GPQA dataset for features...")
# Each item is read below via the keys: id, question, high_level_domain,
# difficulty_score (required) and overlap_ratio (optional).
gpqa_all_questions = load_and_format_dataset("GPQA")

# Nothing can be analysed without question features. This also covers a loader
# that signals failure by returning None (assumption - confirm against
# load_and_format_dataset).
if not gpqa_all_questions:
    print("Error: GPQA dataset could not be loaded or is empty.")
    raise SystemExit(1)

# q_id -> per-question features used as regression covariates.
gpqa_feature_lookup = {
    item['id']: {
        'overlap_ratio': item.get('overlap_ratio', 0),
        'difficulty': item['difficulty_score'],
        'domain': item['high_level_domain'],
        'q_text': item['question']
    } for item in gpqa_all_questions
}
print(f"GPQA feature lookup created with {len(gpqa_feature_lookup)} entries.")


# 2. Specify directories
game_logs_dir = "./delegate_game_logs/"       # Where your _game_data.json files are
capabilities_dir = "./completed_results_gpqa/" # Where your _phase1_completed.json files are

if not os.path.isdir(game_logs_dir) or not os.path.isdir(capabilities_dir):
    print(f"Error: Ensure directories exist: {game_logs_dir}, {capabilities_dir}")
    # `exit()` is an interactive-shell helper injected by the `site` module;
    # raising SystemExit works everywhere and reports a failing status.
    raise SystemExit(1)

# 3. Iterate through game log files
# For every GPQA game file: load the model's capability (S_i) map, build the
# per-trial regression table, then fit and log a series of logistic models of
# the delegation decision.
for game_filename in sorted(os.listdir(game_logs_dir)):
    # Only GPQA game-data files are analysed; everything else is ignored.
    if not (game_filename.endswith("_game_data.json") and "_GPQA_" in game_filename):
        continue

    # Derive capabilities filename (assuming a consistent naming pattern)
    # E.g., "modelname_GPQA_params_game_data.json" -> "modelname_phase1_completed.json"
    # This needs to match your actual naming convention.
    # Example: if game_filename is "claude-3-opus..._GPQA_100_100_team0.6_12345_game_data.json"
    # We need to extract "claude-3-opus..." part.
    model_name_part = game_filename.split("_GPQA_")[0]
    capabilities_filename = f"{model_name_part}_phase1_completed.json"
    capabilities_file_path = os.path.join(capabilities_dir, capabilities_filename)

    if not os.path.exists(capabilities_file_path):
        print(f"  Corresponding capabilities file not found: {capabilities_file_path}. Skipping model.")
        continue

    # Load S_i data for this specific model from its capabilities file.
    # Questions whose "is_correct" is None are left out of the map.
    s_i_map_for_this_model = {}
    try:
        with open(capabilities_file_path, 'r', encoding='utf-8') as f_cap:
            cap_data = json.load(f_cap)
        for q_id, res_info in cap_data.get("results", {}).items():
            if res_info.get("is_correct") is not None:
                s_i_map_for_this_model[q_id] = 1 if res_info["is_correct"] else 0
    except Exception as e:
        print(f"  Error loading capabilities file {capabilities_file_path}: {e}. Skipping model.")
        continue

    if not s_i_map_for_this_model:
        print(f"  No S_i data loaded from {capabilities_file_path}. Skipping model.")
        continue

    # Prepare data for this model's game
    game_file_path = os.path.join(game_logs_dir, game_filename)
    prepared = prepare_regression_data_for_model(game_file_path,
                                                 gpqa_feature_lookup,
                                                 s_i_map_for_this_model)
    # Accept either a bare None or a (DataFrame, feedback) pair so that a file
    # with nothing usable is skipped instead of failing on unpacking.
    df_model, phase1_subject_feedback = prepared if prepared is not None else (None, None)

    if df_model is None or df_model.empty:
        print("  No data for regression analysis for this file.")
        continue

    log_output(f"\n--- Analyzing Model from Game File: {game_filename} (feedback={phase1_subject_feedback}) ---", print_to_console=True)
    # The run's text log sits next to the game file with a .log suffix.
    log_metrics_dict = extract_log_file_metrics(game_file_path.replace("_game_data.json", ".log"))
    for metric, value in log_metrics_dict.items():
        log_output(f"  {metric}: {value}")

    # Run Logistic Regressions
    try:
        # Models 1-3: capability alone, difficulty alone, then both together.
        baseline_models = [
            ("\n  Model 1: Delegate_Choice ~ S_i_capability",
             'delegate_choice ~ s_i_capability'),
            ("\n  Model 2: Delegate_Choice ~ human_difficulty",
             'delegate_choice ~ human_difficulty'),
            ("\n  Model 3: Delegate_Choice ~ S_i_capability + human_difficulty",
             'delegate_choice ~ s_i_capability + human_difficulty'),
        ]
        for model_label, model_formula in baseline_models:
            log_output(model_label)
            log_output(smf.logit(model_formula, data=df_model).fit(disp=0).summary())

        # Optional: Full model with controls like q_length and domain
        # Ensure domain has enough categories and data points
        if df_model['domain'].nunique() > 1 and len(df_model) > 20: # Heuristic checks
            model_def_str = 'delegate_choice ~ s_i_capability + human_difficulty + q_length + C(domain) + overlap_ratio + avg_word_length + percent_non_alphabetic_whitespace'
            # Upper-case only the first letter: str.capitalize() would also
            # lower-case the rest and misreport "C(domain)" as "c(domain)".
            model_def_label = model_def_str[0].upper() + model_def_str[1:]
            log_output(f"\n  Model 4: {model_def_label}")
            try:
                logit_model4 = smf.logit(model_def_str, data=df_model).fit(disp=0)
                log_output(logit_model4.summary())
                coef_s_i = logit_model4.params['s_i_capability']
                pval_s_i = logit_model4.pvalues['s_i_capability']
                conf_int_s_i_log_odds = logit_model4.conf_int().loc['s_i_capability']
                # The coefficient is the log-odds of delegating for S_i=1 vs
                # S_i=0; negating it gives S_i=0 vs S_i=1, which also swaps
                # the confidence bounds.
                odds_ratio_delegate_Si0_vs_Si1 = np.exp(-coef_s_i)
                ci_lower_or = np.exp(-conf_int_s_i_log_odds.iloc[1]) # from the upper bound of the coef CI
                ci_upper_or = np.exp(-conf_int_s_i_log_odds.iloc[0]) # from the lower bound of the coef CI
                log_output("\n--- Odds Ratio for S_i_capability on Delegation (Adjusted) ---")
                log_output(f"P-value for s_i_capability: {pval_s_i:.4g}")
                log_output(f"Odds Ratio (Delegating when S_i=0 vs. S_i=1): {odds_ratio_delegate_Si0_vs_Si1:.4f}")
                log_output(f"95% CI for this Odds Ratio: [{ci_lower_or:.4f}, {ci_upper_or:.4f}]")
            except Exception as e_full:
                log_output(f"    Could not fit full model: {e_full}") # E.g. perfect separation from domain
        else:
            log_output("\n  Skipping Model 4 (full controls) due to insufficient domain variance or data points.", print_to_console=True)

    except Exception as e:
        # Record the failure in the log file too, so a missing regression
        # section there is explained.
        log_output(f"  Error during logistic regression for {game_filename}: {e}", print_to_console=True)

    print("-" * 40)

Loading main GPQA dataset for features...
Attempting to load GPQA (train split)...


Using the latest cached version of the dataset since Idavidrein/gpqa couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'gpqa_main' at /Users/christopherackerman/.cache/huggingface/datasets/Idavidrein___gpqa/gpqa_main/0.0.0/90b8e5be2b1d3d2dbfe016cdab47981150600c4a (last modified on Tue May 20 13:25:45 2025).


GPQA Dataset loaded successfully.
Formatting 448 questions from GPQA...
Successfully formatted 447 unique questions from GPQA.
GPQA feature lookup created with 447 entries.

--- Analyzing Model from Game File: claude-3-5-sonnet-20241022_GPQA_100_100_team0.6_1747406864_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: claude-3-5-sonnet-20241022_GPQA_100_100_team0.6_1747407886_game_data.json (feedback=True) ---
----------------------------------------

--- Analyzing Model from Game File: claude-3-5-sonnet-20241022_GPQA_100_100_team0.6_temp0.0_nobio_1747770061_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: claude-3-5-sonnet-20241022_GPQA_50_100_team0.65_1747405864_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: claude-3-5-sonnet-20241022_GPQA_50_100_team0.6_1747406304_game_data.json (feedback=Fa




--- Analyzing Model from Game File: gemini-2.0-flash-001_GPQA_50_200_team0.5_temp0.0_noctr_1747791168_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: gemini-2.0-flash-001_GPQA_50_200_team0.5_temp0.0_noctr_nobio_1747790761_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: gemini-2.0-flash-001_GPQA_50_200_team0.5_temp0.0_noctr_nobio_noeasy_1747790955_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: gemini-2.0-flash-001_GPQA_50_200_team0.5_temp0.0_noctr_noeasy_1747791410_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: gemini-2.0-flash-001_GPQA_50_300_team0.6_1747420101_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: gemini-2.0-flash-001_GPQA_50_300_team0.6_1747420378

  ci_upper_or = np.exp(-conf_int_s_i_log_odds.iloc[0]) # Exponentiate the negative of the lower bound of original coef CI


----------------------------------------

--- Analyzing Model from Game File: gpt-4-turbo-2024-04-09_GPQA_50_200_team0.6_1747424300_game_data.json (feedback=True) ---
----------------------------------------

--- Analyzing Model from Game File: gpt-4o-2024-08-06_GPQA_50_200_team0.6_1747674995_game_data.json (feedback=True) ---
----------------------------------------

--- Analyzing Model from Game File: gpt-4o-2024-08-06_GPQA_50_200_team0.6_1747675242_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: grok-3-latest_GPQA_50_200_team0.75_1747442192_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: grok-3-latest_GPQA_50_200_team0.7_1747441815_game_data.json (feedback=False) ---
----------------------------------------

--- Analyzing Model from Game File: grok-3-latest_GPQA_50_200_team0.8_1747443654_game_data.json (feedback=False) ---
------------------------------

