### Feature-Error Relationship Analysis

This notebook identifies which equation features are associated with specific MIRA extraction errors using:
- Distance Correlation for detecting non-linear associations
- Chi-square/Fisher's Tests for categorical feature-error relationships
- Propensity Score Matching for establishing causality
- Benjamini-Hochberg False Discovery Rate (FDR) for multiple testing correction
 

Input: Features and categorized errors from error_analysis.ipynb

Output: Ranked feature-error associations with statistical significance

> tbcorrected

In [25]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, fisher_exact
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import normalized_mutual_info_score
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
import json
import glob
import os

In [26]:
ALPHA = 0.05
OUTPUT_DIR = './feature_error_results'

Creating features dataframes

In [27]:
#load features data
features_df = pd.read_csv('test_features.csv')
print(f"Features loaded: {features_df.shape}")
print(f"Models in features: {features_df['model'].unique()}")

# Step 2: Create function to load all categorization files
def load_all_categorizations(pattern='*_categorization.json'):
    """
    Load all categorization JSON files and create errors dataframe
    """
    categorizations = {}
    
    # If pattern doesn't find files, try specific file
    json_files = glob.glob(pattern)
    if not json_files:
        json_files = ['test_categorization.json']
    
    for json_file in json_files:
        try:
            with open(json_file, 'r') as f:
                data = json.load(f)
                model = data['model']
                categorizations[model] = data
        except Exception as e:
            print(f"Error loading {json_file}: {e}")
    
    return categorizations

# Step 3: Convert categorizations to errors dataframe
def create_errors_dataframe(categorizations_dict, features_df):
    """
    Create binary errors dataframe from categorization results
    """
    # Define all possible error types from MIRA
    error_types = [
        'symbol_recognition',
        'subscript_superscript',
        'structural_corruption',
        'derivative_notation',
        'boundary_initial_conditions',
        'operator_errors',
        'completeness_errors',
        'formatting_errors'
    ]
    
    errors_data = []
    
    # Process each model in features
    for model in features_df['model'].unique():
        error_row = {'model': model}
        
        if model in categorizations_dict:
            cat_data = categorizations_dict[model]
            errors_by_cat = cat_data['error_distribution']['by_category']
            
            # Set binary values for each error type
            for error_type in error_types:
                # 1 if error exists, 0 otherwise
                error_row[error_type] = 1 if errors_by_cat.get(error_type, 0) > 0 else 0
            
            # Also store severity for additional analysis
            error_row['overall_severity'] = cat_data.get('overall_severity', 'none')
            error_row['extraction_quality'] = cat_data.get('extraction_quality_score', 100)
            
        else:
            # Model not found in categorizations - assume no errors
            print(f"Warning: Model {model} not found in categorizations")
            for error_type in error_types:
                error_row[error_type] = 0
            error_row['overall_severity'] = 'none'
            error_row['extraction_quality'] = 100
        
        errors_data.append(error_row)
    
    return pd.DataFrame(errors_data)

# Step 4: Load and process data
categorizations = load_all_categorizations()
print(f"\nLoaded categorizations for {len(categorizations)} models")

errors_df = create_errors_dataframe(categorizations, features_df)
print(f"\nErrors dataframe created: {errors_df.shape}")

# Step 5: Prepare data for analysis
# Remove non-feature columns from features_df
feature_cols = [col for col in features_df.columns if col not in ['model', 'original_equation']]
analysis_features_df = features_df[feature_cols]

# Get only error columns for analysis (exclude model, severity, quality)
error_cols = [col for col in errors_df.columns if col not in ['model', 'overall_severity', 'extraction_quality']]
analysis_errors_df = errors_df[error_cols]

print(f"\nPrepared for analysis:")
print(f"Features: {analysis_features_df.shape} - {len(feature_cols)} features")
print(f"Errors: {analysis_errors_df.shape} - {len(error_cols)} error types")

# Step 6: Check data
print("\nData Summary:")
print(f"Total models: {len(features_df)}")
print(f"Models with errors: {(analysis_errors_df.sum(axis=1) > 0).sum()}")
print(f"\nError type frequencies:")
print(analysis_errors_df.sum())

# Step 7: Handle case where all extractions are perfect
if analysis_errors_df.sum().sum() == 0:
    print("\n" + "="*60)
    print("WARNING: All extractions are perfect (no errors found)")
    print("="*60)
    print("\nFor meaningful feature-error analysis, you need models with errors.")
    print("Options:")
    print("1. Run error analysis on more models that have extraction errors")
    print("2. Use synthetic error data for testing the analysis pipeline")
    
    # Create synthetic errors for demonstration
    create_synthetic = input("\nCreate synthetic errors for demonstration? (y/n): ")
    
    if create_synthetic.lower() == 'y':
        import numpy as np
        
        # Create synthetic errors based on feature complexity
        for idx, row in features_df.iterrows():
            # Higher complexity → more likely errors
            if row['overall_complexity_score'] > 6:
                analysis_errors_df.loc[idx, 'symbol_recognition'] = 1
            if row['has_nested_subscripts']:
                analysis_errors_df.loc[idx, 'subscript_superscript'] = 1
            if row['unicode_ratio'] > 0.2:
                analysis_errors_df.loc[idx, 'operator_errors'] = 1
            if row['mixed_derivative_notation']:
                analysis_errors_df.loc[idx, 'derivative_

SyntaxError: unterminated string literal (detected at line 130) (2735612189.py, line 130)

#### Distance correlation

In [None]:
def distance_correlation(x, y):

    x = np.atleast_1d(x)
    y = np.atleast_1d(y)
    
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")
    
    n = len(x)
    if n < 2:
        return 0
    
    # Center the distance matrices
    x = x.reshape(-1, 1) if x.ndim == 1 else x
    y = y.reshape(-1, 1) if y.ndim == 1 else y
    
    # Compute distance matrices
    a = squareform(pdist(x))
    b = squareform(pdist(y))
    
    # Double center the distance matrices
    a_mean_rows = a.mean(axis=1, keepdims=True)
    a_mean_cols = a.mean(axis=0, keepdims=True)
    a_mean_all = a.mean()
    a_centered = a - a_mean_rows - a_mean_cols + a_mean_all
    
    b_mean_rows = b.mean(axis=1, keepdims=True)
    b_mean_cols = b.mean(axis=0, keepdims=True)
    b_mean_all = b.mean()
    b_centered = b - b_mean_rows - b_mean_cols + b_mean_all
    
    # Calculate distance covariance and variances
    dcov_squared = (a_centered * b_centered).sum() / (n * n)
    dvar_x_squared = (a_centered * a_centered).sum() / (n * n)
    dvar_y_squared = (b_centered * b_centered).sum() / (n * n)
    
    # Calculate distance correlation
    if dvar_x_squared * dvar_y_squared > 0:
        dcor = np.sqrt(dcov_squared) / np.sqrt(np.sqrt(dvar_x_squared) * np.sqrt(dvar_y_squared))
    else:
        dcor = 0
    
    return min(max(dcor, 0), 1)

In [None]:
def analyze_distance_correlations(features_df, errors_df):
    """
    Distance Correlation for detecting non-linear associations
    """
    results = []
    
    numeric_features = features_df.select_dtypes(include=[np.number]).columns
    
    for feature_col in numeric_features:
        for error_col in errors_df.columns:
            feature_data = features_df[feature_col].values
            error_data = errors_df[error_col].astype(int).values
            
            # Skip if no variance
            if np.std(feature_data) == 0 or np.std(error_data) == 0:
                continue
            
            # Calculate distance correlation
            dcor = distance_correlation(feature_data, error_data)
            
            results.append({
                'feature': feature_col,
                'error_type': error_col,
                'method': 'distance_correlation',
                'statistic': dcor,
                'interpretation': 'strength of non-linear association (0-1)'
            })
    
    return pd.DataFrame(results)

### Multiple methods to capture different types of relationships between features and errors  (using sklearn)

In [None]:
def comprehensive_sklearn_analysis(features_df, errors_df):

    all_results = []
    
    # 1. Mutual Information (non-linear relationships)
    print("1. Mutual Information Analysis...")
    mi_results = analyze_mutual_info_sklearn(features_df, errors_df)
    all_results.append(mi_results)
    
    # 2. Random Forest (captures interactions automatically)
    print("2. Random Forest Importance...")
    rf_results = analyze_rf_importance(features_df, errors_df)
    all_results.append(rf_results)
    
    # 3. Permutation Importance (model-agnostic)
    print("3. Permutation Importance...")
    perm_results = analyze_permutation_importance(features_df, errors_df)
    all_results.append(perm_results)
    
    # Combine all
    combined = pd.concat(all_results, ignore_index=True)
    
    # Create ensemble score (average rank across methods)
    combined['rank'] = combined.groupby(['method'])['statistic'].rank(ascending=False)
    ensemble_scores = combined.groupby(['feature', 'error_type'])['rank'].mean()
    
    return combined, ensemble_scores

# plot
def visualize_method_comparison(results_df):
    """
    Compare different methods' findings
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # pivot for heatmap
    pivot = results_df.pivot_table(
        index='feature',
        columns='method',
        values='statistic',
        aggfunc='mean'
    )
    
    # normalize each method to 0-1 for comparison
    pivot_norm = (pivot - pivot.min()) / (pivot.max() - pivot.min())
    
    sns.heatmap(pivot_norm, annot=True, fmt='.2f', cmap='viridis', ax=ax)
    ax.set_title('Feature Importance Across Different Methods (Normalized)')
    
    return fig

results, ensemble_scores = comprehensive_sklearn_analysis(features_df, errors_df)

# DISPLAY RESULTS
print("\n" + "="*60)
print("TOP FEATURE-ERROR RELATIONSHIPS (by ensemble score)")
print("="*60)

# Show top 15 relationships
top_relationships = ensemble_scores.sort_values().head(15)
for (feature, error), rank in top_relationships.items():
    print(f"{feature} → {error}: average rank = {rank:.2f}")

# Show top findings by each method
print("\n" + "="*60)
print("TOP FINDINGS BY METHOD")
print("="*60)

for method in results['method'].unique():
    print(f"\nTop 5 for {method}:")
    method_results = results[results['method'] == method].nlargest(5, 'statistic')
    for _, row in method_results.iterrows():
        print(f"  {row['feature']} → {row['error_type']}: {row['statistic']:.4f}")

# SHOW THE PLOT
fig = visualize_method_comparison(results)
plt.show()

# Additional summary statistics
print("\n" + "="*60)
print("SUMMARY STATISTICS")
print("="*60)
print(f"Total feature-error pairs analyzed: {len(results) // 3}")  # divided by 3 methods
print(f"Average importance across all methods: {results['statistic'].mean():.4f}")
print(f"Methods analyzed: {', '.join(results['method'].unique())}")

NameError: name 'features_df' is not defined