# FRED Exogenous Feature Selection

Analyze all FRED exogenous features (levels, month-over-month (MoM) diffs, and z-scores of those diffs) using the variance inflation factor (VIF) to determine which transformations provide unique explanatory power, and prune multicollinear features.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides pandas deprecation
# notices and any numerical warnings raised while computing VIFs.
warnings.filterwarnings(action='ignore')

# Widen pandas' text display so long feature tables print without truncation.
for _option_name, _option_value in (
    ('display.max_rows', 200),
    ('display.max_columns', 50),
    ('display.width', 200),
):
    pd.set_option(_option_name, _option_value)

## 1. Load 2025-12 Snapshot & Pivot to Wide Format

In [2]:
# Read the 2025-12 snapshot; the path is relative to the current working directory.
snap_path = Path('data/Exogenous_data/exogenous_fred_data/decades/2020s/2025/2025-12.parquet')
df = pd.read_parquet(snap_path)

# Quick structural overview of the raw frame.
print(f"Raw shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Unique series: {df['series_name'].nunique()}")

# List every distinct series name, one per line, in sorted order.
print(f"\nAll series names:")
_sorted_series = sorted(df['series_name'].unique())
if _sorted_series:
    print("\n".join(f"  {s}" for s in _sorted_series))

Raw shape: (83248, 6)
Columns: ['date', 'release_date', 'series_code', 'snapshot_date', 'value', 'series_name']
Unique series: 201

All series names:
  CCNSA_max_spike
  CCNSA_max_spike_diff
  CCNSA_max_spike_diff_zscore_12m
  CCNSA_max_spike_diff_zscore_3m
  CCNSA_monthly_avg
  CCNSA_monthly_avg_diff
  CCNSA_monthly_avg_diff_zscore_12m
  CCNSA_monthly_avg_diff_zscore_3m
  CCNSA_weeks_high
  CCNSA_weeks_high_diff
  CCNSA_weeks_high_diff_zscore_12m
  CCNSA_weeks_high_diff_zscore_3m
  CCSA_max_spike
  CCSA_max_spike_diff
  CCSA_max_spike_diff_zscore_12m
  CCSA_max_spike_diff_zscore_3m
  CCSA_monthly_avg
  CCSA_monthly_avg_diff
  CCSA_monthly_avg_diff_zscore_12m
  CCSA_monthly_avg_diff_zscore_3m
  CCSA_weeks_high
  CCSA_weeks_high_diff
  CCSA_weeks_high_diff_zscore_12m
  CCSA_weeks_high_diff_zscore_3m
  Credit_Spreads_accel_volatility
  Credit_Spreads_accel_volatility_diff
  Credit_Spreads_accel_volatility_diff_zscore_12m
  Credit_Spreads_accel_volatility_diff_zscore_3m
  Credit_Spreads_a

In [3]:
# Reshape to one row per date and one column per series.
# aggfunc='first' keeps the first value whenever a (date, series_name) pair repeats.
wide = (
    df
    .pivot_table(index='date', columns='series_name', values='value', aggfunc='first')
    .sort_index()
)

print(f"Wide shape: {wide.shape}")
print(f"Date range: {wide.index.min()} to {wide.index.max()}")

# Missing-value count per feature, sparsest features first.
print(f"\nNaN counts per feature (top 20):")
_nan_counts = wide.isna().sum()
print(_nan_counts.sort_values(ascending=False).head(20))

Wide shape: (595, 201)
Date range: 1976-06-01 00:00:00 to 2025-12-01 00:00:00

NaN counts per feature (top 20):
series_name
SP500_days_circuit_breaker_diff_zscore_3m        560
ICSA_weeks_high_diff_zscore_3m                   558
CCSA_weeks_high_diff_zscore_3m                   553
CCSA_weeks_high_diff_zscore_12m                  530
ICSA_weeks_high_diff_zscore_12m                  526
CCNSA_weeks_high_diff_zscore_3m                  524
SP500_days_circuit_breaker_diff_zscore_12m       504
ICNSA_weeks_high_diff_zscore_3m                  489
CCNSA_weeks_high_diff_zscore_12m                 466
Weekly_Econ_Index_monthly_max_diff_zscore_12m    387
Weekly_Econ_Index_monthly_min_diff_zscore_12m    387
Weekly_Econ_Index_monthly_avg_diff_zscore_12m    387
Weekly_Econ_Index_monthly_min_diff_zscore_3m     383
Weekly_Econ_Index_monthly_max_diff_zscore_3m     383
Weekly_Econ_Index_monthly_avg_diff_zscore_3m     383
Weekly_Econ_Index_monthly_max_diff               382
Weekly_Econ_Index_monthly_av

## 2. Define Feature Groups

In [4]:
# Column-name prefixes that identify each indicator family.
# str.startswith accepts a tuple, so a family may own several prefixes.
_SERIES_PREFIXES = {
    'VIX': ('VIX_',),
    'SP500': ('SP500_',),
    'Credit_Spreads': ('Credit_Spreads_',),
    'Yield_Curve': ('Yield_Curve_',),
    'Oil_Prices': ('Oil_Prices_', 'Oil_worst'),
    'ICSA': ('ICSA_',),
    'ICNSA': ('ICNSA_',),
    'CCSA': ('CCSA_',),
    'CCNSA': ('CCNSA_',),
    'Weekly_Econ_Index': ('Weekly_Econ_Index_',),
    'Financial_Stress': ('Financial_Stress_',),
}

# Group by series type
SERIES_GROUPS = {
    family: [c for c in wide.columns if c.startswith(prefixes)]
    for family, prefixes in _SERIES_PREFIXES.items()
}

# Suffix that marks each derived transformation; anything without one is a level.
_TRANSFORM_SUFFIXES = {
    'diff': '_diff',
    'diff_zscore_12m': '_diff_zscore_12m',
    'diff_zscore_3m': '_diff_zscore_3m',
}

# Group by transformation type
TRANSFORM_GROUPS = {
    'level': [
        c for c in wide.columns
        if not c.endswith(tuple(_TRANSFORM_SUFFIXES.values()))
    ],
}
for _kind, _suffix in _TRANSFORM_SUFFIXES.items():
    TRANSFORM_GROUPS[_kind] = [c for c in wide.columns if c.endswith(_suffix)]

print("Feature counts by series group:")
for group, feats in SERIES_GROUPS.items():
    print(f"  {group}: {len(feats)} features")

print(f"\nFeature counts by transformation:")
for ttype, feats in TRANSFORM_GROUPS.items():
    print(f"  {ttype}: {len(feats)} features")

print(f"\nTotal features: {len(wide.columns)}")

Feature counts by series group:
  VIX: 22 features
  SP500: 39 features
  Credit_Spreads: 28 features
  Yield_Curve: 28 features
  Oil_Prices: 20 features
  ICSA: 12 features
  ICNSA: 12 features
  CCSA: 12 features
  CCNSA: 12 features
  Weekly_Econ_Index: 12 features
  Financial_Stress: 4 features

Feature counts by transformation:
  level: 54 features
  diff: 49 features
  diff_zscore_12m: 49 features
  diff_zscore_3m: 49 features

Total features: 201


## 3. VIF Helper Functions

In [5]:
def compute_group_vif(df_wide, feature_list, group_name, min_obs=50):
    """Compute the variance inflation factor (VIF) for every feature in a group.

    Parameters
    ----------
    df_wide : pandas.DataFrame
        Wide table with one column per feature.
    feature_list : iterable of str
        Candidate features; names absent from ``df_wide`` are skipped and
        repeated names are used once.
    group_name : str
        Label copied into the ``group`` column of the result and into messages.
    min_obs : int, default 50
        Minimum number of complete rows wanted. Below it a warning is printed
        and missing values are forward-filled before incomplete rows are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``group``, ``feature`` and ``VIF`` (rounded to 2 decimals),
        sorted by VIF descending. ``VIF`` is NaN for a feature whose
        computation raised. Empty when fewer than two usable features remain.
    """
    cols = list(dict.fromkeys(c for c in feature_list if c in df_wide.columns))
    if len(cols) < 2:
        return pd.DataFrame()

    # Treat +/-inf as missing *before* any row or variance check, so those
    # checks run on exactly the sample the VIFs are computed from.
    data = df_wide[cols].replace([np.inf, -np.inf], np.nan)

    # Complete-case sample: a row is kept only if every feature is present.
    X = data.dropna()
    if len(X) < min_obs:
        print(f"  WARNING: {group_name} has only {len(X)} complete rows (need {min_obs})")
        # Try with less restrictive NaN handling
        X = data.ffill().dropna()

    # Drop zero-variance columns
    variances = X.var()
    zero_var = X.columns[variances == 0]
    if len(zero_var) > 0:
        print(f"  Dropping {len(zero_var)} zero-variance cols: {list(zero_var)}")
        X = X.loc[:, variances > 0]

    if X.shape[1] < 2:
        return pd.DataFrame()

    # With no more rows than features each auxiliary regression fits exactly,
    # so the resulting VIFs carry no information.
    if len(X) <= X.shape[1]:
        print(f"  WARNING: {group_name} has {len(X)} rows for {X.shape[1]} features; "
              f"VIF values are unreliable")

    # Append an intercept column so the auxiliary regressions include a constant.
    X_const = X.copy()
    X_const['_const'] = 1.0
    design = X_const.values

    vif_data = []
    # '_const' was appended last, so feature positions match X's column order.
    for position, col in enumerate(X.columns):
        try:
            vif = variance_inflation_factor(design, position)
            vif_data.append({'group': group_name, 'feature': col, 'VIF': round(vif, 2)})
        except Exception as exc:
            # Best effort: keep the feature in the report, but say why it has no VIF.
            print(f"  WARNING: VIF failed for {col}: {exc}")
            vif_data.append({'group': group_name, 'feature': col, 'VIF': np.nan})

    return pd.DataFrame(vif_data).sort_values('VIF', ascending=False)


def iterative_vif_pruning(df_wide, features, threshold=10.0, protected=None, verbose=True):
    """Iteratively remove the highest-VIF feature until all are below threshold.

    VIFs are computed on complete cases only: a row is used only when every
    candidate feature is present and finite. If that leaves no more rows than
    features, every VIF is degenerate and pruning removes features in column
    order; a warning is printed in that case when ``verbose`` is true.

    Parameters
    ----------
    df_wide : pandas.DataFrame
        Wide table with one column per feature.
    features : iterable of str
        Candidate features; names absent from ``df_wide`` are skipped and
        repeated names are used once.
    threshold : float, default 10.0
        Pruning stops once the largest non-protected VIF is <= this value.
    protected : collection of str, optional
        Features that are never removed. They still take part in the VIF
        computation of the other features.
    verbose : bool, default True
        Print one line per removal, plus the degenerate-sample warning.

    Returns
    -------
    tuple[list, list]
        ``(kept, removed)`` where ``kept`` lists the surviving column names and
        ``removed`` is a list of ``(feature, VIF rounded to 2 decimals)`` in
        removal order.
    """
    protected = set(protected) if protected else set()
    cols = list(dict.fromkeys(c for c in features if c in df_wide.columns))
    X = df_wide[cols].replace([np.inf, -np.inf], np.nan).dropna()
    X = X.loc[:, X.var() > 0]
    removed = []

    # Each auxiliary regression has as many regressors (others + intercept) as
    # there are features, so it fits exactly whenever rows <= features.
    if verbose and X.shape[1] > 2 and len(X) <= X.shape[1]:
        print(f"  WARNING: only {len(X)} complete rows for {X.shape[1]} features; "
              f"VIFs are degenerate and removals will follow column order")

    iteration = 0
    while len(X.columns) > 2:
        iteration += 1
        X_const = X.copy()
        X_const['_const'] = 1.0
        design = X_const.values

        vifs = {}
        # '_const' was appended last, so feature positions match X's column order.
        for position, col in enumerate(X.columns):
            try:
                vif = variance_inflation_factor(design, position)
            except Exception:
                vif = np.inf
            # A VIF is >= 1 by definition. NaN or a negative value signals a
            # numerical breakdown (R^2 at or beyond 1), i.e. perfect
            # collinearity, so rank it as infinite instead of letting it
            # slip under the threshold or upset max().
            if not np.isfinite(vif) or vif < 0:
                vif = np.inf
            vifs[col] = vif

        # Find highest VIF among non-protected features
        non_protected_vifs = {k: v for k, v in vifs.items() if k not in protected}
        if not non_protected_vifs:
            break

        max_col = max(non_protected_vifs, key=non_protected_vifs.get)
        max_vif = non_protected_vifs[max_col]

        if max_vif <= threshold:
            break

        X = X.drop(columns=[max_col])
        removed.append((max_col, round(max_vif, 2)))
        if verbose:
            print(f"  Iter {iteration}: Removed {max_col} (VIF={max_vif:.1f}), {len(X.columns)} features remain")

    return X.columns.tolist(), removed

## 4. Within-Group VIF Analysis

For each series group, compute VIF to see which transformations (level, diff, zscore) are redundant within the same indicator.

In [6]:
# One VIF table per indicator family; families that yield no table are skipped.
all_vif_results = []

for group_name, features in SERIES_GROUPS.items():
    # Section banner for this family.
    print(f"\n{'='*60}")
    print(f"{group_name} ({len(features)} features)")
    print(f"{'='*60}")

    vif_df = compute_group_vif(wide, features, group_name)
    if vif_df.empty:
        print("  (not enough features for VIF)")
        continue

    all_vif_results.append(vif_df)
    print(vif_df.to_string(index=False))

    # Count the features above the VIF > 10 cut-off used throughout this notebook.
    n_high = int((vif_df['VIF'] > 10).sum())
    if n_high > 0:
        print(f"\n  >> {n_high} features with VIF > 10 (multicollinearity concern)")
    else:
        print(f"\n  >> All features have VIF <= 10 (OK)")

# Stack the per-family tables into one frame (empty if nothing was computed).
if all_vif_results:
    within_group_vif = pd.concat(all_vif_results, ignore_index=True)
else:
    within_group_vif = pd.DataFrame()


VIX (22 features)
group                          feature    VIF
  VIX                          VIX_max 121.55
  VIX                         VIX_mean  83.42
  VIX                     VIX_max_diff  40.86
  VIX                    VIX_mean_diff  26.46
  VIX          VIX_max_diff_zscore_12m  22.95
  VIX              VIX_volatility_diff  22.72
  VIX                   VIX_volatility  20.63
  VIX   VIX_volatility_diff_zscore_12m  14.74
  VIX         VIX_mean_diff_zscore_12m  12.87
  VIX    VIX_30d_spike_diff_zscore_12m  12.19
  VIX            VIX_max_5d_spike_diff  12.07
  VIX               VIX_30d_spike_diff  10.91
  VIX                 VIX_max_5d_spike  10.90
  VIX VIX_max_5d_spike_diff_zscore_12m  10.65
  VIX                    VIX_30d_spike   9.97
  VIX           VIX_max_diff_zscore_3m   6.03
  VIX    VIX_volatility_diff_zscore_3m   5.47
  VIX  VIX_max_5d_spike_diff_zscore_3m   5.31
  VIX          VIX_mean_diff_zscore_3m   4.78
  VIX     VIX_30d_spike_diff_zscore_3m   4.34
  VIX          

## 5. Level vs Diff Correlation Analysis

For each base feature, how correlated is the level with its diff? Low correlation means both provide unique info.

In [7]:
# Identify base features (levels that have a corresponding _diff)
level_features = TRANSFORM_GROUPS['level']
diff_features = TRANSFORM_GROUPS['diff']


def _pairwise_corr(frame, col_a, col_b, min_obs=10):
    """Return the correlation of two columns rounded to 3 decimals, or None.

    Only rows where both columns are present are used. None is returned when
    either column is missing from ``frame`` or when there are not more than
    ``min_obs`` such rows.
    """
    if col_a not in frame.columns or col_b not in frame.columns:
        return None
    valid = frame[[col_a, col_b]].dropna()
    if len(valid) <= min_obs:
        return None
    return round(valid[col_a].corr(valid[col_b]), 3)


# Map level -> diff by checking if {level}_diff exists
correlation_analysis = []

for level_feat in level_features:
    diff_feat = f"{level_feat}_diff"
    z12_feat = f"{level_feat}_diff_zscore_12m"
    z3_feat = f"{level_feat}_diff_zscore_3m"

    # Result column -> the pair of features whose correlation it holds.
    pairs = {
        'level_vs_diff_corr': (level_feat, diff_feat),
        'diff_vs_z12_corr': (diff_feat, z12_feat),
        'diff_vs_z3_corr': (diff_feat, z3_feat),
        'z12_vs_z3_corr': (z12_feat, z3_feat),
    }

    row = {'base_feature': level_feat}
    for corr_name, (left, right) in pairs.items():
        r = _pairwise_corr(wide, left, right)
        if r is not None:
            row[corr_name] = r

    if len(row) > 1:  # Only add if we found at least one correlation
        correlation_analysis.append(row)

corr_df = pd.DataFrame(correlation_analysis)
print("Level vs Diff vs Z-Score Correlation Analysis:")
print("="*100)
print(corr_df.to_string(index=False))

# Every summary line is guarded the same way: a column exists only if at least
# one base feature produced that correlation, so an unguarded lookup would
# raise KeyError on sparse or empty input.
print(f"\n\nSummary Statistics:")
for corr_name, label in (
    ('level_vs_diff_corr', "  Level vs Diff |r|: "),
    ('diff_vs_z12_corr', "  Diff vs Z12 |r|:   "),
    ('diff_vs_z3_corr', "  Diff vs Z3 |r|:    "),
    ('z12_vs_z3_corr', "  Z12 vs Z3 |r|:     "),
):
    if corr_name in corr_df.columns:
        abs_r = corr_df[corr_name].abs()
        print(f"{label}mean={abs_r.mean():.3f}, median={abs_r.median():.3f}")

Level vs Diff vs Z-Score Correlation Analysis:
                   base_feature  level_vs_diff_corr  diff_vs_z12_corr  diff_vs_z3_corr  z12_vs_z3_corr
                CCNSA_max_spike               0.237             0.434            0.269           0.698
              CCNSA_monthly_avg               0.208             0.477            0.281           0.684
               CCNSA_weeks_high               0.062             0.964            0.788           0.782
                 CCSA_max_spike               0.233             0.247            0.116           0.645
               CCSA_monthly_avg               0.206             0.259            0.114           0.629
                CCSA_weeks_high               0.029             0.909            0.749           0.794
Credit_Spreads_accel_volatility               0.482             0.812            0.628           0.807
    Credit_Spreads_acceleration               0.844             0.836            0.678           0.837
             Credit_Spread

## 6. Cross-Group Correlation Analysis

Check for redundancy across different indicator types (e.g., ICSA vs ICNSA, Credit_Spreads vs Yield_Curve).

In [8]:
# Hand-picked (feature, feature, label) pairs of level features from different
# indicator families whose overlap is worth checking.
cross_group_pairs = [
    ('ICSA_monthly_avg', 'ICNSA_monthly_avg', 'ICSA vs ICNSA (avg)'),
    ('ICSA_max_spike', 'ICNSA_max_spike', 'ICSA vs ICNSA (spike)'),
    ('ICSA_weeks_high', 'ICNSA_weeks_high', 'ICSA vs ICNSA (weeks_high)'),
    ('CCSA_monthly_avg', 'CCNSA_monthly_avg', 'CCSA vs CCNSA (avg)'),
    ('CCSA_max_spike', 'CCNSA_max_spike', 'CCSA vs CCNSA (spike)'),
    ('CCSA_weeks_high', 'CCNSA_weeks_high', 'CCSA vs CCNSA (weeks_high)'),
    ('ICSA_monthly_avg', 'CCSA_monthly_avg', 'Initial vs Continued Claims'),
    ('Credit_Spreads_avg', 'Yield_Curve_avg', 'Credit Spreads vs Yield Curve'),
    ('VIX_mean', 'SP500_volatility', 'VIX vs SP500 Volatility'),
    ('VIX_mean', 'Credit_Spreads_avg', 'VIX vs Credit Spreads'),
    ('Financial_Stress_monthly_avg', 'VIX_mean', 'Financial Stress vs VIX'),
    ('Financial_Stress_monthly_avg', 'Credit_Spreads_avg', 'Financial Stress vs Credit Spreads'),
]

print("Cross-Group Level Correlations:")
print("="*70)
for feat1, feat2, label in cross_group_pairs:
    # Report pairs that cannot be evaluated because a column is absent.
    missing = [f for f in [feat1, feat2] if f not in wide.columns]
    if missing:
        print(f"  {label:45s} MISSING: {missing}")
        continue

    # Use only the dates where both series are observed.
    valid = wide[[feat1, feat2]].dropna()
    if len(valid) <= 10:
        # Too few overlapping observations: the pair is skipped without output.
        continue

    r = valid[feat1].corr(valid[feat2])
    if abs(r) > 0.9:
        flag = " ** HIGH"
    elif abs(r) > 0.7:
        flag = " * moderate"
    else:
        flag = ""
    print(f"  {label:45s} r = {r:+.3f}{flag}")

Cross-Group Level Correlations:
  ICSA vs ICNSA (avg)                           r = +0.976 ** HIGH
  ICSA vs ICNSA (spike)                         r = +0.968 ** HIGH
  ICSA vs ICNSA (weeks_high)                    r = +0.996 ** HIGH
  CCSA vs CCNSA (avg)                           r = +0.982 ** HIGH
  CCSA vs CCNSA (spike)                         r = +0.984 ** HIGH
  CCSA vs CCNSA (weeks_high)                    r = +0.991 ** HIGH
  Initial vs Continued Claims                   r = +0.762 * moderate
  Credit Spreads vs Yield Curve                 r = +0.428
  VIX vs SP500 Volatility                       r = +0.915 ** HIGH
  VIX vs Credit Spreads                         r = +0.736 * moderate
  Financial Stress vs VIX                       r = +0.824 * moderate
  Financial Stress vs Credit Spreads            r = +0.831 * moderate


## 7. Within-Group VIF Pruning

For each series group, iteratively remove the non-protected feature with the highest VIF until every remaining non-protected feature has VIF ≤ 10.

In [9]:
# Features that pruning must never remove, whatever their VIF.
PROTECTED = {
    'VIX_panic_regime',
    'VIX_high_regime',
    'SP500_bear_market',
    'SP500_crash_month',
    'SP500_circuit_breaker',
}

# Per-family results: surviving feature names, and (feature, VIF) removal logs.
group_survivors = {}
group_removed = {}

for group_name, features in SERIES_GROUPS.items():
    # Section banner for this family.
    print(f"\n{'='*60}")
    print(f"VIF Pruning: {group_name} ({len(features)} features)")
    print(f"{'='*60}")

    survivors, removed = iterative_vif_pruning(
        df_wide=wide,
        features=features,
        threshold=10.0,
        protected=PROTECTED,
    )
    group_survivors[group_name], group_removed[group_name] = survivors, removed

    print(f"\n  Kept {len(survivors)} features: {survivors}")
    print(f"  Removed {len(removed)} features")


VIF Pruning: VIX (22 features)
  Iter 1: Removed VIX_max (VIF=121.6), 21 features remain
  Iter 2: Removed VIX_max_diff (VIF=28.6), 20 features remain
  Iter 3: Removed VIX_volatility_diff (VIF=18.9), 19 features remain
  Iter 4: Removed VIX_volatility (VIF=13.5), 18 features remain
  Iter 5: Removed VIX_max_diff_zscore_12m (VIF=11.3), 17 features remain
  Iter 6: Removed VIX_30d_spike_diff_zscore_12m (VIF=10.3), 16 features remain

  Kept 16 features: ['VIX_30d_spike', 'VIX_30d_spike_diff', 'VIX_30d_spike_diff_zscore_3m', 'VIX_high_regime', 'VIX_max_5d_spike', 'VIX_max_5d_spike_diff', 'VIX_max_5d_spike_diff_zscore_12m', 'VIX_max_5d_spike_diff_zscore_3m', 'VIX_max_diff_zscore_3m', 'VIX_mean', 'VIX_mean_diff', 'VIX_mean_diff_zscore_12m', 'VIX_mean_diff_zscore_3m', 'VIX_panic_regime', 'VIX_volatility_diff_zscore_12m', 'VIX_volatility_diff_zscore_3m']
  Removed 6 features

VIF Pruning: SP500 (39 features)
  Iter 1: Removed SP500_30d_return (VIF=inf), 38 features remain
  Iter 2: Removed 

## 8. Cross-Group VIF Analysis

After within-group pruning, check VIF across ALL remaining features.

In [10]:
# Flatten the per-family survivor lists into one list, keeping family order.
all_survivors = [
    feat
    for family_survivors in group_survivors.values()
    for feat in family_survivors
]

print(f"Total features after within-group pruning: {len(all_survivors)}")
print(f"\nComputing cross-group VIF...")

# Score every survivor against all the others, regardless of family.
cross_vif = compute_group_vif(wide, all_survivors, 'cross_group')
if not cross_vif.empty:
    print(f"\nCross-Group VIF (all {len(cross_vif)} features):")
    print(cross_vif.to_string(index=False))

    high_cross = cross_vif.loc[cross_vif['VIF'] > 10]
    print(f"\n{len(high_cross)} features with cross-group VIF > 10")

Total features after within-group pruning: 133

Computing cross-group VIF...

Cross-Group VIF (all 133 features):
      group                                        feature     VIF
cross_group                                 ICSA_max_spike 2035.69
cross_group                          ICSA_monthly_avg_diff 1813.28
cross_group                         Yield_Curve_zscore_max 1710.19
cross_group                                ICNSA_max_spike 1621.17
cross_group                         ICNSA_monthly_avg_diff 1536.72
cross_group                                Yield_Curve_avg 1435.11
cross_group                               CCSA_monthly_avg 1118.70
cross_group                              CCNSA_monthly_avg 1011.95
cross_group                            ICSA_max_spike_diff 1009.73
cross_group                      Credit_Spreads_zscore_max  920.05
cross_group                           ICNSA_max_spike_diff  824.41
cross_group                              Oil_worst_day_pct  709.97
cross_group    

In [11]:
# Prune once more across all within-group survivors at the same threshold.
# NOTE(review): iterative_vif_pruning keeps only rows where every candidate
# feature is present, so with many sparse features pooled together few rows
# may remain - confirm the sample is large enough for these VIFs to mean anything.
print("\nFinal Cross-Group VIF Pruning (threshold=10):")
print("="*60)

final_survivors, final_removed = iterative_vif_pruning(
    df_wide=wide,
    features=all_survivors,
    threshold=10.0,
    protected=PROTECTED,
)

print(f"\n\nFinal feature set: {len(final_survivors)} features")
print(f"Total removed in cross-group: {len(final_removed)}")


Final Cross-Group VIF Pruning (threshold=10):
  Iter 1: Removed VIX_30d_spike (VIF=inf), 132 features remain
  Iter 2: Removed VIX_30d_spike_diff (VIF=inf), 131 features remain
  Iter 3: Removed VIX_30d_spike_diff_zscore_3m (VIF=inf), 130 features remain
  Iter 4: Removed VIX_max_5d_spike (VIF=inf), 129 features remain
  Iter 5: Removed VIX_max_5d_spike_diff (VIF=inf), 128 features remain
  Iter 6: Removed VIX_max_5d_spike_diff_zscore_12m (VIF=inf), 127 features remain
  Iter 7: Removed VIX_max_5d_spike_diff_zscore_3m (VIF=inf), 126 features remain
  Iter 8: Removed VIX_max_diff_zscore_3m (VIF=inf), 125 features remain
  Iter 9: Removed VIX_mean (VIF=inf), 124 features remain
  Iter 10: Removed VIX_mean_diff (VIF=inf), 123 features remain
  Iter 11: Removed VIX_mean_diff_zscore_12m (VIF=inf), 122 features remain
  Iter 12: Removed VIX_mean_diff_zscore_3m (VIF=inf), 121 features remain
  Iter 13: Removed VIX_volatility_diff_zscore_12m (VIF=inf), 120 features remain
  Iter 14: Removed V

## 9. Final Summary & Recommendations

In [12]:
print("FINAL RECOMMENDED FEATURE SET")
print("=" * 60)
print(f"\nTotal features: {len(final_survivors)}")

# Categorize final features: each one goes to the first series group that has
# a member equal to it or a prefix of it; anything unmatched lands in 'OTHER'.
final_by_group = {}
for feat in sorted(final_survivors):
    owner = next(
        (
            group_name
            for group_name, group_feats in SERIES_GROUPS.items()
            if any(feat.startswith(gf) for gf in group_feats)
        ),
        'OTHER',
    )
    final_by_group.setdefault(owner, []).append(feat)

# Suffix -> transformation label, longest suffixes first so '_diff' is only
# tried after the z-score variants that also contain it.
_SUFFIX_TO_TRANSFORM = (
    ('_diff_zscore_12m', 'diff_zscore_12m'),
    ('_diff_zscore_3m', 'diff_zscore_3m'),
    ('_diff', 'diff'),
)

# Print the series groups in their defined order, then 'OTHER' so features
# that matched no group still appear in the report.
for group_name in [*SERIES_GROUPS.keys(), 'OTHER']:
    feats = final_by_group.get(group_name, [])
    if feats:
        print(f"\n{group_name} ({len(feats)} features):")
        for f in feats:
            transform = next(
                (label for suffix, label in _SUFFIX_TO_TRANSFORM if f.endswith(suffix)),
                'level',
            )
            print(f"    {f:55s} [{transform}]")

FINAL RECOMMENDED FEATURE SET

Total features: 16

VIX (2 features):
    VIX_high_regime                                         [level]
    VIX_panic_regime                                        [level]

SP500 (3 features):
    SP500_bear_market                                       [level]
    SP500_circuit_breaker                                   [level]
    SP500_crash_month                                       [level]

CCSA (1 features):
    CCSA_weeks_high_diff_zscore_12m                         [diff_zscore_12m]

CCNSA (5 features):
    CCNSA_max_spike_diff_zscore_3m                          [diff_zscore_3m]
    CCNSA_monthly_avg                                       [level]
    CCNSA_monthly_avg_diff_zscore_3m                        [diff_zscore_3m]
    CCNSA_weeks_high                                        [level]
    CCNSA_weeks_high_diff_zscore_3m                         [diff_zscore_3m]

Weekly_Econ_Index (3 features):
    Weekly_Econ_Index_monthly_max_diff_zscore_3m   

In [13]:
# Report every removed feature with the VIF it had when it was removed.
print("\nREMOVAL SUMMARY")
print("=" * 60)

print("\n--- Within-Group Removals ---")
for group_name, removed in group_removed.items():
    if not removed:
        continue
    print(f"\n{group_name}:")
    for feat, vif in removed:
        print(f"  Removed: {feat:50s} VIF={vif}")
# Groups with no removals contribute zero to the total.
total_within = sum(len(removed) for removed in group_removed.values())
print(f"\nTotal within-group removals: {total_within}")

print("\n--- Cross-Group Removals ---")
for feat, vif in final_removed:
    print(f"  Removed: {feat:50s} VIF={vif}")
print(f"Total cross-group removals: {len(final_removed)}")

# Overall before/after feature counts.
n_start = len(wide.columns)
n_final = len(final_survivors)
print(f"\n{'='*60}")
print(f"Started with:  {n_start} features")
print(f"Removed:       {n_start - n_final} features")
print(f"Final set:     {n_final} features")


REMOVAL SUMMARY

--- Within-Group Removals ---

VIX:
  Removed: VIX_max                                            VIF=121.55
  Removed: VIX_max_diff                                       VIF=28.63
  Removed: VIX_volatility_diff                                VIF=18.92
  Removed: VIX_volatility                                     VIF=13.51
  Removed: VIX_max_diff_zscore_12m                            VIF=11.26
  Removed: VIX_30d_spike_diff_zscore_12m                      VIF=10.33

SP500:
  Removed: SP500_30d_return                                   VIF=inf
  Removed: SP500_30d_return_diff                              VIF=inf
  Removed: SP500_30d_return_diff_zscore_12m                   VIF=inf
  Removed: SP500_30d_return_diff_zscore_3m                    VIF=inf
  Removed: SP500_best_day                                     VIF=inf
  Removed: SP500_days_circuit_breaker_diff_zscore_12m         VIF=4650.0
  Removed: SP500_max_5d_drop_diff                             VIF=929.63
  Removed

In [14]:
# Recompute VIFs on the final feature set as a sanity check.
# compute_group_vif builds its own complete-case sample, so these values can
# differ from the ones seen during pruning.
print("\nFinal VIF Verification (should all be <= 10):")
print("=" * 60)
final_vif = compute_group_vif(wide, final_survivors, 'final')
if not final_vif.empty:
    print(final_vif.to_string(index=False))
    max_vif = final_vif['VIF'].max()
    verdict = '(PASS)' if max_vif <= 10 else '(FAIL)'
    print(f"\nMax VIF: {max_vif:.2f} {verdict}")


Final VIF Verification (should all be <= 10):
group                                      feature  VIF
final               CCNSA_max_spike_diff_zscore_3m 4.84
final             CCNSA_monthly_avg_diff_zscore_3m 4.83
final                             VIX_panic_regime 3.45
final Financial_Stress_monthly_avg_diff_zscore_12m 2.54
final                             CCNSA_weeks_high 2.51
final                              VIX_high_regime 2.44
final                        SP500_circuit_breaker 2.43
final  Financial_Stress_monthly_avg_diff_zscore_3m 2.28
final                            CCNSA_monthly_avg 2.19
final              CCSA_weeks_high_diff_zscore_12m 2.02
final              CCNSA_weeks_high_diff_zscore_3m 1.89
final                            SP500_crash_month 1.81
final           Weekly_Econ_Index_monthly_min_diff 1.73
final                            SP500_bear_market 1.60
final Weekly_Econ_Index_monthly_min_diff_zscore_3m 1.41
final Weekly_Econ_Index_monthly_max_diff_zscore_3m 1.11

