# Prosper Survey Feature Selection

Analyze all Prosper survey features (levels + MoM diffs + z-scores) using VIF to prune multicollinearity.

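Prosper has 36 base series across 4 question categories and 4 demographic groups. Each base series appears in 4 variants (level, MoM diff, 12-month diff z-score, 3-month diff z-score), yielding 36 × 4 = 144 features with all transformations.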

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
# Silence every warning for the rest of the session (no category filter is given).
warnings.filterwarnings('ignore')

# Raise the pandas display limits (rows, columns, line width) used when frames are printed.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 200)

## 1. Load Snapshot & Pivot to Wide Format

In [None]:
base_dir = Path('data/Exogenous_data/prosper')
# Every parquet file below base_dir, in sorted path order.
all_snaps = sorted(base_dir.glob('**/*.parquet'))
print(f'Total snapshots: {len(all_snaps)}')

# Load 2025-12 or latest
snap_path = base_dir / 'decades' / '2020s' / '2025' / '2025-12.parquet'
if not snap_path.exists():
    if not all_snaps:
        # Fail with an actionable message instead of an IndexError on all_snaps[-1].
        raise FileNotFoundError(f'No parquet snapshots found under {base_dir.resolve()}')
    # NOTE(review): "latest" means last in sorted path order; this assumes the
    # directory/file naming sorts chronologically - confirm.
    snap_path = all_snaps[-1]
    print(f'Using fallback: {snap_path}')

df = pd.read_parquet(snap_path)
print(f'\nRaw shape: {df.shape}')
print(f'Unique series: {df["series_name"].nunique()}')
print(f'\nAll series names:')
for i, s in enumerate(sorted(df['series_name'].unique()), 1):
    # Truncate for display. The local is deliberately not named `display`,
    # so the IPython/Jupyter display() helper stays usable in later cells.
    label = s[:90] + '...' if len(s) > 90 else s
    print(f'  {i:3d}. {label}')

In [None]:
# Reshape long -> wide: one row per date, one column per series name.
# aggfunc='first' keeps a single value when a date/series pair occurs more than once.
wide = df.pivot_table(
    index='date',
    columns='series_name',
    values='value',
    aggfunc='first',
).sort_index()
print(f'Wide shape: {wide.shape}')
print(f'Date range: {wide.index.min()} to {wide.index.max()}')
print('\nNaN % per feature (top 20):')
# Fraction of missing observations per column, worst first.
nan_pct = wide.isna().mean().sort_values(ascending=False)
for series_name, missing_share in nan_pct.head(20).items():
    print(f'  {missing_share*100:5.1f}%  {series_name[:80]}')

## 2. VIF Helper Functions

In [None]:
def compute_group_vif(df_wide, feature_list, group_name):
    """Return a VIF table for the requested features, highest VIF first.

    Features missing from ``df_wide`` are ignored, infinities are treated as
    missing, and only rows complete across all remaining features are used.
    Zero-variance columns are dropped (with a printed notice). Each VIF is
    computed on a design matrix that carries an extra constant column.

    Returns a DataFrame with columns ``group``, ``feature`` and ``VIF``
    (rounded to 2 decimals, NaN when the computation raised), or an empty
    DataFrame when fewer than two usable features remain.
    """
    present = [name for name in feature_list if name in df_wide.columns]
    if len(present) < 2:
        return pd.DataFrame()

    data = df_wide[present].replace([np.inf, -np.inf], np.nan).dropna()
    n_constant = len(data.columns[data.var() == 0])
    if n_constant > 0:
        print(f'  Dropping {n_constant} zero-variance cols')
        data = data.loc[:, data.var() > 0]
    if data.shape[1] < 2:
        return pd.DataFrame()

    # Add the intercept column; it is part of every regression but never scored.
    design = data.copy()
    design['_const'] = 1.0

    records = []
    for feature in design.columns:
        if feature == '_const':
            continue
        try:
            score = round(
                variance_inflation_factor(design.values, design.columns.get_loc(feature)),
                2,
            )
        except Exception:
            # Best effort: a failed computation is reported as NaN, not raised.
            score = np.nan
        records.append({'group': group_name, 'feature': feature, 'VIF': score})
    return pd.DataFrame(records).sort_values('VIF', ascending=False)


def iterative_vif_pruning(df_wide, features, threshold=10.0, protected=None, verbose=True):
    """Iteratively remove the highest-VIF feature until all are below threshold.

    Args:
        df_wide: Wide DataFrame with one column per feature.
        features: Candidate feature names; names absent from ``df_wide`` are skipped.
        threshold: Pruning stops once the largest non-protected VIF is <= this value.
        protected: Optional collection of feature names that are never removed
            (they still take part in every VIF regression).
        verbose: Print the first 15 removals and, past that, the iteration count.

    Returns:
        ``(survivors, removed)``: ``survivors`` is the list of remaining column
        names, ``removed`` is a list of ``(feature, VIF)`` tuples in removal
        order, with the VIF rounded to 2 decimals.
    """
    # A set gives O(1) membership tests even when the caller passes a list.
    protected = set(protected) if protected else set()
    cols = [c for c in features if c in df_wide.columns]
    # Infinities count as missing; only rows complete across all candidates are kept.
    X = df_wide[cols].replace([np.inf, -np.inf], np.nan).dropna()
    # Keep strictly positive variance only (this also discards NaN variance).
    X = X.loc[:, X.var() > 0]
    removed = []
    iteration = 0
    # The loop only runs while more than two columns remain, so it never
    # prunes below two features.
    while len(X.columns) > 2:
        iteration += 1
        X_const = X.copy()
        X_const['_const'] = 1.0
        design = X_const.values  # identical for every column in this round
        vifs = {}
        for col in [c for c in X_const.columns if c != '_const']:
            try:
                vif = variance_inflation_factor(design, X_const.columns.get_loc(col))
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C / SystemExit
                # can still interrupt a long pruning run.
                vif = np.inf
            # NaN never compares greater than anything, which would make the
            # max() below depend on column order; treat it as uncomputable.
            vifs[col] = np.inf if np.isnan(vif) else vif
        non_protected_vifs = {k: v for k, v in vifs.items() if k not in protected}
        if not non_protected_vifs:
            break
        max_col = max(non_protected_vifs, key=non_protected_vifs.get)
        max_vif = non_protected_vifs[max_col]
        if max_vif <= threshold:
            break
        X = X.drop(columns=[max_col])
        removed.append((max_col, round(max_vif, 2)))
        if verbose and iteration <= 15:
            print(f'  Iter {iteration}: Removed {max_col[:70]}... (VIF={max_vif:.1f})')
        elif verbose and iteration == 16:
            print(f'  ... (continuing)')
    if verbose and iteration > 15:
        print(f'  Total iterations: {iteration}')
    return X.columns.tolist(), removed

## 3. Transformation Correlation Analysis

In [None]:
# Identify base series (levels, not _diff or _zscore variants)
all_series = sorted(wide.columns)
# Suffixes that mark a derived column; str.endswith accepts a tuple of options.
derived_suffixes = ('_diff', '_diff_zscore_12m', '_diff_zscore_3m')
base_series = [s for s in all_series if not s.endswith(derived_suffixes)]
print(f'Base series: {len(base_series)}')

# Compute correlation summaries: |r| between each pair of transformations
# of the same base series, collected per pair type.
corr_pairs = {'level_vs_diff': [], 'diff_vs_z12': [], 'diff_vs_z3': [], 'z12_vs_z3': []}
for base in base_series:
    diff = f'{base}_diff'
    z12 = f'{base}_diff_zscore_12m'
    z3 = f'{base}_diff_zscore_3m'
    for pair_key, f1, f2 in [('level_vs_diff', base, diff), ('diff_vs_z12', diff, z12),
                              ('diff_vs_z3', diff, z3), ('z12_vs_z3', z12, z3)]:
        if f1 in wide.columns and f2 in wide.columns:
            v = wide[[f1, f2]].dropna()
            # Require more than 10 overlapping observations.
            if len(v) > 10:
                r = v[f1].corr(v[f2])
                # corr() is NaN when a series is constant over the overlap;
                # skip it so one NaN does not turn the mean/median into NaN.
                if pd.notna(r):
                    corr_pairs[pair_key].append(abs(r))

print('\nTransformation Correlation Summary:')
print('=' * 60)
for key, vals in corr_pairs.items():
    if vals:
        print(f'  {key:20s}: mean |r|={np.mean(vals):.3f}, median={np.median(vals):.3f}')

## 4. Cross-Series Correlation (Same Question, Different Demographics)

In [None]:
# Check demographic redundancy: e.g., US 18+ vs Males vs Females vs 18-34
# For each question+answer combo, compare across demographics
from collections import defaultdict

# Bucket base series by everything before the last ' | ' separator; the text
# after it is treated as the demographic label. Names without the separator
# are left out.
qa_groups = defaultdict(list)
for s in base_series:
    qa_key, sep, demo = s.rpartition(' | ')
    if sep:
        qa_groups[qa_key].append((demo, s))

print('Cross-Demographic Correlations (level):')
print('=' * 70)
for qa_key, demo_list in sorted(qa_groups.items()):
    if len(demo_list) < 2:
        continue
    short_qa = qa_key if len(qa_key) <= 60 else qa_key[:60] + '...'
    print(f'\n  {short_qa}')
    # Every unordered pair of demographics within this question/answer group.
    for i, (d1, s1) in enumerate(demo_list):
        for d2, s2 in demo_list[i + 1:]:
            if s1 not in wide.columns or s2 not in wide.columns:
                continue
            v = wide[[s1, s2]].dropna()
            if len(v) <= 10:
                continue
            r = v[s1].corr(v[s2])
            if abs(r) > 0.9:
                flag = ' ** HIGH'
            elif abs(r) > 0.7:
                flag = ' * mod'
            else:
                flag = ''
            print(f'    {d1:10s} vs {d2:10s}: r={r:+.3f}{flag}')

## 5. Cross-Group VIF Pruning

In [None]:
all_features = list(wide.columns)
print(f'Starting with {len(all_features)} features')

# Restrict to rows from 2010-01-01 onward, then forward-fill gaps.
recent_mask = wide.index >= '2010-01-01'
wide_recent = wide.loc[recent_mask].ffill()
print(f'Using 2010+ data: {len(wide_recent)} rows')

# Split features on the share of values still missing after the forward-fill:
# at most 30% NaN is kept, anything above is dropped.
nan_pct = wide_recent.isna().mean()
good_features, bad_features = [], []
for feat in all_features:
    if nan_pct.get(feat, 1.0) <= 0.3:
        good_features.append(feat)
    else:
        bad_features.append(feat)
if bad_features:
    print(f'Dropped {len(bad_features)} features with >30% NaN')
    for feat in bad_features:
        print(f'  {feat[:80]}  ({nan_pct[feat]*100:.0f}% NaN)')
print(f'Remaining: {len(good_features)} features')

print('\nIterative VIF pruning (threshold=10)...')
final_survivors, final_removed = iterative_vif_pruning(
    wide_recent,
    good_features,
    threshold=10.0,
)

print(f'\nFinal feature set: {len(final_survivors)} features')
print(f'Removed: {len(final_removed)} features')

## 6. Final VIF Verification & Summary

In [None]:
print('FINAL RECOMMENDED FEATURE SET')
print('=' * 60)
print(f'Total features: {len(final_survivors)}')

# Categorize each surviving feature by its name suffix; names matching no
# suffix are counted as level features.
levels, diffs, z12s, z3s = [], [], [], []
suffix_buckets = (
    ('_diff_zscore_12m', z12s),
    ('_diff_zscore_3m', z3s),
    ('_diff', diffs),
)
for name in sorted(final_survivors):
    for suffix, bucket in suffix_buckets:
        if name.endswith(suffix):
            bucket.append(name)
            break
    else:
        levels.append(name)

print('\nBy transformation:')
print(f'  Level:           {len(levels)}')
print(f'  MoM diff:        {len(diffs)}')
print(f'  Diff z-score 12m: {len(z12s)}')
print(f'  Diff z-score 3m:  {len(z3s)}')

# One listing per category, names cut to 90 characters.
sections = (
    ('\nLevel features:', levels),
    ('\nDiff features:', diffs),
    ('\nZ-score 12m features:', z12s),
    ('\nZ-score 3m features:', z3s),
)
for heading, members in sections:
    print(heading)
    for name in members:
        print(f'  {name[:90]}')

In [None]:
# Final VIF verification: recompute the VIF of every surviving feature.
print('Final VIF Verification (should all be <= 10):')
print('=' * 60)
final_vif = compute_group_vif(wide_recent, final_survivors, 'final')
if not final_vif.empty:
    for feature, vif in zip(final_vif['feature'], final_vif['VIF']):
        print(f'  VIF={vif:6.2f}  {feature[:80]}')
    worst_vif = final_vif['VIF'].max()
    print(f'\nMax VIF: {worst_vif:.2f}')

In [None]:
# Output the SELECTED_FEATURES set for load_prosper_data.py
print('# Copy this into load_prosper_data.py:')
selected = sorted(final_survivors)
if selected:
    print('SELECTED_FEATURES = {')
    for f in selected:
        # repr() yields a valid, correctly escaped Python string literal even
        # when the series name contains quotes or backslashes.
        print(f'    {str(f)!r},')
    print('}')
else:
    # An empty `{}` would be a dict literal, not a set.
    print('SELECTED_FEATURES = set()')