# Enhanced Feature Engineering Playbook

This notebook demonstrates the enhanced feature engineering capabilities including:
- Temporal trend features
- Categorical encoding  
- Feature validation
- Dependency management
- Feature metadata & lineage

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from cr_score.features import (
    # Core engineering
    FeatureRecipe,
    FeatureEngineeringConfig,
    PandasFeatureEngineer,
    AggregationType,
    TimeWindow,
    FeatureRegistry,
    # Enhanced features
    TemporalTrendFeatures,
    CategoricalEncoder,
    FeatureValidator,
    DependencyGraph,
)

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("✓ Imports successful")

## 1. Create Sample Credit Bureau Data

In [None]:
# Create sample customer-month level data
np.random.seed(42)

customers = [f"CUST_{i:04d}" for i in range(1, 21)]  # 20 customers
months = pd.date_range('2023-01-01', '2023-12-31', freq='M')  # 12 months

data = []
for customer in customers:
    for month in months:
        data.append({
            'customer_id': customer,
            'snapshot_date': month,
            'balance': np.random.randint(500, 10000),
            'credit_limit': np.random.randint(5000, 20000),
            'days_past_due': np.random.choice([0, 0, 0, 0, 0, 0, 0, 15, 30, 60, 90], p=[0.7, 0.1, 0.05, 0.05, 0.05, 0.02, 0.01, 0.01, 0.005, 0.003, 0.002]),
            'payment_amount': np.random.randint(100, 2000),
            'num_inquiries': np.random.randint(0, 5),
            'num_accounts': np.random.randint(1, 10),
            'total_debt': np.random.randint(1000, 50000),
            'account_type': np.random.choice(['Credit Card', 'Personal Loan', 'Auto Loan', 'Mortgage']),
            'region': np.random.choice(['North', 'South', 'East', 'West']),
        })

df = pd.DataFrame(data)

print(f"Created dataset with {len(df)} rows")
print(f"Customers: {df['customer_id'].nunique()}")
print(f"Date range: {df['snapshot_date'].min()} to {df['snapshot_date'].max()}")
df.head()

## 2. Temporal Trend Features

Create time-based features like delta, percent change, momentum, volatility, and trend slope.

In [None]:
trend = TemporalTrendFeatures()

# Delta (change from previous period)
df = trend.delta(
    df,
    column='balance',
    time_col='snapshot_date',
    group_cols=['customer_id']
)

# Percent change
df = trend.pct_change(
    df,
    column='balance',
    time_col='snapshot_date',
    group_cols=['customer_id']
)

# Momentum (current - rolling mean)
df = trend.momentum(
    df,
    column='balance',
    time_col='snapshot_date',
    group_cols=['customer_id'],
    window=3
)

# Volatility
df = trend.volatility(
    df,
    column='balance',
    time_col='snapshot_date',
    group_cols=['customer_id'],
    window=6,
    method='std'
)

# Trend slope
df = trend.trend_slope(
    df,
    column='balance',
    time_col='snapshot_date',
    group_cols=['customer_id'],
    window=6
)

print("✓ Created temporal trend features")
print("\nNew columns:")
trend_cols = [c for c in df.columns if any(x in c for x in ['delta', 'pct_change', 'momentum', 'volatility', 'slope'])]
print(trend_cols)

In [None]:
# Visualize temporal features for a sample customer
sample_customer = df['customer_id'].iloc[0]
customer_data = df[df['customer_id'] == sample_customer].sort_values('snapshot_date')

fig, axes = plt.subplots(3, 2, figsize=(15, 12))
fig.suptitle(f'Temporal Features for {sample_customer}', fontsize=16)

# Balance over time
axes[0, 0].plot(customer_data['snapshot_date'], customer_data['balance'], marker='o')
axes[0, 0].set_title('Balance Over Time')
axes[0, 0].set_xlabel('Date')
axes[0, 0].set_ylabel('Balance')

# Delta
axes[0, 1].plot(customer_data['snapshot_date'], customer_data['balance_delta'], marker='o', color='orange')
axes[0, 1].set_title('Balance Delta')
axes[0, 1].set_xlabel('Date')
axes[0, 1].axhline(y=0, color='r', linestyle='--', alpha=0.3)

# Percent Change
axes[1, 0].plot(customer_data['snapshot_date'], customer_data['balance_pct_change'], marker='o', color='green')
axes[1, 0].set_title('Balance % Change')
axes[1, 0].set_xlabel('Date')
axes[1, 0].axhline(y=0, color='r', linestyle='--', alpha=0.3)

# Momentum
axes[1, 1].plot(customer_data['snapshot_date'], customer_data['balance_momentum_3'], marker='o', color='purple')
axes[1, 1].set_title('Balance Momentum (3-period)')
axes[1, 1].set_xlabel('Date')
axes[1, 1].axhline(y=0, color='r', linestyle='--', alpha=0.3)

# Volatility
axes[2, 0].plot(customer_data['snapshot_date'], customer_data['balance_volatility_6'], marker='o', color='red')
axes[2, 0].set_title('Balance Volatility (6-period)')
axes[2, 0].set_xlabel('Date')

# Trend Slope
axes[2, 1].plot(customer_data['snapshot_date'], customer_data['balance_trend_slope_6'], marker='o', color='brown')
axes[2, 1].set_title('Balance Trend Slope (6-period)')
axes[2, 1].set_xlabel('Date')
axes[2, 1].axhline(y=0, color='r', linestyle='--', alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Categorical Encoding

Encode categorical variables using frequency, target mean, and rare grouping.

In [None]:
# Create a target variable (default indicator)
df['is_default'] = (df['days_past_due'] > 30).astype(int)

encoder = CategoricalEncoder()

# Frequency encoding
df = encoder.freq_encoding(df, 'account_type')
df = encoder.freq_encoding(df, 'region')

# Target mean encoding
df = encoder.target_mean_encoding(
    df,
    column='account_type',
    target='is_default',
    smoothing=5.0
)

# Rare grouping
df = encoder.rare_grouping(
    df,
    column='region',
    threshold=0.15
)

print("✓ Created categorical encodings")
print("\nEncoding mappings:")
for name, mapping in encoder.mappings.items():
    print(f"  {name}: {mapping['type']}")

In [None]:
# Visualize categorical encodings
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Account type frequency
df.groupby('account_type')['account_type_freq'].first().plot(kind='bar', ax=axes[0, 0])
axes[0, 0].set_title('Account Type Frequency Encoding')
axes[0, 0].set_ylabel('Frequency')

# Account type target mean
df.groupby('account_type')['account_type_target_mean'].first().plot(kind='bar', ax=axes[0, 1], color='orange')
axes[0, 1].set_title('Account Type Target Mean Encoding')
axes[0, 1].set_ylabel('Target Mean')

# Region frequency
df.groupby('region')['region_freq'].first().plot(kind='bar', ax=axes[1, 0], color='green')
axes[1, 0].set_title('Region Frequency Encoding')
axes[1, 0].set_ylabel('Frequency')

# Region grouping
df['region_grouped'].value_counts().plot(kind='bar', ax=axes[1, 1], color='purple')
axes[1, 1].set_title('Region Rare Grouping')
axes[1, 1].set_ylabel('Count')

plt.tight_layout()
plt.show()

## 4. Feature Validation

Validate features and check for data quality issues.

In [None]:
# Define thresholds
validator = FeatureValidator(
    warning_thresholds={'missing_rate': 0.05, 'zero_variance': True},
    hard_fail_thresholds={'missing_rate': 0.20}
)

# Validate numeric features
numeric_features = ['balance', 'credit_limit', 'days_past_due', 'payment_amount', 
                   'balance_delta', 'balance_pct_change', 'balance_momentum_3']

results = validator.validate_features(df, feature_list=numeric_features)

# Convert to DataFrame for better visualization
validation_df = validator.to_dataframe()
print("\nValidation Results:")
print(validation_df[['feature', 'missing_rate', 'mean', 'std', 'min', 'max', 'status']])

In [None]:
# Visualize validation metrics
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Missing rates
validation_df.plot(x='feature', y='missing_rate', kind='bar', ax=axes[0, 0], legend=False)
axes[0, 0].set_title('Missing Rate by Feature')
axes[0, 0].set_ylabel('Missing Rate')
axes[0, 0].axhline(y=0.05, color='orange', linestyle='--', label='Warning')
axes[0, 0].axhline(y=0.20, color='red', linestyle='--', label='Hard Fail')
axes[0, 0].legend()

# Unique counts
validation_df.plot(x='feature', y='unique_count', kind='bar', ax=axes[0, 1], legend=False, color='green')
axes[0, 1].set_title('Unique Count by Feature')
axes[0, 1].set_ylabel('Unique Count')

# Mean values
validation_df.plot(x='feature', y='mean', kind='bar', ax=axes[1, 0], legend=False, color='orange')
axes[1, 0].set_title('Mean by Feature')
axes[1, 0].set_ylabel('Mean')

# Standard deviation
validation_df.plot(x='feature', y='std', kind='bar', ax=axes[1, 1], legend=False, color='purple')
axes[1, 1].set_title('Standard Deviation by Feature')
axes[1, 1].set_ylabel('Std Dev')

plt.tight_layout()
plt.show()

## 5. Dependency Graph Management

Build and resolve feature dependencies.

In [None]:
# Create dependency graph
graph = DependencyGraph()

# Define features and dependencies
graph.add_feature('balance', [])
graph.add_feature('credit_limit', [])
graph.add_feature('utilization', ['balance', 'credit_limit'])
graph.add_feature('log_utilization', ['utilization'])
graph.add_feature('balance_delta', ['balance'])
graph.add_feature('momentum', ['balance_delta'])

# Get execution order
execution_order = graph.topological_sort()

print("\nFeature Execution Order:")
for i, feature in enumerate(execution_order, 1):
    deps = graph.get_dependencies(feature)
    print(f"{i}. {feature:20s} (depends on: {', '.join(deps) if deps else 'none'})")

In [None]:
# Test cycle detection
try:
    bad_graph = DependencyGraph()
    bad_graph.add_feature('feature_a', ['feature_b'])
    bad_graph.add_feature('feature_b', ['feature_c'])
    bad_graph.add_feature('feature_c', ['feature_a'])  # Creates cycle
    
    bad_graph.topological_sort()
except ValueError as e:
    print(f"\n✓ Cycle detected correctly: {e}")

## 6. Feature Registry & Lineage

Track feature metadata and lineage for audit purposes.

In [None]:
# Create feature registry
registry = FeatureRegistry()

# Register features
registry.register(
    name='balance_delta',
    source_columns=['balance'],
    operation='delta',
    parameters={'periods': 1},
    window=None,
    missing_strategy='keep',
    dependencies=[],
    engine='pandas',
    output_dtype='float64'
)

registry.register(
    name='utilization',
    source_columns=['balance', 'credit_limit'],
    operation='ratio',
    parameters={},
    window=None,
    missing_strategy='zero',
    dependencies=[],
    engine='pandas',
    output_dtype='float64'
)

registry.register(
    name='log_utilization',
    source_columns=['utilization'],
    operation='log',
    parameters={'add_one': True},
    window=None,
    missing_strategy='keep',
    dependencies=['utilization'],
    engine='pandas',
    output_dtype='float64'
)

print("\n✓ Registered 3 features")
print("\nFeature Registry:")
for name, metadata in registry.features.items():
    print(f"\n{name}:")
    print(f"  Sources: {metadata.source_columns}")
    print(f"  Operation: {metadata.operation}")
    print(f"  Dependencies: {metadata.dependencies}")
    print(f"  Created: {metadata.created_timestamp}")

In [None]:
# Get lineage for a feature
lineage = registry.get_lineage('log_utilization')
print("\nLineage for 'log_utilization':")
import json
print(json.dumps(lineage, indent=2))

## 7. Complete Feature Engineering Pipeline

Put it all together in a complete pipeline.

In [None]:
# Define comprehensive feature set
recipes = [
    # Aggregations
    FeatureRecipe("max_dpd_3m", "days_past_due", AggregationType.MAX, TimeWindow.LAST_3M),
    FeatureRecipe("avg_balance_6m", "balance", AggregationType.MEAN, TimeWindow.LAST_6M),
    
    # Ratios
    FeatureRecipe("utilization", ["balance", "credit_limit"], "ratio"),
    FeatureRecipe("debt_per_account", ["total_debt", "num_accounts"], "ratio"),
    
    # Transformations
    FeatureRecipe("log_debt", "total_debt", "log", params={"add_one": True}),
]

config = FeatureEngineeringConfig(
    recipes=recipes,
    id_col="customer_id",
    time_col="snapshot_date",
    group_cols=["customer_id"]
)

engineer = PandasFeatureEngineer(config)
df_features = engineer.fit_transform(df)

print(f"\n✓ Created {len(engineer.created_features_)} features via pipeline")
print(f"\nFeatures: {engineer.created_features_}")
print(f"\nSample output:")
print(df_features[['customer_id'] + engineer.created_features_].head())

## 8. Summary

This playbook demonstrated:

1. **Temporal Trend Features**: delta, percent change, momentum, volatility, trend slope
2. **Categorical Encoding**: frequency, target mean, rare grouping
3. **Feature Validation**: Quality metrics, thresholds, PSI
4. **Dependency Management**: Topological sort, cycle detection
5. **Feature Registry**: Metadata tracking, lineage, audit trail
6. **Complete Pipeline**: Integration with existing feature engineering

All features are production-ready and integrate seamlessly with CR-Score pipelines.

In [None]:
print("\n" + "="*80)
print("Enhanced Feature Engineering Playbook Complete!")
print("="*80)