# Credit Risk Scoring Model - Feature Engineering

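This notebook engineers risk tiers and derived features from the cleaned loan data, then encodes, scales, and selects features for the credit risk scoring model.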

## Objectives:
1. Engineer risk tiers (Low, Medium, High)
2. Create derived features
3. Encode categorical variables
4. Scale numerical features
5. Select important features
6. Validate engineered features

In [None]:
# Import libraries
# Data-handling and plotting stack used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): StandardScaler and mutual_info_classif are not referenced
# directly in the cells visible here (scaling and selection go through
# FeatureEngineer) - confirm whether these imports are still needed.
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
import warnings

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# Silences every warning for the rest of the session, including ones that
# could matter while debugging (e.g. deprecations).
warnings.filterwarnings('ignore')

# Import custom modules
import sys
# Adds the parent directory to the import path so the `src` package resolves;
# assumes the kernel's working directory is one level below the project root.
sys.path.append('../')
from src.feature_engineering import FeatureEngineer
# NOTE(review): load_config is not called in the cells visible here.
from src.utils import load_config

print("Libraries imported successfully!")

## 1. Load Preprocessed Data

In [None]:
# Read the cleaned dataset from the processed-data directory
df = pd.read_csv('../data/processed/cleaned_data.csv')

# Report the frame's dimensions and column names, then preview the first rows
row_count, column_count = df.shape
column_names = df.columns.tolist()
print(f"Data loaded: {row_count:,} rows × {column_count} columns")
print(f"\nColumns: {column_names}")
df.head()

## 2. Engineer Risk Tiers

In [None]:
# Build the feature-engineering helper and derive the risk tier column
engineer = FeatureEngineer()
df_risk = engineer.engineer_risk_tiers(df)

# Show absolute counts first, then the same breakdown as percentages
tier_series = df_risk['risk_tier']
print("Risk Tier Distribution:")
print(tier_series.value_counts())
print(f"\nPercentages:")
print(tier_series.value_counts(normalize=True) * 100)

In [None]:
# Plot the tier counts twice: as bars (left) and as proportions (right)
tier_counts = df_risk['risk_tier'].value_counts()
colors = {'Low': 'green', 'Medium': 'orange', 'High': 'red'}
# One color per tier, in the same order as the counts; shared by both panels
tier_colors = [colors[tier] for tier in tier_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: absolute counts per tier
bar_ax.bar(tier_counts.index, tier_counts.values, color=tier_colors)
bar_ax.set_title('Risk Tier Distribution', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Risk Tier', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)
bar_ax.grid(axis='y', alpha=0.3)

# Right panel: share of each tier
pie_ax.pie(
    tier_counts.values,
    labels=tier_counts.index,
    autopct='%1.1f%%',
    colors=tier_colors,
    startangle=90,
)
pie_ax.set_title('Risk Tier Proportion', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Compare loan outcomes across risk tiers (row-normalized, in percent)
if 'loan_status' in df_risk.columns:
    cross_tab = pd.crosstab(
        df_risk['risk_tier'],
        df_risk['loan_status'],
        normalize='index',
    ) * 100
    print("Default Rate by Risk Tier:")
    print(cross_tab)

    # Grouped bars: one bar per loan status within each tier.
    # NOTE(review): the colors and legend labels assume the loan_status
    # columns appear in the order 0 (paid), 1 (default) - confirm.
    status_ax = cross_tab.plot(
        kind='bar',
        stacked=False,
        figsize=(10, 6),
        color=['green', 'red'],
    )
    status_ax.set_title('Loan Status by Risk Tier', fontsize=14, fontweight='bold')
    status_ax.set_xlabel('Risk Tier', fontsize=12)
    status_ax.set_ylabel('Percentage (%)', fontsize=12)
    status_ax.legend(['Paid (0)', 'Default (1)'])
    plt.xticks(rotation=0)
    status_ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

## 3. Create Derived Features

In [None]:
# Add the derived columns on top of the risk-tier frame
df_featured = engineer.create_derived_features(df_risk)

# Columns present after the step that were absent before it
new_features = [col for col in df_featured.columns if col not in df_risk.columns]

n_before = len(df_risk.columns)
n_after = len(df_featured.columns)
print(f"Features before: {n_before}")
print(f"Features after: {n_after}")
print(f"New features created: {n_after - n_before}")

# List each newly created column by name
print(f"\nNew features:")
for name in new_features:
    print(f"  - {name}")

In [None]:
# Inspect the debt-to-income ratio, but only if that feature was created
if 'debt_to_income_ratio' in df_featured.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, box_ax = axes

    # Left panel: overall distribution (missing values dropped)
    dti_values = df_featured['debt_to_income_ratio'].dropna()
    hist_ax.hist(dti_values, bins=50, color='skyblue', edgecolor='black')
    hist_ax.set_title('Debt-to-Income Ratio Distribution', fontsize=12, fontweight='bold')
    hist_ax.set_xlabel('Debt-to-Income Ratio', fontsize=10)
    hist_ax.set_ylabel('Frequency', fontsize=10)
    hist_ax.grid(alpha=0.3)

    # Right panel: the same ratio split by risk tier
    df_featured.boxplot(column='debt_to_income_ratio', by='risk_tier', ax=box_ax)
    box_ax.set_title('Debt-to-Income Ratio by Risk Tier', fontsize=12, fontweight='bold')
    box_ax.set_xlabel('Risk Tier', fontsize=10)
    box_ax.set_ylabel('Debt-to-Income Ratio', fontsize=10)
    # Blank out the figure-level title that pandas adds for grouped boxplots
    plt.suptitle('')

    plt.tight_layout()
    plt.show()

## 4. Encode Categorical Features

In [None]:
# List the columns stored with object or category dtype
categorical_frame = df_featured.select_dtypes(include=['object', 'category'])
categorical_cols = categorical_frame.columns.tolist()
print(f"Categorical features: {categorical_cols}")

# One-hot encode, fitting the encoder on this data.
# NOTE(review): 'risk_tier' may itself be an object column at this point -
# confirm the encoder does not turn the target into one-hot feature columns.
df_encoded = engineer.encode_categorical_features(
    df_featured,
    method='onehot',
    fit=True,
)

encoded_count = len(df_encoded.columns)
print(f"\nFeatures after encoding: {encoded_count}")
print(f"Features added: {encoded_count - len(df_featured.columns)}")

## 5. Scale Numerical Features

In [None]:
# Target/label columns that are present and must be left unscaled
candidate_targets = ('risk_tier', 'loan_status', 'risk_tier_encoded')
target_cols = [name for name in candidate_targets if name in df_encoded.columns]

# Standardize everything else.
# NOTE(review): the scaler is fitted on the full dataset here, while the
# summary cell lists the train/test split as a later step - confirm this
# ordering is intended.
df_scaled = engineer.scale_numerical_features(
    df_encoded,
    method='standard',
    fit=True,
    exclude_cols=target_cols,
)

print("Features scaled successfully!")
print(f"\nSample scaled values (first 5 rows):")
# Preview only the first five numeric columns to keep the output narrow
numeric_cols = df_scaled.select_dtypes(include=[np.number]).columns[:5]
df_scaled[numeric_cols].head()

In [None]:
# Side-by-side histograms of person_income before and after scaling
if 'person_income' in df_featured.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # (axes, source frame, fill color, title, x-axis label) for each panel
    panels = [
        (axes[0], df_featured, 'coral', 'Person Income - Before Scaling', 'Income'),
        (axes[1], df_scaled, 'skyblue', 'Person Income - After Scaling', 'Scaled Income'),
    ]
    for panel_ax, frame, fill, title, x_label in panels:
        # The scaled frame may lack the column; leave that panel empty then
        if 'person_income' not in frame.columns:
            continue
        panel_ax.hist(frame['person_income'].dropna(), bins=50, color=fill, edgecolor='black')
        panel_ax.set_title(title, fontsize=12, fontweight='bold')
        panel_ax.set_xlabel(x_label, fontsize=10)
        panel_ax.set_ylabel('Frequency', fontsize=10)

    plt.tight_layout()
    plt.show()

## 6. Feature Selection

In [None]:
# Split the scaled frame into a feature matrix and a single target vector
target_cols = ['risk_tier', 'loan_status', 'risk_tier_encoded']
available_targets = [col for col in target_cols if col in df_scaled.columns]

# Every column that is not a target is treated as a candidate feature
feature_cols = [col for col in df_scaled.columns if col not in available_targets]

# Prefer the encoded risk tier as the target; otherwise fall back to loan_status
if 'risk_tier_encoded' in df_scaled.columns:
    target_col = 'risk_tier_encoded'
else:
    target_col = 'loan_status'

X = df_scaled[feature_cols]
y = df_scaled[target_col]

print(f"Features: {len(feature_cols)}")
print(f"Target: {target_col}")
print(f"Samples: {len(X)}")

### 6.1 Mutual Information Selection

In [None]:
# Ask the engineer for the 15 best features by mutual information
mi_features, mi_scores = engineer.select_features_mutual_info(X, y, k=15)

print(f"Selected {len(mi_features)} features using mutual information")
print(f"\nTop 10 features:")
# Numbered listing of the first ten selected names
for rank, name in enumerate(mi_features[:10], start=1):
    print(f"  {rank}. {name}")

In [None]:
# Plot the mutual information scores for the top 20 features
mi_feature_names = X.columns.tolist()
engineer.visualize_feature_importance(
    mi_feature_names,
    mi_scores,
    title="Mutual Information Scores",
    top_n=20,
)

### 6.2 Correlation-Based Selection

In [None]:
# Drop features flagged by the engineer's correlation filter (threshold 0.9)
corr_features = engineer.select_features_correlation(X, threshold=0.9)

kept_count = len(corr_features)
print(f"Features after correlation removal: {kept_count}")
print(f"Features removed: {len(feature_cols) - kept_count}")

### 6.3 Model-Based Selection

In [None]:
# Ask the engineer for the 15 best features by Random Forest importance
rf_features, rf_importances = engineer.select_features_model_based(X, y, k=15)

print(f"Selected {len(rf_features)} features using Random Forest importance")
print(f"\nTop 10 features:")
# Numbered listing of the first ten selected names
for rank, name in enumerate(rf_features[:10], start=1):
    print(f"  {rank}. {name}")

In [None]:
# Plot the Random Forest importances for the top 20 features
rf_feature_names = X.columns.tolist()
engineer.visualize_feature_importance(
    rf_feature_names,
    rf_importances,
    title="Random Forest Feature Importance",
    top_n=20,
)

### 6.4 Combined Feature Selection

In [None]:
# Combine all selection methods (intersection).
# Keep only the features chosen by every method. The mutual-information order
# is preserved instead of relying on set iteration order, which for strings
# can change between kernel sessions; this keeps the printed list and the
# column order of anything built from `final_features` reproducible.
corr_selected = set(corr_features)
rf_selected = set(rf_features)
final_features = [
    feat for feat in dict.fromkeys(mi_features)  # de-duplicate, keep order
    if feat in corr_selected and feat in rf_selected
]

print(f"Final selected features: {len(final_features)}")
if not final_features:
    # An empty intersection would leave only target columns downstream
    print("Warning: no feature was selected by all three methods.")
print(f"\nSelected features:")
for i, feat in enumerate(final_features, 1):
    print(f"  {i}. {feat}")

# Visualize overlap.
# matplotlib-venn is an optional dependency, so the import lives inside the
# try block: a missing package prints the install hint instead of aborting
# the cell.
try:
    from matplotlib_venn import venn3
except ImportError:
    print("Note: Install matplotlib-venn to visualize overlap: pip install matplotlib-venn")
else:
    # The diagram is a best-effort extra; report a plotting failure with its
    # real cause rather than mislabeling it as a missing package.
    try:
        plt.figure(figsize=(10, 8))
        venn3([set(mi_features), set(rf_features), set(corr_features)],
              ('Mutual Info', 'Random Forest', 'Correlation'))
        plt.title('Feature Selection Method Overlap', fontsize=14, fontweight='bold')
        plt.show()
    except Exception as exc:
        print(f"Could not draw feature overlap diagram: {exc}")

## 7. Save Engineered Features

In [None]:
# Assemble the output frame: selected features followed by the target columns
final_cols = [*final_features, *available_targets]
df_final = df_scaled.loc[:, final_cols]

# Write the result next to the other processed data, without the index
output_path = '../data/processed/engineered_features.csv'
df_final.to_csv(output_path, index=False)

# Confirmation summary of what was written
print(f"✓ Engineered features saved to: {output_path}")
print(f"✓ Final dataset shape: {df_final.shape}")
print(f"✓ Features: {len(final_features)}")
print(f"✓ Targets: {len(available_targets)}")

## 8. Summary Statistics

In [None]:
# Recap of column counts at each stage plus the final risk tier breakdown
banner = "=" * 60
print(banner)
print("FEATURE ENGINEERING SUMMARY")
print(banner)

print(f"\n1. Original features: {df.shape[1]}")
print(f"2. After derived features: {df_featured.shape[1]}")
print(f"3. After encoding: {df_encoded.shape[1]}")
print(f"4. Final selected features: {len(final_features)}")

print(f"\n5. Risk tier distribution:")
final_columns = df_final.columns
if 'risk_tier' in final_columns:
    print(df_final['risk_tier'].value_counts())
elif 'risk_tier_encoded' in final_columns:
    # NOTE(review): assumes the encoding is 0=Low, 1=Medium, 2=High -
    # confirm against the encoder used by FeatureEngineer.
    tier_map = {0: 'Low', 1: 'Medium', 2: 'High'}
    decoded_tiers = df_final['risk_tier_encoded'].map(tier_map)
    print(decoded_tiers.value_counts())

print(f"\n6. Dataset ready for modeling: ✓")
print(f"   - Train/test split: Next step")
print(f"   - Model training: Next step")

print("\n" + banner)