In [None]:
# Feature Engineering Deep Dive - 4 Pillar Architecture

This notebook provides an in-depth analysis of the 4-pillar feature architecture that powers the Enterprise Fraud Detection System.

## Table of Contents

1. [Feature Architecture Overview](#overview)
2. [Pillar 1: Profile Features](#pillar1)
3. [Pillar 2: Behavioral Features](#pillar2)
4. [Pillar 3: Network Features](#pillar3)
5. [Pillar 4: Contextual Features](#pillar4)
6. [Feature Importance Analysis](#importance)
7. [Feature Engineering Best Practices](#best-practices)

---

## Introduction

The 4-pillar feature architecture is designed to build a comprehensive understanding of both customers and their transactions:

- **Pillar 1**: Profile Features (Who the customer is)
- **Pillar 2**: Behavioral Features (How the customer behaves)
- **Pillar 3**: Network Features (Who the customer connects with)
- **Pillar 4**: Contextual Features (What the customer is doing now)

Each pillar contributes unique insights that, when combined, provide a holistic view of fraud risk.


In [None]:
# Setup and imports
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

# Add the src directory to Python path.
# Guarded so that re-running this cell does not keep appending duplicates.
if '../src' not in sys.path:
    sys.path.append('../src')

# Set plotting style.
# 'seaborn-v0_8' only exists in matplotlib >= 3.6; older releases ship the
# same style sheet under the name 'seaborn', so fall back instead of raising.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")

print("🔧 Feature Analysis Environment Setup Complete")
print(f"📅 Analysis Session: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Seed NumPy's global generator so the synthetic data below is reproducible
np.random.seed(42)


In [None]:
<a id="pillar1"></a>
## Pillar 1: Profile Features Analysis

Profile features capture stable customer characteristics that change slowly over time. These features help establish a baseline understanding of "who the customer is" and their relationship with the financial institution.


In [None]:
# Generate comprehensive customer profile data
def generate_profile_features(n_customers=5000, random_state=None):
    """Generate synthetic profile features for analysis.

    Parameters
    ----------
    n_customers : int, default 5000
        Number of synthetic customers to generate. Zero (or a negative
        value) yields an empty frame that still carries every column, so
        downstream column lookups do not fail.
    random_state : None, int or numpy.random.RandomState, default None
        Source of randomness. ``None`` draws from NumPy's global generator
        (the original behaviour, controlled by ``np.random.seed``). An int
        seeds a private ``RandomState``; an existing ``RandomState`` is used
        as-is. Either makes the output independent of global state.

    Returns
    -------
    pandas.DataFrame
        One row per customer with the columns ``customer_id``, ``age``,
        ``account_age_days``, ``total_products``, ``credit_score_internal``,
        ``kyc_completion_score``, ``income_bracket``, ``is_pep`` and
        ``is_fraud``.
    """
    columns = [
        'customer_id',
        'age',
        'account_age_days',
        'total_products',
        'credit_score_internal',
        'kyc_completion_score',
        'income_bracket',
        'is_pep',
        'is_fraud',
    ]

    # Resolve the random source. The np.random module and RandomState expose
    # the same sampling methods, so the loop below is agnostic to which is used.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    customers = []

    # NOTE: the order of random draws is part of the contract for seeded
    # runs; do not reorder the sampling statements below.
    for i in range(n_customers):
        # Basic demographics: age ~ N(40, 15), truncated to an int in [18, 80]
        age = rng.normal(40, 15)
        age = max(18, min(80, int(age)))

        # Account characteristics: tenure in days, clamped to [1, 3650]
        account_age_days = rng.exponential(600)
        account_age_days = max(1, min(3650, int(account_age_days)))

        # Product portfolio: at least one product per customer
        total_products = max(1, int(rng.poisson(2.5)))

        # Credit information
        # Higher age shifts the mean credit score upwards (2 points per year)
        credit_base = 550 + (age - 30) * 2
        credit_score = max(300, min(850, int(rng.normal(credit_base, 100))))

        # KYC completion: longer-tenured accounts tend to have more complete
        # KYC; the tenure effect saturates after one year (base 0.6 -> 0.9)
        kyc_base = 0.6 + (min(account_age_days, 365) / 365) * 0.3
        kyc_completion = min(1.0, rng.beta(4, 1) * kyc_base)

        # Income estimation (driven by credit score and age)
        income_bracket = "low"
        if credit_score > 700 and age > 35:
            income_bracket = str(rng.choice(["medium", "high"], p=[0.6, 0.4]))
        elif credit_score > 600:
            income_bracket = str(rng.choice(["low", "medium"], p=[0.4, 0.6]))

        # PEP status (rare: 1.5% of customers)
        is_pep = bool(rng.choice([True, False], p=[0.015, 0.985]))

        # Fraud label: 1% base rate plus an additive bump per risk factor
        fraud_probability = 0.01

        # Risk factors
        if credit_score < 500:
            fraud_probability += 0.03
        if kyc_completion < 0.5:
            fraud_probability += 0.02
        if is_pep:
            fraud_probability += 0.04
        if account_age_days < 30:
            fraud_probability += 0.02
        if total_products > 5:
            fraud_probability += 0.01

        is_fraud = bool(
            rng.choice([True, False], p=[fraud_probability, 1 - fraud_probability])
        )

        customers.append({
            'customer_id': f'cust_{i:06d}',
            'age': age,
            'account_age_days': account_age_days,
            'total_products': total_products,
            'credit_score_internal': credit_score,
            'kyc_completion_score': kyc_completion,
            'income_bracket': income_bracket,
            'is_pep': is_pep,
            'is_fraud': is_fraud,
        })

    # Passing the column list keeps the schema stable even when no rows exist
    return pd.DataFrame(customers, columns=columns)

# Build the synthetic customer table used for the Pillar 1 analysis
profile_df = generate_profile_features(n_customers=5000)

# Headline figures for the generated population, one per output line
print(
    "👤 Profile Features Dataset",
    "=" * 30,
    f"📊 Total Customers: {len(profile_df):,}",
    f"🚨 Fraud Rate: {profile_df['is_fraud'].mean():.2%}",
    f"📈 Average Age: {profile_df['age'].mean():.1f}",
    f"💳 Average Credit Score: {profile_df['credit_score_internal'].mean():.0f}",
    f"⚖️ PEP Rate: {profile_df['is_pep'].mean():.2%}",
    sep="\n",
)

# Peek at the first few generated rows
print("\n📋 Sample Profile Data:")
print(profile_df.head())

# Summary statistics of the profile columns, rounded for readability
profile_stats = profile_df.describe()
print("\n📊 Profile Feature Statistics:")
print(profile_stats.round(2))
