# Credit Card Churn Prediction - Advanced ML Pipeline

**Goal**: Predict customer churn with advanced ML models and optimize business ROI

**Models**: XGBoost, LightGBM, Random Forest  
**Focus**: F2-score (prioritize recall), business cost optimization, model interpretability


## 1. Setup


In [1]:
# Core imports
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append(str(Path.cwd().parent / 'src'))

# ML imports
from sklearn.model_selection import train_test_split
from models.churn_predictor import ChurnPredictor
from business.clv_calculator import CLVCalculator
from business.cost_analysis import CostAnalyzer
from utils.data_preprocessing import DataPreprocessor
import shap

# Style
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

print('✓ Setup complete')


✓ Setup complete


## 2. Load Data

Automatically loads from R2 if `.env` exists, otherwise uses synthetic data.


In [2]:
from dotenv import load_dotenv
import os

# Load .env from project root (correct path)
PROJ_ROOT = Path.cwd().parent
env_loaded = load_dotenv(dotenv_path=PROJ_ROOT / '.env')

if env_loaded:
    # Load from R2
    import boto3
    from io import BytesIO
    
    print('✓ Loading data from R2...')
    s3_client = boto3.client(
        's3',
        endpoint_url=f"https://{os.getenv('ACCOUNT_ID')}.r2.cloudflarestorage.com",
        aws_access_key_id=os.getenv('ACCESS_KEY_ID_USER2'),
        aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY_USER2'),
        region_name='auto'
    )
    
    s3_object = s3_client.get_object(Bucket='cc-churn-splits', Key='train_data.parquet.gzip')
    df = pd.read_parquet(BytesIO(s3_object['Body'].read()), dtype_backend='pyarrow')
    
    # Convert categorical columns
    df = df.astype({
        "gender": 'string[pyarrow]',
        "marital_status": 'string[pyarrow]',
        "income_category": 'string[pyarrow]',
        "card_category": 'string[pyarrow]',
    })
    
    print(f'✓ Loaded {len(df):,} real customers from R2')
else:
    # Generate synthetic data
    print('⚠ No .env found, using synthetic data')
    np.random.seed(42)
    n = 2000
    
    df = pd.DataFrame({
        'customer_age': np.random.randint(25, 75, n),
        'credit_limit': np.random.uniform(1000, 50000, n),
        'total_revolving_bal': np.random.uniform(0, 20000, n),
        'total_trans_amt': np.random.uniform(500, 50000, n),
        'total_trans_ct': np.random.randint(10, 200, n),
        'months_on_book': np.random.randint(1, 60, n),
        'months_inactive_12_mon': np.random.randint(0, 6, n),
        'avg_utilization_ratio': np.random.uniform(0, 1, n),
        'gender': np.random.choice(['M', 'F'], n),
        'card_category': np.random.choice(['Blue', 'Silver', 'Gold', 'Platinum'], n),
    })
    
    # Synthetic churn (16% rate)
    churn_prob = 0.1 + 0.3 * (df['avg_utilization_ratio'] > 0.8) + \
                 0.2 * (df['months_inactive_12_mon'] > 2) + np.random.normal(0, 0.1, n)
    df['is_churned'] = np.random.binomial(1, np.clip(churn_prob, 0, 1), n)
    print(f'✓ Generated {len(df):,} synthetic customers')

print(f'✓ Shape: {df.shape}')
print(f'✓ Churn rate: {df["is_churned"].mean():.1%}')
df.head()


✓ Loading data from R2...
✓ Loaded 6,285 real customers from R2
✓ Shape: (6285, 21)
✓ Churn rate: 16.1%


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,...,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,714283458,0,40,M,2,College,Single,$80K - $120K,Blue,36,...,1,4,14544.0,0.0,14544.0,0.768,4064.0,92,0.769,0.0
1,787587033,0,42,F,4,Graduate,Single,Less than $40K,Blue,32,...,1,2,2996.0,1992.0,1004.0,0.948,4463.0,87,0.74,0.665
2,714672933,0,52,F,3,Graduate,Married,$40K - $60K,Blue,36,...,4,2,3143.0,2268.0,875.0,0.801,4417.0,84,0.68,0.722
3,714974658,0,48,F,4,College,Married,Less than $40K,Blue,36,...,1,4,2464.0,1867.0,597.0,0.6,1219.0,35,1.333,0.758
4,712049208,0,56,M,3,Post-Graduate,Married,$60K - $80K,Blue,39,...,1,2,3955.0,2517.0,1438.0,0.484,1238.0,25,1.083,0.636


## 3. Preprocess & Split


In [3]:
# Preprocess
preprocessor = DataPreprocessor(random_state=42)
df_processed = preprocessor.fit_preprocessing_pipeline(df, target_column='is_churned')

# Separate features and target
X = df_processed.drop(columns=['is_churned'])
y = df_processed['is_churned']

# Handle infinite values
X = X.replace([np.inf, -np.inf], np.nan)
for col in X.columns[X.isna().any()]:
    X[col] = X[col].fillna(X[col].median())

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f'✓ Train: {X_train.shape[0]:,} samples, {X_train.shape[1]} features')
print(f'✓ Test:  {X_test.shape[0]:,} samples')
print(f'✓ Churn rate - Train: {y_train.mean():.1%}, Test: {y_test.mean():.1%}')


Starting data preprocessing pipeline...
Data validation completed. Shape: (6285, 21)
Missing values handled
Categorical variables encoded
Outliers handled
Created 8 interaction features
Created 24 polynomial features
Features scaled
Preprocessing completed. Final shape: (6285, 61)
✓ Train: 5,028 samples, 60 features
✓ Test:  1,257 samples
✓ Churn rate - Train: 16.1%, Test: 16.1%


## 4. Train Models

Trains XGBoost, LightGBM, and Random Forest with SMOTE sampling.


In [4]:
# Initialize predictor
predictor = ChurnPredictor(random_state=42)

# Train with cross-validation
print('Training models (this may take a few minutes)...')
results = predictor.fit(
    X_train, y_train, 
    sampling_method='smote',
    optimize_threshold=True,
    cv_folds=5
)

print(f'\n✓ Best model: {results["best_model"]}')
print(f'✓ Optimal threshold: {results["optimal_threshold"]:.3f}')
print(f'✓ CV F2 Score: {results["cv_scores"]["f2"]:.3f} (±{results["cv_scores"]["f2_std"]:.3f})')


Training models (this may take a few minutes)...
Starting model training...
Data shape: (5028, 60)
Class distribution: {0: 4220, 1: 808}
Class imbalance ratio: 0.191
Applying smote sampling...
After sampling: (8440, 60), Class distribution: {0: 4220, 1: 4220}

Training xgboost...
xgboost CV F2: 0.970 ± 0.003
xgboost CV AUC: 0.998 ± 0.001

Training lightgbm...
lightgbm CV F2: 0.984 ± 0.002
lightgbm CV AUC: 0.998 ± 0.000

Training random_forest...
random_forest CV F2: 0.979 ± 0.003
random_forest CV AUC: 0.995 ± 0.001

Training logistic_regression...
logistic_regression CV F2: 0.932 ± 0.006
logistic_regression CV AUC: 0.981 ± 0.002

Best model: lightgbm
Best F2 score: 0.984
Optimized threshold: 0.551
Best F2 score at threshold: 1.000

✓ Best model: lightgbm
✓ Optimal threshold: 0.551


KeyError: 'f2'

## 5. Evaluate Performance


In [None]:
# Predictions
y_pred, y_pred_proba = predictor.predict(X_test)
metrics = predictor.evaluate_model(X_test, y_test)

# Display results
print('='*50)
print('MODEL PERFORMANCE')
print('='*50)
print(f'F2 Score:   {metrics["f2"]:.3f}  (recall-focused)')
print(f'Recall:     {metrics["recall"]:.3f}  (catch churning customers)')
print(f'Precision:  {metrics["precision"]:.3f}  (avoid false alarms)')
print(f'AUC:        {metrics["auc"]:.3f}  (discrimination ability)')
print('='*50)

# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(f'\nTrue Negatives:  {cm[0,0]:4d}  (correct non-churn)')
print(f'False Positives: {cm[0,1]:4d}  (false alarm)')
print(f'False Negatives: {cm[1,0]:4d}  (missed churn) ← minimize this')
print(f'True Positives:  {cm[1,1]:4d}  (caught churn)')


## 6. Business Analysis

Calculate CLV and ROI for intervention strategy.


In [None]:
# Initialize business components
clv_calc = CLVCalculator(
    interchange_rate=0.02,      # 2% interchange fee
    apr=0.18,                   # 18% APR
    expected_tenure=3.0,        # 3 years
    intervention_cost=50.0,     # $50 per intervention
    success_rate=0.40,          # 40% success
    cac=200.0                   # $200 acquisition cost
)

cost_analyzer = CostAnalyzer(clv_calc)

# Calculate CLV (use original data before preprocessing)
X_test_original = df.loc[X_test.index]
if 'is_churned' in X_test_original.columns:
    X_test_original = X_test_original.drop(columns=['is_churned'])

clv_values = clv_calc.calculate_clv(X_test_original)

# Generate cost report
cost_report = cost_analyzer.generate_cost_report(y_test, y_pred_proba, clv_values)

# Display results
print('='*50)
print('BUSINESS IMPACT ANALYSIS')
print('='*50)
print(f'Total customers:     {cost_report["overview"]["total_customers"]:,}')
print(f'Churn rate:          {cost_report["overview"]["churn_rate"]:.1%}')
print(f'Average CLV:         ${clv_values.mean():,.2f}')
print(f'Total at-risk value: ${cost_report["overview"]["total_clv"]:,.2f}')

# Optimal strategy
profit_results = cost_report['optimal_thresholds']['profit_maximization']['results']
print('\n' + '='*50)
print('OPTIMAL INTERVENTION STRATEGY')
print('='*50)
print(f'Customers to target: {profit_results["customers_targeted"]:,}')
print(f'Expected profit:     ${profit_results["total_profit"]:,.2f}')
print(f'ROI:                 {profit_results["roi_percentage"]:.0f}%')
print(f'Optimal threshold:   {cost_report["optimal_thresholds"]["profit_maximization"]["threshold"]:.3f}')
print('='*50)


## 7. Model Interpretability

Understand which features drive churn predictions.


In [None]:
# Feature importance from model
importance = predictor.get_feature_importance().head(10)

plt.figure(figsize=(10, 6))
plt.barh(range(len(importance)), importance['importance'])
plt.yticks(range(len(importance)), importance['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 10 Features Driving Churn Prediction')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print('Top 5 most important features:')
for i, row in importance.head(5).iterrows():
    print(f'  {row["feature"]:30s} {row["importance"]:.3f}')


In [None]:
# SHAP analysis (sample for speed)
print('Calculating SHAP values...')
explainer = shap.TreeExplainer(predictor.best_model)
sample_size = min(200, len(X_test))
X_sample = X_test.sample(n=sample_size, random_state=42)
shap_values = explainer.shap_values(X_sample)

# Summary plot
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_sample, max_display=10, show=False)
plt.title('SHAP Feature Impact on Churn Prediction')
plt.tight_layout()
plt.show()

print('✓ Red = high feature value pushes prediction higher')
print('✓ Blue = low feature value')
print('✓ X-axis = impact on prediction')


## 8. Summary & Recommendations


In [None]:
# Final summary
print('='*60)
print('CREDIT CARD CHURN PREDICTION - FINAL RESULTS')
print('='*60)

print(f'Best Model: {results["best_model"]}')
print(f'F2 Score:  {metrics["f2"]:.3f}')
print(f'Recall:    {metrics["recall"]:.3f}')

# Business impact
profit_results = cost_report['optimal_thresholds']['profit_maximization']['results']
print(f'\nBusiness Impact:')
print(f'  • Target {profit_results["customers_targeted"]:,} customers')
print(f'  • Expected profit: ${profit_results["total_profit"]:,.0f}')
print(f'  • ROI: {profit_results["roi_percentage"]:.0f}%')

# Key recommendations
print('\n' + '='*60)
print('RECOMMENDATIONS')
print('='*60)
print('1. Deploy model with optimal threshold for profit maximization')
print('2. Focus retention efforts on high-CLV customers')
print('3. Monitor model performance monthly for drift')
print('4. A/B test intervention strategies to improve success rate')
print('5. Expand feature set with transaction patterns if available')

print('\n' + '='*60)
print('Next Steps:')
print('• Run hyperparameter optimization for production model')
print('• Implement model monitoring dashboard')
print('• Design customer intervention workflows')
print('• Calculate full ROI for deployment decision')
print('='*60)
