## Feature Engineering

- Phase 1: Bureau (credit history)
- Phase 2: Bureau Balance (monthly credit status)
- Phase 3: Previous Applications
- Phase 4: POS & Credit Card
- Phase 5: Installments


In [297]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import joblib
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
import json
import os

from credit_model_helpers import (
    aggregate_numeric, aggregate_categorical_ohe,
    filter_by_missing, remove_low_variance, remove_correlated,
    get_feature_importances, select_by_importance_threshold,
    train_preliminary_model, compare_feature_sets, print_comparison_results
)
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
mlflow.lightgbm.autolog(disable=True)
mlflow.xgboost.autolog(disable=True)


### Setup: Feature Definitions


In [212]:
NUMERICAL_FEATURES = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
    'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
    'DAYS_LAST_PHONE_CHANGE', 'REGION_POPULATION_RELATIVE',
    'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
    'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
    'CNT_FAM_MEMBERS', 'HOUR_APPR_PROCESS_START'
]

CATEGORICAL_FEATURES = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE'
]

BASELINE_FEATURES = NUMERICAL_FEATURES + CATEGORICAL_FEATURES

def create_cat_mappings(X_train, cat_cols):
    mappings = {}
    for col in cat_cols:
        unique_vals = X_train[col].dropna().unique()
        mappings[col] = {val: i for i, val in enumerate(unique_vals)}
    return mappings

print(f"Baseline features: {len(BASELINE_FEATURES)} ({len(NUMERICAL_FEATURES)} num + {len(CATEGORICAL_FEATURES)} cat)")


Baseline features: 36 (25 num + 11 cat)


In [213]:
df = pd.read_csv('../../data/raw/application_train.csv')
test_df = pd.read_csv('../../data/raw/application_test.csv')

X = df[BASELINE_FEATURES].copy()
y = df['TARGET'].copy()

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

cat_mappings = create_cat_mappings(X_train, CATEGORICAL_FEATURES)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(test_df)}")
print(f"Categorical mappings created for {len(cat_mappings)} features")


Train: 246008, Val: 61503, Test: 48744
Categorical mappings created for 11 features


### Phase 1: Bureau Features



In [214]:
bureau = pd.read_csv('../../data/raw/bureau.csv')
print(f"Bureau shape: {bureau.shape}")
print(f"Unique SK_ID_CURR: {bureau['SK_ID_CURR'].nunique()}")


Bureau shape: (1716428, 17)
Unique SK_ID_CURR: 305811


### Bureau Aggregations


In [215]:
bureau_agg = bureau.groupby('SK_ID_CURR').agg({
    'DAYS_CREDIT': ['min', 'max', 'mean', 'std'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean', 'sum'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_SUM': ['sum', 'mean', 'std', 'max'],
    'AMT_CREDIT_SUM_DEBT': ['sum', 'mean', 'max'],
    'AMT_CREDIT_SUM_OVERDUE': ['sum', 'max'],
    'AMT_CREDIT_SUM_LIMIT': ['sum', 'mean', 'max'],
    'DAYS_CREDIT_UPDATE': ['min', 'max', 'mean']
}).reset_index()

bureau_agg.columns = ['SK_ID_CURR'] + [f'bureau_{col[0]}_{col[1]}' for col in bureau_agg.columns[1:]]

print(f"Bureau aggregated features: {bureau_agg.shape[1] - 1}")


Bureau aggregated features: 27


In [216]:
bureau_agg = bureau.groupby('SK_ID_CURR').agg({
    'DAYS_CREDIT': ['min', 'max', 'mean', 'std'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean', 'sum'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_SUM': ['sum', 'mean', 'std', 'max'],
    'AMT_CREDIT_SUM_DEBT': ['sum', 'mean', 'max'],
    'AMT_CREDIT_SUM_OVERDUE': ['sum', 'max'],
    'AMT_CREDIT_SUM_LIMIT': ['sum', 'mean', 'max'],
    'DAYS_CREDIT_UPDATE': ['min', 'max', 'mean']
}).reset_index()

bureau_agg.columns = ['SK_ID_CURR'] + [f'bureau_{col[0]}_{col[1]}' for col in bureau_agg.columns[1:]]

print(f"Bureau aggregated features: {bureau_agg.shape[1] - 1}")


Bureau aggregated features: 27


In [217]:
bureau_active = aggregate_categorical_ohe(bureau, 'SK_ID_CURR', 'CREDIT_ACTIVE', 'bureau_active')
bureau_type = aggregate_categorical_ohe(bureau, 'SK_ID_CURR', 'CREDIT_TYPE', 'bureau_type')

bureau_agg = bureau_agg.merge(bureau_active, on='SK_ID_CURR', how='left')
bureau_agg = bureau_agg.merge(bureau_type, on='SK_ID_CURR', how='left')

print(f"Total bureau features after categorical: {bureau_agg.shape[1] - 1}")


Total bureau features after categorical: 46


In [218]:
bureau_agg['bureau_debt_credit_ratio'] = bureau_agg['bureau_AMT_CREDIT_SUM_DEBT_sum'] / (bureau_agg['bureau_AMT_CREDIT_SUM_sum'] + 1)
bureau_agg['bureau_overdue_debt_ratio'] = bureau_agg['bureau_AMT_CREDIT_SUM_OVERDUE_sum'] / (bureau_agg['bureau_AMT_CREDIT_SUM_DEBT_sum'] + 1)
bureau_agg['bureau_active_closed_ratio'] = bureau_agg['bureau_active_Active'] / (bureau_agg['bureau_active_Closed'] + 1)

if 'bureau_active_Bad debt' in bureau_agg.columns:
    bureau_agg['bureau_bad_debt_ratio'] = bureau_agg['bureau_active_Bad debt'] / (bureau_agg[['bureau_active_Active', 'bureau_active_Closed']].sum(axis=1) + 1)

bureau_agg['bureau_credit_count'] = bureau.groupby('SK_ID_CURR').size().values
bureau_agg['bureau_avg_days_between'] = bureau_agg['bureau_DAYS_CREDIT_max'] - bureau_agg['bureau_DAYS_CREDIT_min']

print(f"Total features with derived: {bureau_agg.shape[1] - 1}")


Total features with derived: 52


### Level 1 Filtering


In [219]:
bureau_features = bureau_agg.drop(columns=['SK_ID_CURR']).select_dtypes(include=[np.number])

print(f"Before filtering: {bureau_features.shape[1]} features")

bureau_features, dropped_missing = filter_by_missing(bureau_features, threshold=0.80)
print(f"Dropped {len(dropped_missing)} features (>80% missing)")

bureau_features, dropped_variance = remove_low_variance(bureau_features, threshold=0.01)
print(f"Dropped {len(dropped_variance)} features (low variance)")

bureau_features, dropped_corr = remove_correlated(bureau_features, threshold=0.95)
print(f"Dropped {len(dropped_corr)} features (high correlation)")

print(f"\\nAfter Level 1 filtering: {bureau_features.shape[1]} features")


Before filtering: 52 features
Dropped 0 features (>80% missing)
Dropped 11 features (low variance)
Dropped 5 features (high correlation)
\nAfter Level 1 filtering: 36 features


### Merge with Train/Val/Test


In [220]:
bureau_agg_filtered = pd.concat([bureau_agg[['SK_ID_CURR']], bureau_features], axis=1)

train_ids = df.loc[X_train.index, 'SK_ID_CURR']
val_ids = df.loc[X_val.index, 'SK_ID_CURR']

X_train_bureau = pd.merge(
    pd.concat([train_ids.reset_index(drop=True), X_train.reset_index(drop=True)], axis=1),
    bureau_agg_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

X_val_bureau = pd.merge(
    pd.concat([val_ids.reset_index(drop=True), X_val.reset_index(drop=True)], axis=1),
    bureau_agg_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

X_test_bureau = pd.merge(
    test_df[['SK_ID_CURR'] + BASELINE_FEATURES],
    bureau_agg_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

print(f"Train shape: {X_train_bureau.shape}")
print(f"Val shape: {X_val_bureau.shape}")
print(f"Test shape: {X_test_bureau.shape}")


Train shape: (246008, 72)
Val shape: (61503, 72)
Test shape: (48744, 72)


### Preliminary Training (for Level 2 Filtering)


In [221]:
%%time

X_train_lgb = X_train_bureau.copy()
X_val_lgb = X_val_bureau.copy()
X_test_lgb = X_test_bureau.copy()

for col in CATEGORICAL_FEATURES:
    if col in X_train_lgb.columns:
        X_train_lgb[col] = X_train_lgb[col].map(cat_mappings[col])
        X_val_lgb[col] = X_val_lgb[col].map(cat_mappings[col])
        X_test_lgb[col] = X_test_lgb[col].map(cat_mappings[col])

lgb_bureau = LGBMClassifier(
    #n_estimators=500,
    #learning_rate=0.05,
    #max_depth=7,
    #num_leaves=31,
    random_state=42,
    class_weight='balanced',
    verbose=-1
)

lgb_bureau.fit(
    X_train_lgb, y_train,
    eval_set=[(X_val_lgb, y_val)],
    eval_metric='auc'
)

print("LightGBM training complete")


LightGBM training complete
CPU times: total: 23.3 s
Wall time: 4.19 s


In [222]:
train_auc_lgb_prelim = roc_auc_score(y_train, lgb_bureau.predict_proba(X_train_lgb)[:, 1])
val_auc_lgb_prelim = roc_auc_score(y_val, lgb_bureau.predict_proba(X_val_lgb)[:, 1])

print("Preliminary LightGBM Results:")
print(f"  Train AUC: {train_auc_lgb_prelim:.4f}")
print(f"  Val AUC: {val_auc_lgb_prelim:.4f}")


Preliminary LightGBM Results:
  Train AUC: 0.8043
  Val AUC: 0.7645


In [223]:
%%time

xgb_bureau = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    scale_pos_weight=11.4,
    random_state=42,
    tree_method='hist',
    eval_metric='auc',
    verbosity=0
)

xgb_bureau.fit(
    X_train_lgb, y_train,
    eval_set=[(X_val_lgb, y_val)],
    verbose=False
)

train_auc_xgb_prelim = roc_auc_score(y_train, xgb_bureau.predict_proba(X_train_lgb)[:, 1])
val_auc_xgb_prelim = roc_auc_score(y_val, xgb_bureau.predict_proba(X_val_lgb)[:, 1])

print("\nPreliminary XGBoost Results:")
print(f"  Train AUC: {train_auc_xgb_prelim:.4f}")
print(f"  Val AUC: {val_auc_xgb_prelim:.4f}")



Preliminary XGBoost Results:
  Train AUC: 0.9077
  Val AUC: 0.7594
CPU times: total: 5min 3s
Wall time: 23.1 s


### Level 2 Filtering: Feature Importance Selection


In [224]:
lgb_importances = get_feature_importances(lgb_bureau, X_train_lgb.columns.tolist())
xgb_importances = get_feature_importances(xgb_bureau, X_train_lgb.columns.tolist())

print("\nTop 10 LightGBM Importances:")
print(lgb_importances.head(10))

print("\nTop 10 XGBoost Importances:")
print(xgb_importances.head(10))

importance_threshold = 20
lgb_selected_features = select_by_importance_threshold(lgb_importances, importance_threshold)
xgb_selected_features = select_by_importance_threshold(xgb_importances, importance_threshold)

common_features = list(set(lgb_selected_features) & set(xgb_selected_features))
all_selected = list(set(lgb_selected_features) | set(xgb_selected_features))

print(f"\nThreshold: {importance_threshold}")
print(f"LightGBM selected: {len(lgb_selected_features)}")
print(f"XGBoost selected: {len(xgb_selected_features)}")
print(f"Common features: {len(common_features)}")
print(f"Union features: {len(all_selected)}")

selected_features = all_selected
print(f"\nUsing UNION: {len(selected_features)} features selected")



Top 10 LightGBM Importances:
                           feature  importance
12                    EXT_SOURCE_1         245
14                    EXT_SOURCE_3         183
13                    EXT_SOURCE_2         180
4                       DAYS_BIRTH         162
1                       AMT_CREDIT         142
2                      AMT_ANNUITY         112
3                  AMT_GOODS_PRICE         101
5                    DAYS_EMPLOYED          86
43  bureau_DAYS_CREDIT_ENDDATE_max          85
7                  DAYS_ID_PUBLISH          81

Top 10 XGBoost Importances:
                     feature  importance
14              EXT_SOURCE_3    0.059793
13              EXT_SOURCE_2    0.049448
26               CODE_GENDER    0.039989
25        NAME_CONTRACT_TYPE    0.031664
30       NAME_EDUCATION_TYPE    0.031238
27              FLAG_OWN_CAR    0.023732
65     bureau_type_Microloan    0.023571
12              EXT_SOURCE_1    0.023311
66      bureau_type_Mortgage    0.020177
67  bureau_deb

### Final Model Training (with selected features)


In [225]:
X_train_selected = X_train_lgb[selected_features]
X_val_selected = X_val_lgb[selected_features]
X_test_selected = X_test_lgb[selected_features]

print(f"Selected features shape:")
print(f"  Train: {X_train_selected.shape}")
print(f"  Val: {X_val_selected.shape}")
print(f"  Test: {X_test_selected.shape}")


Selected features shape:
  Train: (246008, 48)
  Val: (61503, 48)
  Test: (48744, 48)


In [226]:
%%time

lgb_final = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    random_state=42,
    class_weight='balanced',
    verbose=-1
)

lgb_final.fit(
    X_train_selected, y_train,
    eval_set=[(X_val_selected, y_val)],
    eval_metric='auc'
)

train_auc_lgb = roc_auc_score(y_train, lgb_final.predict_proba(X_train_selected)[:, 1])
val_auc_lgb = roc_auc_score(y_val, lgb_final.predict_proba(X_val_selected)[:, 1])

print("Final LightGBM Results:")
print(f"  Train AUC: {train_auc_lgb:.4f}")
print(f"  Val AUC: {val_auc_lgb:.4f}")


Final LightGBM Results:
  Train AUC: 0.8444
  Val AUC: 0.7665
CPU times: total: 1min 11s
Wall time: 9.25 s


In [227]:
%%time

xgb_final = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    scale_pos_weight=11.4,
    random_state=42,
    tree_method='hist',
    eval_metric='auc',
    verbosity=0
)

xgb_final.fit(
    X_train_selected, y_train,
    eval_set=[(X_val_selected, y_val)],
    verbose=False
)

train_auc_xgb = roc_auc_score(y_train, xgb_final.predict_proba(X_train_selected)[:, 1])
val_auc_xgb = roc_auc_score(y_val, xgb_final.predict_proba(X_val_selected)[:, 1])

print("\nFinal XGBoost Results:")
print(f"  Train AUC: {train_auc_xgb:.4f}")
print(f"  Val AUC: {val_auc_xgb:.4f}")



Final XGBoost Results:
  Train AUC: 0.9100
  Val AUC: 0.7582
CPU times: total: 4min 9s
Wall time: 17.7 s


In [228]:
baseline_train_auc = 0.8288
baseline_val_auc = 0.7610

print("="*60)
print("QUICK EVALUATION (80/20 Split - Baseline Comparison)")
print("="*60)
print(f"{'Model':<20} {'Train AUC':<12} {'Val AUC':<12} {'Improvement':<12} {'Gap':<8}")
print("-"*60)
print(f"{'Baseline':<20} {baseline_train_auc:<12.4f} {baseline_val_auc:<12.4f} {'-':<12} {baseline_train_auc - baseline_val_auc:<8.4f}")
print(f"{'LightGBM':<20} {train_auc_lgb:<12.4f} {val_auc_lgb:<12.4f} {val_auc_lgb - baseline_val_auc:+<12.4f} {train_auc_lgb - val_auc_lgb:<8.4f}")
print(f"{'XGBoost':<20} {train_auc_xgb:<12.4f} {val_auc_xgb:<12.4f} {val_auc_xgb - baseline_val_auc:+<12.4f} {train_auc_xgb - val_auc_xgb:<8.4f}")
print("="*60)

bureau_new_features = [f for f in selected_features if f.startswith('bureau_')]
print(f"\nSelected Features:")
print(f"  Total: {len(selected_features)}")
print(f"  Baseline: {len([f for f in selected_features if not f.startswith('bureau_')])}")
print(f"  Bureau: {len(bureau_new_features)}")


QUICK EVALUATION (80/20 Split - Baseline Comparison)
Model                Train AUC    Val AUC      Improvement  Gap     
------------------------------------------------------------
Baseline             0.8288       0.7610       -            0.0678  
LightGBM             0.8444       0.7665       0.0055++++++ 0.0779  
XGBoost              0.9100       0.7582       -0.0028+++++ 0.1517  

Selected Features:
  Total: 48
  Baseline: 24
  Bureau: 24


### Cross-Validation (5-Fold StratifiedKFold)


In [229]:
%%time

X_full = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
y_full = pd.concat([y_train, y_val], axis=0).reset_index(drop=True)

X_full_merged = pd.merge(
    pd.concat([df.loc[X_full.index, 'SK_ID_CURR'].reset_index(drop=True), X_full], axis=1),
    bureau_agg_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

X_full_selected = X_full_merged[selected_features].copy()
for col in CATEGORICAL_FEATURES:
    if col in X_full_selected.columns:
        X_full_selected[col] = X_full_selected[col].map(cat_mappings[col])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgb_cv_scores = []
xgb_cv_scores = []

print("Running 5-Fold Cross-Validation...")
print("-" * 60)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_full_selected, y_full), 1):
    X_cv_train, X_cv_val = X_full_selected.iloc[train_idx], X_full_selected.iloc[val_idx]
    y_cv_train, y_cv_val = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    lgb_cv = LGBMClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=7,
        num_leaves=31, random_state=42, class_weight='balanced', verbose=-1
    )
    lgb_cv.fit(X_cv_train, y_cv_train, eval_set=[(X_cv_val, y_cv_val)], eval_metric='auc')
    lgb_val_auc = roc_auc_score(y_cv_val, lgb_cv.predict_proba(X_cv_val)[:, 1])
    lgb_cv_scores.append(lgb_val_auc)
    
    xgb_cv = XGBClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=7,
        scale_pos_weight=11.4, random_state=42, tree_method='hist',
        eval_metric='auc', verbosity=0
    )
    xgb_cv.fit(X_cv_train, y_cv_train, eval_set=[(X_cv_val, y_cv_val)], verbose=False)
    xgb_val_auc = roc_auc_score(y_cv_val, xgb_cv.predict_proba(X_cv_val)[:, 1])
    xgb_cv_scores.append(xgb_val_auc)
    
    print(f"Fold {fold}: LightGBM={lgb_val_auc:.4f}, XGBoost={xgb_val_auc:.4f}")

print("-" * 60)
print(f"\nLightGBM CV: {np.mean(lgb_cv_scores):.4f} ± {np.std(lgb_cv_scores):.4f}")
print(f"XGBoost CV:  {np.mean(xgb_cv_scores):.4f} ± {np.std(xgb_cv_scores):.4f}")
print(f"\nBaseline Val AUC: {baseline_val_auc:.4f}")
print(f"LightGBM Improvement: {np.mean(lgb_cv_scores) - baseline_val_auc:+.4f}")
print(f"XGBoost Improvement:  {np.mean(xgb_cv_scores) - baseline_val_auc:+.4f}")


Running 5-Fold Cross-Validation...
------------------------------------------------------------
Fold 1: LightGBM=0.7583, XGBoost=0.7490
Fold 2: LightGBM=0.7515, XGBoost=0.7428
Fold 3: LightGBM=0.7524, XGBoost=0.7433
Fold 4: LightGBM=0.7531, XGBoost=0.7444
Fold 5: LightGBM=0.7559, XGBoost=0.7471
------------------------------------------------------------

LightGBM CV: 0.7543 ± 0.0025
XGBoost CV:  0.7453 ± 0.0024

Baseline Val AUC: 0.7610
LightGBM Improvement: -0.0067
XGBoost Improvement:  -0.0157
CPU times: total: 24min 24s
Wall time: 2min 3s


### Save Processed Data

In [None]:
output_dir = '../../data/processed/phase1_bureau'
os.makedirs(output_dir, exist_ok=True)

train_ids = df.loc[X_train.index, 'SK_ID_CURR'].reset_index(drop=True)
val_ids = df.loc[X_val.index, 'SK_ID_CURR'].reset_index(drop=True)
test_ids = test_df['SK_ID_CURR'].reset_index(drop=True)

pd.concat([train_ids, X_train_selected.reset_index(drop=True)], axis=1).to_csv(
    f'{output_dir}/X_train.csv', index=False
)
pd.DataFrame(y_train).to_csv(f'{output_dir}/y_train.csv', index=False)

pd.concat([val_ids, X_val_selected.reset_index(drop=True)], axis=1).to_csv(
    f'{output_dir}/X_val.csv', index=False
)
pd.DataFrame(y_val).to_csv(f'{output_dir}/y_val.csv', index=False)

pd.concat([test_ids, X_test_selected.reset_index(drop=True)], axis=1).to_csv(
    f'{output_dir}/X_test.csv', index=False
)

feature_metadata = {
    'phase': 'phase1_bureau',
    'n_features_created': 52,
    'n_features_after_level1': 36,
    'n_features_final': len(selected_features),
    'feature_list': selected_features,
    'baseline_features': [f for f in selected_features if not f.startswith('bureau_')],
    'bureau_features': bureau_new_features,
    'dropped_missing': dropped_missing,
    'dropped_variance': dropped_variance,
    'dropped_corr': dropped_corr,
    'importance_threshold': importance_threshold,
    'quick_eval': {
        'lgb_val_auc': float(val_auc_lgb),
        'xgb_val_auc': float(val_auc_xgb)
    },
    'cv_eval': {
        'lgb_cv_mean': float(np.mean(lgb_cv_scores)),
        'lgb_cv_std': float(np.std(lgb_cv_scores)),
        'xgb_cv_mean': float(np.mean(xgb_cv_scores)),
        'xgb_cv_std': float(np.std(xgb_cv_scores))
    }
}

with open(f'{output_dir}/feature_metadata.json', 'w') as f:
    json.dump(feature_metadata, f, indent=2)

print(f"Saved processed data to {output_dir}/")
print(f"  - train_features.csv: {X_train_selected.shape}")
print(f"  - val_features.csv: {X_val_selected.shape}")
print(f"  - test_features.csv: {X_test_selected.shape}")
print(f"  - feature_metadata.json: {len(selected_features)} features")


Saved processed data to ../../data/processed/phase1_bureau/
  - train_features.csv: (246008, 48)
  - val_features.csv: (61503, 48)
  - test_features.csv: (48744, 48)
  - feature_metadata.json: 48 features


### MLflow Tracking


In [231]:
mlflow_tracking_uri = os.path.join(os.getcwd(), 'mlruns')
mlflow.set_tracking_uri(f"file:///{mlflow_tracking_uri}")
mlflow.set_experiment("feature_engineering")

with mlflow.start_run(run_name="phase1_bureau_lightgbm"):
    mlflow.log_param("phase", "bureau")
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 7)
    mlflow.log_param("n_baseline_features", len(BASELINE_FEATURES))
    mlflow.log_param("n_bureau_features", len(bureau_new_features))
    mlflow.log_param("n_features_created", 52)
    mlflow.log_param("n_features_after_level1", 36)
    mlflow.log_param("n_features_final", len(selected_features))
    mlflow.log_param("n_dropped_missing", len(dropped_missing))
    mlflow.log_param("n_dropped_variance", len(dropped_variance))
    mlflow.log_param("n_dropped_correlation", len(dropped_corr))
    mlflow.log_param("importance_threshold", importance_threshold)
    
    mlflow.log_metric("quick_train_auc", train_auc_lgb)
    mlflow.log_metric("quick_val_auc", val_auc_lgb)
    mlflow.log_metric("cv_mean_auc", np.mean(lgb_cv_scores))
    mlflow.log_metric("cv_std_auc", np.std(lgb_cv_scores))
    mlflow.log_metric("baseline_val_auc", baseline_val_auc)
    mlflow.log_metric("improvement_quick", val_auc_lgb - baseline_val_auc)
    mlflow.log_metric("improvement_cv", np.mean(lgb_cv_scores) - baseline_val_auc)
    mlflow.log_metric("train_val_gap", train_auc_lgb - val_auc_lgb)
    
    X_sample = X_train_selected.iloc[:5].fillna(0)
    y_sample = y_train.iloc[:5]
    signature = mlflow.models.infer_signature(X_sample, y_sample)
    
    mlflow.sklearn.log_model(lgb_final, "model", signature=signature, input_example=X_sample)
    
    lgb_final_importances = get_feature_importances(lgb_final, selected_features)
    lgb_final_importances.to_csv('feature_importance_lgb.csv', index=False)
    mlflow.log_artifact('feature_importance_lgb.csv')
    os.remove('feature_importance_lgb.csv')
    
    with open('selected_features.json', 'w') as f:
        json.dump({'features': selected_features}, f, indent=2)
    mlflow.log_artifact('selected_features.json')
    os.remove('selected_features.json')
    
    with open('dropped_features.json', 'w') as f:
        json.dump({
            'dropped_missing': dropped_missing,
            'dropped_variance': dropped_variance,
            'dropped_correlation': dropped_corr
        }, f, indent=2)
    mlflow.log_artifact('dropped_features.json')
    os.remove('dropped_features.json')

print("Logged LightGBM to MLflow")




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged LightGBM to MLflow


In [232]:
with mlflow.start_run(run_name="phase1_bureau_xgboost"):
    mlflow.log_param("phase", "bureau")
    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 7)
    mlflow.log_param("n_baseline_features", len(BASELINE_FEATURES))
    mlflow.log_param("n_bureau_features", len(bureau_new_features))
    mlflow.log_param("n_features_created", 52)
    mlflow.log_param("n_features_after_level1", 36)
    mlflow.log_param("n_features_final", len(selected_features))
    mlflow.log_param("n_dropped_missing", len(dropped_missing))
    mlflow.log_param("n_dropped_variance", len(dropped_variance))
    mlflow.log_param("n_dropped_correlation", len(dropped_corr))
    mlflow.log_param("importance_threshold", importance_threshold)
    
    mlflow.log_metric("quick_train_auc", train_auc_xgb)
    mlflow.log_metric("quick_val_auc", val_auc_xgb)
    mlflow.log_metric("cv_mean_auc", np.mean(xgb_cv_scores))
    mlflow.log_metric("cv_std_auc", np.std(xgb_cv_scores))
    mlflow.log_metric("baseline_val_auc", baseline_val_auc)
    mlflow.log_metric("improvement_quick", val_auc_xgb - baseline_val_auc)
    mlflow.log_metric("improvement_cv", np.mean(xgb_cv_scores) - baseline_val_auc)
    mlflow.log_metric("train_val_gap", train_auc_xgb - val_auc_xgb)
    
    mlflow.sklearn.log_model(xgb_final, "model", signature=signature, input_example=X_sample)
    
    xgb_final_importances = get_feature_importances(xgb_final, selected_features)
    xgb_final_importances.to_csv('feature_importance_xgb.csv', index=False)
    mlflow.log_artifact('feature_importance_xgb.csv')
    os.remove('feature_importance_xgb.csv')
    
    with open('selected_features.json', 'w') as f:
        json.dump({'features': selected_features}, f, indent=2)
    mlflow.log_artifact('selected_features.json')
    os.remove('selected_features.json')
    
    with open('dropped_features.json', 'w') as f:
        json.dump({
            'dropped_missing': dropped_missing,
            'dropped_variance': dropped_variance,
            'dropped_correlation': dropped_corr
        }, f, indent=2)
    mlflow.log_artifact('dropped_features.json')
    os.remove('dropped_features.json')

print("Logged XGBoost to MLflow")




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged XGBoost to MLflow


### Generate Kaggle Submissions


In [233]:
sample_submission = pd.read_csv('../../data/raw/sample_submission.csv')

lgb_preds = lgb_final.predict_proba(X_test_selected)[:, 1]
xgb_preds = xgb_final.predict_proba(X_test_selected)[:, 1]

submission_lgb = sample_submission.copy()
submission_lgb['TARGET'] = lgb_preds

submission_xgb = sample_submission.copy()
submission_xgb['TARGET'] = xgb_preds

os.makedirs('../../data/submissions', exist_ok=True)

submission_lgb.to_csv('../../data/submissions/phase1_bureau_lightgbm_v1.csv', index=False)
submission_xgb.to_csv('../../data/submissions/phase1_bureau_xgboost_v1.csv', index=False)

print("Submission files created:")
print("  phase1_bureau_lightgbm_v1.csv")
print("  phase1_bureau_xgboost_v1.csv")

print(f"\nLightGBM predictions - Min: {lgb_preds.min():.4f}, Max: {lgb_preds.max():.4f}, Mean: {lgb_preds.mean():.4f}")
print(f"XGBoost predictions  - Min: {xgb_preds.min():.4f}, Max: {xgb_preds.max():.4f}, Mean: {xgb_preds.mean():.4f}")


Submission files created:
  phase1_bureau_lightgbm_v1.csv
  phase1_bureau_xgboost_v1.csv

LightGBM predictions - Min: 0.0029, Max: 0.9655, Mean: 0.3867
XGBoost predictions  - Min: 0.0009, Max: 0.9774, Mean: 0.3486


### Save Models


In [234]:
output_model_dir = '../../models/phase1_bureau'
os.makedirs(output_model_dir, exist_ok=True)

model_artifacts_lgb = {
    'model': lgb_final,
    'selected_features': selected_features,
    'categorical_features': CATEGORICAL_FEATURES,
    'cat_mappings': cat_mappings,
    'quick_val_auc': val_auc_lgb,
    'cv_mean_auc': np.mean(lgb_cv_scores),
    'cv_std_auc': np.std(lgb_cv_scores)
}

model_artifacts_xgb = {
    'model': xgb_final,
    'selected_features': selected_features,
    'categorical_features': CATEGORICAL_FEATURES,
    'cat_mappings': cat_mappings,
    'quick_val_auc': val_auc_xgb,
    'cv_mean_auc': np.mean(xgb_cv_scores),
    'cv_std_auc': np.std(xgb_cv_scores)
}

joblib.dump(model_artifacts_lgb, f'{output_model_dir}/lightgbm_v1.pkl')
joblib.dump(model_artifacts_xgb, f'{output_model_dir}/xgboost_v1.pkl')

print(f"Models saved to {output_model_dir}/")
print(f"  - lightgbm_v1.pkl")
print(f"  - xgboost_v1.pkl")


Models saved to ../../models/phase1_bureau/
  - lightgbm_v1.pkl
  - xgboost_v1.pkl


### Phase 1 Summary

**Strategy:** Bureau credit history aggregations with Level 1 statistical filtering and Level 2 model-based importance selection.

**Features Created:** 52 bureau features (DAYS_CREDIT patterns, AMT_CREDIT_SUM aggregations, CREDIT_ACTIVE/TYPE categories, derived ratios). Level 1 filtering: 52 → 36. Level 2 selection: Final 48 features (24 baseline + 24 bureau).

**Results:** LightGBM Val 0.7665, CV 0.7543. XGBoost Val 0.7582, CV 0.7453. Kaggle: LightGBM Public 0.74795, XGBoost Private 0.74736/Public 0.73786.

**Top Contributors:** DAYS_CREDIT patterns, AMT_CREDIT_SUM aggregations, debt_credit_ratio, active_closed_ratio, bureau_type_Microloan.

**Saved:** Processed data and models to phase1_bureau/, tracked in MLflow, submissions generated.


---

## Phase 2: Bureau Balance Features

**Base:** Phase 1 selected features (48)

Add monthly credit status patterns from bureau_balance.csv


### Load Phase 1 Base Features


In [235]:
with open('../../data/processed/phase1_bureau/feature_metadata.json', 'r') as f:
    phase1_metadata = json.load(f)

phase1_selected_features = phase1_metadata['feature_list']
phase1_val_auc_lgb = phase1_metadata['cv_eval']['lgb_cv_mean']
phase1_val_auc_xgb = phase1_metadata['cv_eval']['xgb_cv_mean']

print(f"Phase 1 Base Features: {len(phase1_selected_features)}")
print(f"Phase 1 LightGBM CV AUC: {phase1_val_auc_lgb:.4f}")
print(f"Phase 1 XGBoost CV AUC: {phase1_val_auc_xgb:.4f}")


Phase 1 Base Features: 48
Phase 1 LightGBM CV AUC: 0.7543
Phase 1 XGBoost CV AUC: 0.7453


### Bureau Balance Aggregations


In [236]:
bb = pd.read_csv('../../data/raw/bureau_balance.csv')
print(f"Bureau Balance shape: {bb.shape}")
print(f"Unique SK_ID_BUREAU: {bb['SK_ID_BUREAU'].nunique()}")

bb_agg = bb.groupby('SK_ID_BUREAU').agg({
    'MONTHS_BALANCE': ['min', 'max', 'size']
}).reset_index()
bb_agg.columns = ['SK_ID_BUREAU', 'bb_months_balance_min', 'bb_months_balance_max', 'bb_months_balance_size']

status_dummies = pd.get_dummies(bb['STATUS'], prefix='bb_status')
bb_status = pd.concat([bb[['SK_ID_BUREAU']], status_dummies], axis=1)
bb_status_agg = bb_status.groupby('SK_ID_BUREAU').sum().reset_index()

bb_agg = bb_agg.merge(bb_status_agg, on='SK_ID_BUREAU', how='left')

bureau_bb = bureau[['SK_ID_CURR', 'SK_ID_BUREAU']].merge(bb_agg, on='SK_ID_BUREAU', how='left')

bb_features = bureau_bb.drop(columns=['SK_ID_BUREAU']).groupby('SK_ID_CURR').agg({
    'bb_months_balance_min': ['min', 'mean'],
    'bb_months_balance_max': ['max', 'mean'],
    'bb_months_balance_size': ['sum', 'mean', 'max']
}).reset_index()

bb_features.columns = ['SK_ID_CURR'] + ['_'.join(col).strip() for col in bb_features.columns[1:]]

status_cols = [col for col in bureau_bb.columns if col.startswith('bb_status')]
if status_cols:
    bb_status_final = bureau_bb[['SK_ID_CURR'] + status_cols].groupby('SK_ID_CURR').sum().reset_index()
    bb_features = bb_features.merge(bb_status_final, on='SK_ID_CURR', how='left')

print(f"Bureau Balance features created: {bb_features.shape[1] - 1}")


Bureau Balance shape: (27299925, 3)
Unique SK_ID_BUREAU: 817395
Bureau Balance features created: 15


### Level 1 Filtering


In [237]:
bb_features_only = bb_features.drop(columns=['SK_ID_CURR']).select_dtypes(include=[np.number])

print(f"Before filtering: {bb_features_only.shape[1]} features")

bb_features_only, bb_dropped_missing = filter_by_missing(bb_features_only, threshold=0.80)
print(f"Dropped {len(bb_dropped_missing)} features (>80% missing)")

bb_features_only, bb_dropped_variance = remove_low_variance(bb_features_only, threshold=0.01)
print(f"Dropped {len(bb_dropped_variance)} features (low variance)")

bb_features_only, bb_dropped_corr = remove_correlated(bb_features_only, threshold=0.95)
print(f"Dropped {len(bb_dropped_corr)} features (high correlation)")

print(f"\nAfter Level 1 filtering: {bb_features_only.shape[1]} features")


Before filtering: 15 features
Dropped 0 features (>80% missing)
Dropped 0 features (low variance)
Dropped 0 features (high correlation)

After Level 1 filtering: 15 features


### Merge with Phase 1 Features + Train/Val/Test


In [238]:
bb_features_filtered = pd.concat([bb_features[['SK_ID_CURR']], bb_features_only], axis=1)

X_train_phase2 = X_train_bureau[phase1_selected_features].copy()
X_val_phase2 = X_val_bureau[phase1_selected_features].copy()

X_test_with_bureau = pd.merge(
    test_df[['SK_ID_CURR'] + BASELINE_FEATURES],
    bureau_agg_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])
X_test_phase2 = X_test_with_bureau[phase1_selected_features].copy()

train_ids = df.loc[X_train.index, 'SK_ID_CURR'].reset_index(drop=True)
val_ids = df.loc[X_val.index, 'SK_ID_CURR'].reset_index(drop=True)
test_ids = test_df['SK_ID_CURR'].reset_index(drop=True)

X_train_bb = pd.merge(
    pd.concat([train_ids, X_train_phase2.reset_index(drop=True)], axis=1),
    bb_features_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

X_val_bb = pd.merge(
    pd.concat([val_ids, X_val_phase2.reset_index(drop=True)], axis=1),
    bb_features_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

X_test_bb = pd.merge(
    pd.concat([test_ids, X_test_phase2.reset_index(drop=True)], axis=1),
    bb_features_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

print(f"Train shape: {X_train_bb.shape}")
print(f"Val shape: {X_val_bb.shape}")
print(f"Test shape: {X_test_bb.shape}")


Train shape: (246008, 63)
Val shape: (61503, 63)
Test shape: (48744, 63)


### Preliminary Training (for Level 2 Filtering)


In [239]:
%%time

X_train_lgb_p2 = X_train_bb.copy()
X_val_lgb_p2 = X_val_bb.copy()
X_test_lgb_p2 = X_test_bb.copy()

for col in CATEGORICAL_FEATURES:
    if col in X_train_lgb_p2.columns:
        X_train_lgb_p2[col] = X_train_lgb_p2[col].map(cat_mappings[col])
        X_val_lgb_p2[col] = X_val_lgb_p2[col].map(cat_mappings[col])
        X_test_lgb_p2[col] = X_test_lgb_p2[col].map(cat_mappings[col])

lgb_bb_prelim = LGBMClassifier(
    random_state=42,
    class_weight='balanced',
    verbose=-1
)

lgb_bb_prelim.fit(
    X_train_lgb_p2, y_train,
    eval_set=[(X_val_lgb_p2, y_val)],
    eval_metric='auc'
)

train_auc_lgb_p2_prelim = roc_auc_score(y_train, lgb_bb_prelim.predict_proba(X_train_lgb_p2)[:, 1])
val_auc_lgb_p2_prelim = roc_auc_score(y_val, lgb_bb_prelim.predict_proba(X_val_lgb_p2)[:, 1])

print("Preliminary LightGBM Results:")
print(f"  Train AUC: {train_auc_lgb_p2_prelim:.4f}")
print(f"  Val AUC: {val_auc_lgb_p2_prelim:.4f}")


Preliminary LightGBM Results:
  Train AUC: 0.8041
  Val AUC: 0.7640
CPU times: total: 25.2 s
Wall time: 3.93 s


In [240]:
%%time

xgb_bb_prelim = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    scale_pos_weight=11.4,
    random_state=42,
    tree_method='hist',
    eval_metric='auc',
    verbosity=0
)

xgb_bb_prelim.fit(
    X_train_lgb_p2, y_train,
    eval_set=[(X_val_lgb_p2, y_val)],
    verbose=False
)

train_auc_xgb_p2_prelim = roc_auc_score(y_train, xgb_bb_prelim.predict_proba(X_train_lgb_p2)[:, 1])
val_auc_xgb_p2_prelim = roc_auc_score(y_val, xgb_bb_prelim.predict_proba(X_val_lgb_p2)[:, 1])

print("\nPreliminary XGBoost Results:")
print(f"  Train AUC: {train_auc_xgb_p2_prelim:.4f}")
print(f"  Val AUC: {val_auc_xgb_p2_prelim:.4f}")



Preliminary XGBoost Results:
  Train AUC: 0.9093
  Val AUC: 0.7582
CPU times: total: 5min 15s
Wall time: 23.6 s


### Level 2 Filtering: Feature Importance Selection


In [241]:
lgb_importances_p2 = get_feature_importances(lgb_bb_prelim, X_train_lgb_p2.columns.tolist())
xgb_importances_p2 = get_feature_importances(xgb_bb_prelim, X_train_lgb_p2.columns.tolist())

print("\nTop 10 LightGBM Importances:")
print(lgb_importances_p2.head(10))

print("\nTop 10 XGBoost Importances:")
print(xgb_importances_p2.head(10))

importance_threshold_p2 = 20
lgb_selected_p2 = select_by_importance_threshold(lgb_importances_p2, importance_threshold_p2)
xgb_selected_p2 = select_by_importance_threshold(xgb_importances_p2, importance_threshold_p2)

common_p2 = list(set(lgb_selected_p2) & set(xgb_selected_p2))
all_selected_p2 = list(set(lgb_selected_p2) | set(xgb_selected_p2))

print(f"\nThreshold: {importance_threshold_p2}")
print(f"LightGBM selected: {len(lgb_selected_p2)}")
print(f"XGBoost selected: {len(xgb_selected_p2)}")
print(f"Common features: {len(common_p2)}")
print(f"Union features: {len(all_selected_p2)}")

selected_features_p2 = all_selected_p2
print(f"\nUsing UNION: {len(selected_features_p2)} features selected")



Top 10 LightGBM Importances:
                           feature  importance
7                     EXT_SOURCE_1         233
31                    EXT_SOURCE_3         179
32                    EXT_SOURCE_2         175
27                      DAYS_BIRTH         157
42                      AMT_CREDIT         149
37                     AMT_ANNUITY         127
8                  AMT_GOODS_PRICE         116
1                    DAYS_EMPLOYED          93
29  bureau_DAYS_CREDIT_ENDDATE_max          87
9                  DAYS_ID_PUBLISH          79

Top 10 XGBoost Importances:
                     feature  importance
31              EXT_SOURCE_3    0.065170
32              EXT_SOURCE_2    0.057020
36               CODE_GENDER    0.046176
25       NAME_EDUCATION_TYPE    0.034258
13        NAME_CONTRACT_TYPE    0.033505
7               EXT_SOURCE_1    0.025308
15              FLAG_OWN_CAR    0.024942
16     bureau_type_Microloan    0.020835
43          NAME_INCOME_TYPE    0.020273
40  DEF_60_CNT

### Final Model Training + Quick Evaluation + Cross-Validation + Saves


In [242]:
X_train_selected_p2 = X_train_lgb_p2[selected_features_p2]
X_val_selected_p2 = X_val_lgb_p2[selected_features_p2]
X_test_selected_p2 = X_test_lgb_p2[selected_features_p2]

print(f"Selected features shape:")
print(f"  Train: {X_train_selected_p2.shape}")
print(f"  Val: {X_val_selected_p2.shape}")
print(f"  Test: {X_test_selected_p2.shape}")


Selected features shape:
  Train: (246008, 48)
  Val: (61503, 48)
  Test: (48744, 48)


In [243]:
%%time

lgb_final_p2 = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    random_state=42,
    class_weight='balanced',
    verbose=-1
)

lgb_final_p2.fit(
    X_train_selected_p2, y_train,
    eval_set=[(X_val_selected_p2, y_val)],
    eval_metric='auc'
)

train_auc_lgb_p2 = roc_auc_score(y_train, lgb_final_p2.predict_proba(X_train_selected_p2)[:, 1])
val_auc_lgb_p2 = roc_auc_score(y_val, lgb_final_p2.predict_proba(X_val_selected_p2)[:, 1])

print("Final LightGBM Results:")
print(f"  Train AUC: {train_auc_lgb_p2:.4f}")
print(f"  Val AUC: {val_auc_lgb_p2:.4f}")


Final LightGBM Results:
  Train AUC: 0.8448
  Val AUC: 0.7664
CPU times: total: 1min 14s
Wall time: 9.62 s


In [244]:
%%time

xgb_final_p2 = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    scale_pos_weight=11.4,
    random_state=42,
    tree_method='hist',
    eval_metric='auc',
    verbosity=0
)

xgb_final_p2.fit(
    X_train_selected_p2, y_train,
    eval_set=[(X_val_selected_p2, y_val)],
    verbose=False
)

train_auc_xgb_p2 = roc_auc_score(y_train, xgb_final_p2.predict_proba(X_train_selected_p2)[:, 1])
val_auc_xgb_p2 = roc_auc_score(y_val, xgb_final_p2.predict_proba(X_val_selected_p2)[:, 1])

print("\nFinal XGBoost Results:")
print(f"  Train AUC: {train_auc_xgb_p2:.4f}")
print(f"  Val AUC: {val_auc_xgb_p2:.4f}")



Final XGBoost Results:
  Train AUC: 0.9100
  Val AUC: 0.7591
CPU times: total: 4min 22s
Wall time: 20.2 s


In [245]:
print("="*60)
print("QUICK EVALUATION (80/20 Split - Phase 1 Comparison)")
print("="*60)
print(f"{'Model':<20} {'Train AUC':<12} {'Val AUC':<12} {'Improvement':<12} {'Gap':<8}")
print("-"*60)
print(f"{'Phase 1 LightGBM':<20} {'-':<12} {phase1_val_auc_lgb:<12.4f} {'-':<12} {'-':<8}")
print(f"{'Phase 2 LightGBM':<20} {train_auc_lgb_p2:<12.4f} {val_auc_lgb_p2:<12.4f} {val_auc_lgb_p2 - phase1_val_auc_lgb:+<12.4f} {train_auc_lgb_p2 - val_auc_lgb_p2:<8.4f}")
print(f"{'Phase 1 XGBoost':<20} {'-':<12} {phase1_val_auc_xgb:<12.4f} {'-':<12} {'-':<8}")
print(f"{'Phase 2 XGBoost':<20} {train_auc_xgb_p2:<12.4f} {val_auc_xgb_p2:<12.4f} {val_auc_xgb_p2 - phase1_val_auc_xgb:+<12.4f} {train_auc_xgb_p2 - val_auc_xgb_p2:<8.4f}")
print("="*60)

bb_new_features = [f for f in selected_features_p2 if f.startswith('bb_')]
print(f"\nSelected Features:")
print(f"  Total: {len(selected_features_p2)}")
print(f"  Phase 1: {len([f for f in selected_features_p2 if f in phase1_selected_features])}")
print(f"  Bureau Balance (new): {len(bb_new_features)}")


QUICK EVALUATION (80/20 Split - Phase 1 Comparison)
Model                Train AUC    Val AUC      Improvement  Gap     
------------------------------------------------------------
Phase 1 LightGBM     -            0.7543       -            -       
Phase 2 LightGBM     0.8448       0.7664       0.0122++++++ 0.0784  
Phase 1 XGBoost      -            0.7453       -            -       
Phase 2 XGBoost      0.9100       0.7591       0.0138++++++ 0.1510  

Selected Features:
  Total: 48
  Phase 1: 46
  Bureau Balance (new): 2


### Cross-Validation (5-Fold StratifiedKFold)


In [246]:
%%time

X_full = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
y_full = pd.concat([y_train, y_val], axis=0).reset_index(drop=True)

X_full_with_bureau = pd.merge(
    pd.concat([df.loc[X_full.index, 'SK_ID_CURR'].reset_index(drop=True), X_full], axis=1),
    bureau_agg_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

X_full_phase2 = X_full_with_bureau[phase1_selected_features].copy()

full_ids = pd.concat([df.loc[X_train.index, 'SK_ID_CURR'], df.loc[X_val.index, 'SK_ID_CURR']]).reset_index(drop=True)
X_full_bb = pd.merge(
    pd.concat([full_ids, X_full_phase2], axis=1),
    bb_features_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

X_full_selected_p2 = X_full_bb[selected_features_p2].copy()
for col in CATEGORICAL_FEATURES:
    if col in X_full_selected_p2.columns:
        X_full_selected_p2[col] = X_full_selected_p2[col].map(cat_mappings[col])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgb_cv_scores_p2 = []
xgb_cv_scores_p2 = []

print("-" * 60)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_full_selected_p2, y_full), 1):
    X_cv_train, X_cv_val = X_full_selected_p2.iloc[train_idx], X_full_selected_p2.iloc[val_idx]
    y_cv_train, y_cv_val = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    lgb_cv = LGBMClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=7,
        num_leaves=31, random_state=42, class_weight='balanced', verbose=-1
    )
    lgb_cv.fit(X_cv_train, y_cv_train, eval_set=[(X_cv_val, y_cv_val)], eval_metric='auc')
    lgb_val_auc = roc_auc_score(y_cv_val, lgb_cv.predict_proba(X_cv_val)[:, 1])
    lgb_cv_scores_p2.append(lgb_val_auc)
    
    xgb_cv = XGBClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=7,
        scale_pos_weight=11.4, random_state=42, tree_method='hist',
        eval_metric='auc', verbosity=0
    )
    xgb_cv.fit(X_cv_train, y_cv_train, eval_set=[(X_cv_val, y_cv_val)], verbose=False)
    xgb_val_auc = roc_auc_score(y_cv_val, xgb_cv.predict_proba(X_cv_val)[:, 1])
    xgb_cv_scores_p2.append(xgb_val_auc)
    
    print(f"Fold {fold}: LightGBM={lgb_val_auc:.4f}, XGBoost={xgb_val_auc:.4f}")

print("-" * 60)
print(f"\nLightGBM CV: {np.mean(lgb_cv_scores_p2):.4f} ± {np.std(lgb_cv_scores_p2):.4f}")
print(f"XGBoost CV:  {np.mean(xgb_cv_scores_p2):.4f} ± {np.std(xgb_cv_scores_p2):.4f}")
print(f"\nPhase 1 LightGBM CV: {phase1_val_auc_lgb:.4f}")
print(f"Phase 1 XGBoost CV:  {phase1_val_auc_xgb:.4f}")
print(f"\nLightGBM Improvement: {np.mean(lgb_cv_scores_p2) - phase1_val_auc_lgb:+.4f}")
print(f"XGBoost Improvement:  {np.mean(xgb_cv_scores_p2) - phase1_val_auc_xgb:+.4f}")


------------------------------------------------------------
Fold 1: LightGBM=0.7583, XGBoost=0.7483
Fold 2: LightGBM=0.7520, XGBoost=0.7433
Fold 3: LightGBM=0.7534, XGBoost=0.7435
Fold 4: LightGBM=0.7542, XGBoost=0.7471
Fold 5: LightGBM=0.7566, XGBoost=0.7470
------------------------------------------------------------

LightGBM CV: 0.7549 ± 0.0022
XGBoost CV:  0.7458 ± 0.0021

Phase 1 LightGBM CV: 0.7543
Phase 1 XGBoost CV:  0.7453

LightGBM Improvement: +0.0006
XGBoost Improvement:  +0.0005
CPU times: total: 25min 48s
Wall time: 2min 13s


### Save Processed Data


In [None]:
output_dir_p2 = '../../data/processed/phase2_bureau_balance'
os.makedirs(output_dir_p2, exist_ok=True)

train_ids = df.loc[X_train.index, 'SK_ID_CURR'].reset_index(drop=True)
val_ids = df.loc[X_val.index, 'SK_ID_CURR'].reset_index(drop=True)
test_ids = test_df['SK_ID_CURR'].reset_index(drop=True)

pd.concat([train_ids, X_train_selected_p2.reset_index(drop=True)], axis=1).to_csv(
    f'{output_dir_p2}/X_train.csv', index=False
)
pd.DataFrame(y_train).to_csv(f'{output_dir_p2}/y_train.csv', index=False)

pd.concat([val_ids, X_val_selected_p2.reset_index(drop=True)], axis=1).to_csv(
    f'{output_dir_p2}/X_val.csv', index=False
)
pd.DataFrame(y_val).to_csv(f'{output_dir_p2}/y_val.csv', index=False)

pd.concat([test_ids, X_test_selected_p2.reset_index(drop=True)], axis=1).to_csv(
    f'{output_dir_p2}/X_test.csv', index=False
)

feature_metadata_p2 = {
    'phase': 'phase2_bureau_balance',
    'base_phase': 'phase1_bureau',
    'n_phase1_features': len(phase1_selected_features),
    'n_bb_features_created': bb_features.shape[1] - 1,
    'n_bb_features_after_level1': bb_features_only.shape[1],
    'n_features_final': len(selected_features_p2),
    'feature_list': selected_features_p2,
    'phase1_features': [f for f in selected_features_p2 if f in phase1_selected_features],
    'bb_features_new': bb_new_features,
    'dropped_missing': bb_dropped_missing,
    'dropped_variance': bb_dropped_variance,
    'dropped_corr': bb_dropped_corr,
    'importance_threshold': importance_threshold_p2,
    'quick_eval': {
        'lgb_val_auc': float(val_auc_lgb_p2),
        'xgb_val_auc': float(val_auc_xgb_p2),
        'lgb_improvement': float(val_auc_lgb_p2 - phase1_val_auc_lgb),
        'xgb_improvement': float(val_auc_xgb_p2 - phase1_val_auc_xgb)
    },
    'cv_eval': {
        'lgb_cv_mean': float(np.mean(lgb_cv_scores_p2)),
        'lgb_cv_std': float(np.std(lgb_cv_scores_p2)),
        'xgb_cv_mean': float(np.mean(xgb_cv_scores_p2)),
        'xgb_cv_std': float(np.std(xgb_cv_scores_p2)),
        'lgb_improvement_cv': float(np.mean(lgb_cv_scores_p2) - phase1_val_auc_lgb),
        'xgb_improvement_cv': float(np.mean(xgb_cv_scores_p2) - phase1_val_auc_xgb)
    }
}

with open(f'{output_dir_p2}/feature_metadata.json', 'w') as f:
    json.dump(feature_metadata_p2, f, indent=2)

print(f"Saved processed data to {output_dir_p2}/")
print(f"  - train_features.csv: {X_train_selected_p2.shape}")
print(f"  - val_features.csv: {X_val_selected_p2.shape}")
print(f"  - test_features.csv: {X_test_selected_p2.shape}")
print(f"  - feature_metadata.json: {len(selected_features_p2)} features")


Saved processed data to ../../data/processed/phase2_bureau_balance/
  - train_features.csv: (246008, 48)
  - val_features.csv: (61503, 48)
  - test_features.csv: (48744, 48)
  - feature_metadata.json: 48 features


### MLflow Tracking


In [248]:
mlflow_tracking_uri = os.path.join(os.getcwd(), 'mlruns')
mlflow.set_tracking_uri(f"file:///{mlflow_tracking_uri}")
mlflow.set_experiment("feature_engineering")

with mlflow.start_run(run_name="phase2_bureau_balance_lightgbm"):
    mlflow.log_param("phase", "bureau_balance")
    mlflow.log_param("base_phase", "phase1_bureau")
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 7)
    mlflow.log_param("n_phase1_features", len(phase1_selected_features))
    mlflow.log_param("n_bb_features_new", len(bb_new_features))
    mlflow.log_param("n_features_created", bb_features.shape[1] - 1)
    mlflow.log_param("n_features_after_level1", bb_features_only.shape[1])
    mlflow.log_param("n_features_final", len(selected_features_p2))
    mlflow.log_param("n_dropped_missing", len(bb_dropped_missing))
    mlflow.log_param("n_dropped_variance", len(bb_dropped_variance))
    mlflow.log_param("n_dropped_correlation", len(bb_dropped_corr))
    mlflow.log_param("importance_threshold", importance_threshold_p2)
    
    mlflow.log_metric("quick_train_auc", train_auc_lgb_p2)
    mlflow.log_metric("quick_val_auc", val_auc_lgb_p2)
    mlflow.log_metric("cv_mean_auc", np.mean(lgb_cv_scores_p2))
    mlflow.log_metric("cv_std_auc", np.std(lgb_cv_scores_p2))
    mlflow.log_metric("phase1_val_auc", phase1_val_auc_lgb)
    mlflow.log_metric("improvement_quick", val_auc_lgb_p2 - phase1_val_auc_lgb)
    mlflow.log_metric("improvement_cv", np.mean(lgb_cv_scores_p2) - phase1_val_auc_lgb)
    mlflow.log_metric("train_val_gap", train_auc_lgb_p2 - val_auc_lgb_p2)
    
    X_sample = X_train_selected_p2.iloc[:5].fillna(0)
    y_sample = y_train.iloc[:5]
    signature = mlflow.models.infer_signature(X_sample, y_sample)
    
    mlflow.sklearn.log_model(lgb_final_p2, "model", signature=signature, input_example=X_sample)
    
    lgb_importances_final_p2 = get_feature_importances(lgb_final_p2, selected_features_p2)
    lgb_importances_final_p2.to_csv('feature_importance_lgb_p2.csv', index=False)
    mlflow.log_artifact('feature_importance_lgb_p2.csv')
    os.remove('feature_importance_lgb_p2.csv')
    
    with open('selected_features_p2.json', 'w') as f:
        json.dump({'features': selected_features_p2}, f, indent=2)
    mlflow.log_artifact('selected_features_p2.json')
    os.remove('selected_features_p2.json')
    
    with open('dropped_features_p2.json', 'w') as f:
        json.dump({
            'dropped_missing': bb_dropped_missing,
            'dropped_variance': bb_dropped_variance,
            'dropped_correlation': bb_dropped_corr
        }, f, indent=2)
    mlflow.log_artifact('dropped_features_p2.json')
    os.remove('dropped_features_p2.json')

print("Logged LightGBM to MLflow")




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged LightGBM to MLflow


In [249]:
with mlflow.start_run(run_name="phase2_bureau_balance_xgboost"):
    mlflow.log_param("phase", "bureau_balance")
    mlflow.log_param("base_phase", "phase1_bureau")
    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 7)
    mlflow.log_param("n_phase1_features", len(phase1_selected_features))
    mlflow.log_param("n_bb_features_new", len(bb_new_features))
    mlflow.log_param("n_features_created", bb_features.shape[1] - 1)
    mlflow.log_param("n_features_after_level1", bb_features_only.shape[1])
    mlflow.log_param("n_features_final", len(selected_features_p2))
    mlflow.log_param("n_dropped_missing", len(bb_dropped_missing))
    mlflow.log_param("n_dropped_variance", len(bb_dropped_variance))
    mlflow.log_param("n_dropped_correlation", len(bb_dropped_corr))
    mlflow.log_param("importance_threshold", importance_threshold_p2)
    
    mlflow.log_metric("quick_train_auc", train_auc_xgb_p2)
    mlflow.log_metric("quick_val_auc", val_auc_xgb_p2)
    mlflow.log_metric("cv_mean_auc", np.mean(xgb_cv_scores_p2))
    mlflow.log_metric("cv_std_auc", np.std(xgb_cv_scores_p2))
    mlflow.log_metric("phase1_val_auc", phase1_val_auc_xgb)
    mlflow.log_metric("improvement_quick", val_auc_xgb_p2 - phase1_val_auc_xgb)
    mlflow.log_metric("improvement_cv", np.mean(xgb_cv_scores_p2) - phase1_val_auc_xgb)
    mlflow.log_metric("train_val_gap", train_auc_xgb_p2 - val_auc_xgb_p2)
    
    mlflow.sklearn.log_model(xgb_final_p2, "model", signature=signature, input_example=X_sample)
    
    xgb_importances_final_p2 = get_feature_importances(xgb_final_p2, selected_features_p2)
    xgb_importances_final_p2.to_csv('feature_importance_xgb_p2.csv', index=False)
    mlflow.log_artifact('feature_importance_xgb_p2.csv')
    os.remove('feature_importance_xgb_p2.csv')
    
    with open('selected_features_p2.json', 'w') as f:
        json.dump({'features': selected_features_p2}, f, indent=2)
    mlflow.log_artifact('selected_features_p2.json')
    os.remove('selected_features_p2.json')
    
    with open('dropped_features_p2.json', 'w') as f:
        json.dump({
            'dropped_missing': bb_dropped_missing,
            'dropped_variance': bb_dropped_variance,
            'dropped_correlation': bb_dropped_corr
        }, f, indent=2)
    mlflow.log_artifact('dropped_features_p2.json')
    os.remove('dropped_features_p2.json')

print("Logged XGBoost to MLflow")




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged XGBoost to MLflow


### Save Models


In [250]:
output_model_dir_p2 = '../../models/phase2_bureau_balance'
os.makedirs(output_model_dir_p2, exist_ok=True)

model_artifacts_lgb_p2 = {
    'model': lgb_final_p2,
    'selected_features': selected_features_p2,
    'categorical_features': CATEGORICAL_FEATURES,
    'cat_mappings': cat_mappings,
    'quick_val_auc': val_auc_lgb_p2,
    'cv_mean_auc': np.mean(lgb_cv_scores_p2),
    'cv_std_auc': np.std(lgb_cv_scores_p2),
    'phase1_val_auc': phase1_val_auc_lgb,
    'improvement': val_auc_lgb_p2 - phase1_val_auc_lgb
}

model_artifacts_xgb_p2 = {
    'model': xgb_final_p2,
    'selected_features': selected_features_p2,
    'categorical_features': CATEGORICAL_FEATURES,
    'cat_mappings': cat_mappings,
    'quick_val_auc': val_auc_xgb_p2,
    'cv_mean_auc': np.mean(xgb_cv_scores_p2),
    'cv_std_auc': np.std(xgb_cv_scores_p2),
    'phase1_val_auc': phase1_val_auc_xgb,
    'improvement': val_auc_xgb_p2 - phase1_val_auc_xgb
}

joblib.dump(model_artifacts_lgb_p2, f'{output_model_dir_p2}/lightgbm_v1.pkl')
joblib.dump(model_artifacts_xgb_p2, f'{output_model_dir_p2}/xgboost_v1.pkl')

print(f"Models saved to {output_model_dir_p2}/")
print(f"  - lightgbm_v1.pkl")
print(f"  - xgboost_v1.pkl")


Models saved to ../../models/phase2_bureau_balance/
  - lightgbm_v1.pkl
  - xgboost_v1.pkl


### Generate Kaggle Submissions


In [251]:
sample_submission = pd.read_csv('../../data/raw/sample_submission.csv')

lgb_preds_p2 = lgb_final_p2.predict_proba(X_test_selected_p2)[:, 1]
xgb_preds_p2 = xgb_final_p2.predict_proba(X_test_selected_p2)[:, 1]

submission_lgb_p2 = sample_submission.copy()
submission_lgb_p2['TARGET'] = lgb_preds_p2

submission_xgb_p2 = sample_submission.copy()
submission_xgb_p2['TARGET'] = xgb_preds_p2

os.makedirs('../../data/submissions', exist_ok=True)

submission_lgb_p2.to_csv('../../data/submissions/phase2_bureau_balance_lightgbm_v1.csv', index=False)
submission_xgb_p2.to_csv('../../data/submissions/phase2_bureau_balance_xgboost_v1.csv', index=False)

print("Submission files created:")
print("  phase2_bureau_balance_lightgbm_v1.csv")
print("  phase2_bureau_balance_xgboost_v1.csv")

print(f"\nLightGBM predictions - Min: {lgb_preds_p2.min():.4f}, Max: {lgb_preds_p2.max():.4f}, Mean: {lgb_preds_p2.mean():.4f}")
print(f"XGBoost predictions  - Min: {xgb_preds_p2.min():.4f}, Max: {xgb_preds_p2.max():.4f}, Mean: {xgb_preds_p2.mean():.4f}")


Submission files created:
  phase2_bureau_balance_lightgbm_v1.csv
  phase2_bureau_balance_xgboost_v1.csv

LightGBM predictions - Min: 0.0045, Max: 0.9661, Mean: 0.3758
XGBoost predictions  - Min: 0.0015, Max: 0.9817, Mean: 0.3368


### Phase 2 Summary

**Strategy:** Bureau Balance monthly status patterns linked through bureau table, with minimal incremental gain.

**Features Created:** 17-20 bureau_balance features (MONTHS_BALANCE aggregations, STATUS categories via OneHotEncoder). Level 1 filtering: No drops. Level 2 selection: Only 2 new features passed (bb_months_balance_size_mean, bb_status_1). Final: 48 features total.

**Results:** LightGBM Val 0.7664, CV 0.7549 (+0.0006 from Phase 1). XGBoost Val 0.7591, CV 0.7458. Kaggle: LightGBM Private 0.75467/Public 0.74934, XGBoost Private 0.74857/Public 0.73306.

**Top Contributors:** bb_months_balance_size_mean, bb_status_1. Most STATUS categories filtered out due to low variance.

**Insight:** Minimal impact phase. Bureau Balance added limited value. XGBoost overfitting worsened.

**Saved:** Processed data and models to phase2_bureau_balance/, tracked in MLflow, submissions generated.

---

## Phase 3: Previous Application Features

**Base:** Phase 2 selected features (48)

Add Home Credit application history patterns


### Load Phase 2 Base Features


In [252]:
with open('../../data/processed/phase2_bureau_balance/feature_metadata.json', 'r') as f:
    phase2_metadata = json.load(f)

phase2_selected_features = phase2_metadata['feature_list']
phase2_val_auc_lgb = phase2_metadata['cv_eval']['lgb_cv_mean']

print(f"Phase 2 Base Features: {len(phase2_selected_features)}")
print(f"Phase 2 LightGBM CV AUC: {phase2_val_auc_lgb:.4f}")


Phase 2 Base Features: 48
Phase 2 LightGBM CV AUC: 0.7549


### Previous Application Aggregations


In [253]:
prev = pd.read_csv('../../data/raw/previous_application.csv')
print(f"Previous Application shape: {prev.shape}")
print(f"Unique SK_ID_CURR: {prev['SK_ID_CURR'].nunique()}")

prev_agg = prev.groupby('SK_ID_CURR').agg({
    'AMT_ANNUITY': ['min', 'max', 'mean', 'std'],
    'AMT_APPLICATION': ['min', 'max', 'mean', 'std', 'sum'],
    'AMT_CREDIT': ['min', 'max', 'mean', 'std', 'sum'],
    'AMT_GOODS_PRICE': ['min', 'max', 'mean', 'std'],
    'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean', 'std'],
    'CNT_PAYMENT': ['min', 'max', 'mean', 'std']
}).reset_index()

prev_agg.columns = ['SK_ID_CURR'] + [f'prev_{col[0]}_{col[1]}' for col in prev_agg.columns[1:]]

print(f"Previous Application aggregated features: {prev_agg.shape[1] - 1}")


Previous Application shape: (1670214, 37)
Unique SK_ID_CURR: 338857
Previous Application aggregated features: 29


In [254]:
prev_status = aggregate_categorical_ohe(prev, 'SK_ID_CURR', 'NAME_CONTRACT_STATUS', 'prev_status')
prev_type = aggregate_categorical_ohe(prev, 'SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'prev_type')

prev_agg = prev_agg.merge(prev_status, on='SK_ID_CURR', how='left')
prev_agg = prev_agg.merge(prev_type, on='SK_ID_CURR', how='left')

if 'prev_status_Approved' in prev_agg.columns and 'prev_status_Refused' in prev_agg.columns:
    prev_agg['prev_approval_rate'] = prev_agg['prev_status_Approved'] / (prev_agg['prev_status_Approved'] + prev_agg['prev_status_Refused'] + 1)

prev_agg['prev_app_credit_diff'] = (prev_agg['prev_AMT_APPLICATION_mean'] - prev_agg['prev_AMT_CREDIT_mean']) / (prev_agg['prev_AMT_APPLICATION_mean'] + 1)
prev_agg['prev_app_count'] = prev.groupby('SK_ID_CURR').size().values

print(f"Total features with categorical and derived: {prev_agg.shape[1] - 1}")


Total features with categorical and derived: 40


### Level 1 Filtering


In [255]:
prev_features = prev_agg.drop(columns=['SK_ID_CURR']).select_dtypes(include=[np.number])

print(f"Before filtering: {prev_features.shape[1]} features")

prev_features, prev_dropped_missing = filter_by_missing(prev_features, threshold=0.80)
print(f"Dropped {len(prev_dropped_missing)} features (>80% missing)")

prev_features, prev_dropped_variance = remove_low_variance(prev_features, threshold=0.01)
print(f"Dropped {len(prev_dropped_variance)} features (low variance)")

prev_features, prev_dropped_corr = remove_correlated(prev_features, threshold=0.95)
print(f"Dropped {len(prev_dropped_corr)} features (high correlation)")

print(f"\nAfter Level 1 filtering: {prev_features.shape[1]} features")


Before filtering: 40 features
Dropped 0 features (>80% missing)
Dropped 1 features (low variance)
Dropped 7 features (high correlation)

After Level 1 filtering: 32 features


### Merge with Phase 2 Features + Train/Val/Test


In [256]:
prev_agg_filtered = pd.concat([prev_agg[['SK_ID_CURR']], prev_features], axis=1)

X_train_with_bureau_bb = pd.merge(
    pd.concat([df.loc[X_train.index, 'SK_ID_CURR'].reset_index(drop=True), X_train.reset_index(drop=True)], axis=1),
    bureau_agg_filtered, on='SK_ID_CURR', how='left'
)
X_train_with_bureau_bb = pd.merge(
    X_train_with_bureau_bb,
    bb_features_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])
X_train_phase3 = X_train_with_bureau_bb[phase2_selected_features].copy()

X_val_with_bureau_bb = pd.merge(
    pd.concat([df.loc[X_val.index, 'SK_ID_CURR'].reset_index(drop=True), X_val.reset_index(drop=True)], axis=1),
    bureau_agg_filtered, on='SK_ID_CURR', how='left'
)
X_val_with_bureau_bb = pd.merge(
    X_val_with_bureau_bb,
    bb_features_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])
X_val_phase3 = X_val_with_bureau_bb[phase2_selected_features].copy()

X_test_with_bureau_bb = pd.merge(
    test_df[['SK_ID_CURR'] + BASELINE_FEATURES],
    bureau_agg_filtered, on='SK_ID_CURR', how='left'
)
X_test_with_bureau_bb = pd.merge(
    X_test_with_bureau_bb,
    bb_features_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])
X_test_phase3 = X_test_with_bureau_bb[phase2_selected_features].copy()

train_ids = df.loc[X_train.index, 'SK_ID_CURR'].reset_index(drop=True)
val_ids = df.loc[X_val.index, 'SK_ID_CURR'].reset_index(drop=True)
test_ids = test_df['SK_ID_CURR'].reset_index(drop=True)

X_train_prev = pd.merge(
    pd.concat([train_ids, X_train_phase3.reset_index(drop=True)], axis=1),
    prev_agg_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

X_val_prev = pd.merge(
    pd.concat([val_ids, X_val_phase3.reset_index(drop=True)], axis=1),
    prev_agg_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

X_test_prev = pd.merge(
    pd.concat([test_ids, X_test_phase3.reset_index(drop=True)], axis=1),
    prev_agg_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

print(f"Train shape: {X_train_prev.shape}")
print(f"Val shape: {X_val_prev.shape}")
print(f"Test shape: {X_test_prev.shape}")


Train shape: (246008, 80)
Val shape: (61503, 80)
Test shape: (48744, 80)


### Preliminary Training (for Level 2 Filtering)


In [257]:
X_train_lgb_p3 = X_train_prev.copy()
X_val_lgb_p3 = X_val_prev.copy()
X_test_lgb_p3 = X_test_prev.copy()

for col in CATEGORICAL_FEATURES:
    if col in X_train_lgb_p3.columns:
        X_train_lgb_p3[col] = X_train_lgb_p3[col].map(cat_mappings[col])
        X_val_lgb_p3[col] = X_val_lgb_p3[col].map(cat_mappings[col])
        X_test_lgb_p3[col] = X_test_lgb_p3[col].map(cat_mappings[col])

lgb_prev_prelim = LGBMClassifier(
    random_state=42,
    class_weight='balanced',
    verbose=-1
)

lgb_prev_prelim.fit(
    X_train_lgb_p3, y_train,
    eval_set=[(X_val_lgb_p3, y_val)],
    eval_metric='auc'
)

train_auc_lgb_p3_prelim = roc_auc_score(y_train, lgb_prev_prelim.predict_proba(X_train_lgb_p3)[:, 1])
val_auc_lgb_p3_prelim = roc_auc_score(y_val, lgb_prev_prelim.predict_proba(X_val_lgb_p3)[:, 1])

print("Preliminary LightGBM Results:")
print(f"  Train AUC: {train_auc_lgb_p3_prelim:.4f}")
print(f"  Val AUC: {val_auc_lgb_p3_prelim:.4f}")

scale_pos_weight_p3 = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

xgb_prev_prelim = XGBClassifier(
    random_state=42,
    scale_pos_weight=scale_pos_weight_p3,
    eval_metric='auc',
    verbosity=0
)

xgb_prev_prelim.fit(
    X_train_lgb_p3, y_train,
    eval_set=[(X_val_lgb_p3, y_val)],
    verbose=False
)

train_auc_xgb_p3_prelim = roc_auc_score(y_train, xgb_prev_prelim.predict_proba(X_train_lgb_p3)[:, 1])
val_auc_xgb_p3_prelim = roc_auc_score(y_val, xgb_prev_prelim.predict_proba(X_val_lgb_p3)[:, 1])

print("\nPreliminary XGBoost Results:")
print(f"  Train AUC: {train_auc_xgb_p3_prelim:.4f}")
print(f"  Val AUC: {val_auc_xgb_p3_prelim:.4f}")


Preliminary LightGBM Results:
  Train AUC: 0.8140
  Val AUC: 0.7722

Preliminary XGBoost Results:
  Train AUC: 0.8908
  Val AUC: 0.7570


### Level 2 Filtering: Feature Importance Selection


In [258]:
lgb_importances_p3 = get_feature_importances(lgb_prev_prelim, X_train_lgb_p3.columns.tolist())
xgb_importances_p3 = get_feature_importances(xgb_prev_prelim, X_train_lgb_p3.columns.tolist())

print("\nTop 15 LightGBM Importances:")
print(lgb_importances_p3.head(15))

print("\nTop 15 XGBoost Importances:")
print(xgb_importances_p3.head(15))

importance_threshold_p3 = 20
lgb_selected_p3 = select_by_importance_threshold(lgb_importances_p3, importance_threshold_p3)
xgb_selected_p3 = select_by_importance_threshold(xgb_importances_p3, importance_threshold_p3)

print(f"\nThreshold: {importance_threshold_p3}")
print(f"LightGBM selected: {len(lgb_selected_p3)}")
print(f"XGBoost selected: {len(xgb_selected_p3)}")

selected_features_p3 = sorted(list(set(lgb_selected_p3 + xgb_selected_p3)))
print(f"\nTotal features selected (union): {len(selected_features_p3)}")



Top 15 LightGBM Importances:
                               feature  importance
7                         EXT_SOURCE_1         188
33                        EXT_SOURCE_2         154
32                        EXT_SOURCE_3         129
28                          DAYS_BIRTH         126
38                         AMT_ANNUITY         116
43                          AMT_CREDIT         107
1                        DAYS_EMPLOYED          79
8                      AMT_GOODS_PRICE          76
77                  prev_approval_rate          73
30      bureau_DAYS_CREDIT_ENDDATE_max          72
50               prev_AMT_ANNUITY_mean          70
69                prev_CNT_PAYMENT_std          70
78                prev_app_credit_diff          69
9                      DAYS_ID_PUBLISH          61
10  bureau_AMT_CREDIT_MAX_OVERDUE_mean          61

Top 15 XGBoost Importances:
                     feature  importance
32              EXT_SOURCE_3    0.059032
33              EXT_SOURCE_2    0.055702
37

### Final Model Training (with selected features)


In [259]:
X_train_selected_p3 = X_train_lgb_p3[selected_features_p3]
X_val_selected_p3 = X_val_lgb_p3[selected_features_p3]
X_test_selected_p3 = X_test_lgb_p3[selected_features_p3]

print(f"Selected features shape:")
print(f"  Train: {X_train_selected_p3.shape}")
print(f"  Val: {X_val_selected_p3.shape}")
print(f"  Test: {X_test_selected_p3.shape}")


Selected features shape:
  Train: (246008, 55)
  Val: (61503, 55)
  Test: (48744, 55)


In [260]:
%%time

lgb_final_p3 = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    random_state=42,
    class_weight='balanced',
    verbose=-1
)

lgb_final_p3.fit(
    X_train_selected_p3, y_train,
    eval_set=[(X_val_selected_p3, y_val)],
    eval_metric='auc'
)

train_auc_lgb_p3 = roc_auc_score(y_train, lgb_final_p3.predict_proba(X_train_selected_p3)[:, 1])
val_auc_lgb_p3 = roc_auc_score(y_val, lgb_final_p3.predict_proba(X_val_selected_p3)[:, 1])

print("Final LightGBM Results:")
print(f"  Train AUC: {train_auc_lgb_p3:.4f}")
print(f"  Val AUC: {val_auc_lgb_p3:.4f}")

xgb_final_p3 = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
    scale_pos_weight=scale_pos_weight_p3,
    eval_metric='auc',
    verbosity=0
)

xgb_final_p3.fit(
    X_train_selected_p3, y_train,
    eval_set=[(X_val_selected_p3, y_val)],
    verbose=False
)

train_auc_xgb_p3 = roc_auc_score(y_train, xgb_final_p3.predict_proba(X_train_selected_p3)[:, 1])
val_auc_xgb_p3 = roc_auc_score(y_val, xgb_final_p3.predict_proba(X_val_selected_p3)[:, 1])

print("\nFinal XGBoost Results:")
print(f"  Train AUC: {train_auc_xgb_p3:.4f}")
print(f"  Val AUC: {val_auc_xgb_p3:.4f}")


Final LightGBM Results:
  Train AUC: 0.8556
  Val AUC: 0.7720

Final XGBoost Results:
  Train AUC: 0.9278
  Val AUC: 0.7652
CPU times: total: 6min 10s
Wall time: 33.4 s


In [261]:
print("="*70)
print("QUICK EVALUATION (80/20 Split - Phase 2 Comparison)")
print("="*70)
print(f"{'Model':<25} {'Train AUC':<12} {'Val AUC':<12} {'Improvement':<12} {'Gap':<10}")
print("-"*70)
print(f"{'Phase 2 LightGBM':<25} {'-':<12} {phase2_val_auc_lgb:<12.4f} {'-':<12} {'-':<10}")
print(f"{'Phase 3 LightGBM':<25} {train_auc_lgb_p3:<12.4f} {val_auc_lgb_p3:<12.4f} {val_auc_lgb_p3 - phase2_val_auc_lgb:+<12.4f} {train_auc_lgb_p3 - val_auc_lgb_p3:<10.4f}")
print(f"{'Phase 3 XGBoost':<25} {train_auc_xgb_p3:<12.4f} {val_auc_xgb_p3:<12.4f} {val_auc_xgb_p3 - phase2_val_auc_lgb:+<12.4f} {train_auc_xgb_p3 - val_auc_xgb_p3:<10.4f}")
print("="*70)

prev_new_features = [f for f in selected_features_p3 if f.startswith('prev_')]
print(f"\nSelected Features:")
print(f"  Total: {len(selected_features_p3)}")
print(f"  Phase 2: {len([f for f in selected_features_p3 if f in phase2_selected_features])}")
print(f"  Previous Application (new): {len(prev_new_features)}")


QUICK EVALUATION (80/20 Split - Phase 2 Comparison)
Model                     Train AUC    Val AUC      Improvement  Gap       
----------------------------------------------------------------------
Phase 2 LightGBM          -            0.7549       -            -         
Phase 3 LightGBM          0.8556       0.7720       0.0171++++++ 0.0837    
Phase 3 XGBoost           0.9278       0.7652       0.0103++++++ 0.1627    

Selected Features:
  Total: 55
  Phase 2: 40
  Previous Application (new): 15


### Cross-Validation + Save Data + MLflow + Save Models + Submissions


### Cross-Validation (5-Fold StratifiedKFold)


In [262]:
%%time

X_full = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
y_full = pd.concat([y_train, y_val], axis=0).reset_index(drop=True)

full_ids = pd.concat([df.loc[X_train.index, 'SK_ID_CURR'], df.loc[X_val.index, 'SK_ID_CURR']]).reset_index(drop=True)

X_full_with_all = pd.merge(
    pd.concat([full_ids, X_full], axis=1),
    bureau_agg_filtered, on='SK_ID_CURR', how='left'
)
X_full_with_all = pd.merge(
    X_full_with_all,
    bb_features_filtered, on='SK_ID_CURR', how='left'
)
X_full_with_all = pd.merge(
    X_full_with_all,
    prev_agg_filtered, on='SK_ID_CURR', how='left'
).drop(columns=['SK_ID_CURR'])

X_full_selected_p3 = X_full_with_all[selected_features_p3].copy()
for col in CATEGORICAL_FEATURES:
    if col in X_full_selected_p3.columns:
        X_full_selected_p3[col] = X_full_selected_p3[col].map(cat_mappings[col])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgb_cv_scores_p3 = []
xgb_cv_scores_p3 = []

print("-" * 70)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_full_selected_p3, y_full), 1):
    X_cv_train, X_cv_val = X_full_selected_p3.iloc[train_idx], X_full_selected_p3.iloc[val_idx]
    y_cv_train, y_cv_val = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    lgb_cv = LGBMClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=7,
        num_leaves=31, random_state=42, class_weight='balanced', verbose=-1
    )
    lgb_cv.fit(X_cv_train, y_cv_train, eval_set=[(X_cv_val, y_cv_val)], eval_metric='auc')
    lgb_val_auc = roc_auc_score(y_cv_val, lgb_cv.predict_proba(X_cv_val)[:, 1])
    lgb_cv_scores_p3.append(lgb_val_auc)
    
    scale_pos_weight_cv = len(y_cv_train[y_cv_train == 0]) / len(y_cv_train[y_cv_train == 1])
    xgb_cv = XGBClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=7,
        random_state=42, scale_pos_weight=scale_pos_weight_cv,
        eval_metric='auc', verbosity=0
    )
    xgb_cv.fit(X_cv_train, y_cv_train, eval_set=[(X_cv_val, y_cv_val)], verbose=False)
    xgb_val_auc = roc_auc_score(y_cv_val, xgb_cv.predict_proba(X_cv_val)[:, 1])
    xgb_cv_scores_p3.append(xgb_val_auc)
    
    print(f"Fold {fold}: LightGBM={lgb_val_auc:.4f}  XGBoost={xgb_val_auc:.4f}")

print("-" * 70)
print(f"\nLightGBM CV: {np.mean(lgb_cv_scores_p3):.4f} ± {np.std(lgb_cv_scores_p3):.4f}")
print(f"XGBoost CV:  {np.mean(xgb_cv_scores_p3):.4f} ± {np.std(xgb_cv_scores_p3):.4f}")
print(f"\nPhase 2 LightGBM CV: {phase2_val_auc_lgb:.4f}")
print(f"LightGBM Improvement: {np.mean(lgb_cv_scores_p3) - phase2_val_auc_lgb:+.4f}")
print(f"XGBoost Improvement:  {np.mean(xgb_cv_scores_p3) - phase2_val_auc_lgb:+.4f}")


----------------------------------------------------------------------
Fold 1: LightGBM=0.7752  XGBoost=0.7687
Fold 2: LightGBM=0.7678  XGBoost=0.7616
Fold 3: LightGBM=0.7704  XGBoost=0.7652
Fold 4: LightGBM=0.7692  XGBoost=0.7609
Fold 5: LightGBM=0.7722  XGBoost=0.7639
----------------------------------------------------------------------

LightGBM CV: 0.7710 ± 0.0026
XGBoost CV:  0.7641 ± 0.0028

Phase 2 LightGBM CV: 0.7549
LightGBM Improvement: +0.0161
XGBoost Improvement:  +0.0092
CPU times: total: 30min 8s
Wall time: 2min 33s


### Save Processed Data + MLflow + Save Models + Submissions


In [283]:
output_dir_p3 = '../../data/processed/phase3_previous_application'
os.makedirs(output_dir_p3, exist_ok=True)

pd.concat([train_ids, X_train_selected_p3.reset_index(drop=True)], axis=1).to_csv(
    f'{output_dir_p3}/X_train.csv', index=False
)
pd.DataFrame(y_train).to_csv(f'{output_dir_p3}/y_train.csv', index=False)

pd.concat([val_ids, X_val_selected_p3.reset_index(drop=True)], axis=1).to_csv(
    f'{output_dir_p3}/X_val.csv', index=False
)
pd.DataFrame(y_val).to_csv(f'{output_dir_p3}/y_val.csv', index=False)

pd.concat([test_ids, X_test_selected_p3.reset_index(drop=True)], axis=1).to_csv(
    f'{output_dir_p3}/X_test.csv', index=False
)

feature_metadata_p3 = {
    'phase': 'phase3_previous_application',
    'base_phase': 'phase2_bureau_balance',
    'n_phase2_features': len(phase2_selected_features),
    'n_prev_features_created': prev_agg.shape[1] - 1,
    'n_prev_features_after_level1': prev_features.shape[1],
    'n_features_final': len(selected_features_p3),
    'feature_list': selected_features_p3,
    'phase2_features': [f for f in selected_features_p3 if f in phase2_selected_features],
    'prev_features_new': prev_new_features,
    'dropped_missing': prev_dropped_missing,
    'dropped_variance': prev_dropped_variance,
    'dropped_corr': prev_dropped_corr,
    'importance_threshold': importance_threshold_p3,
    'quick_eval': {
        'lgb_val_auc': float(val_auc_lgb_p3),
        'lgb_improvement': float(val_auc_lgb_p3 - phase2_val_auc_lgb),
        'xgb_val_auc': float(val_auc_xgb_p3),
        'xgb_improvement': float(val_auc_xgb_p3 - phase2_val_auc_lgb)
    },
    'cv_eval': {
        'lgb_cv_mean': float(np.mean(lgb_cv_scores_p3)),
        'lgb_cv_std': float(np.std(lgb_cv_scores_p3)),
        'lgb_improvement_cv': float(np.mean(lgb_cv_scores_p3) - phase2_val_auc_lgb),
        'xgb_cv_mean': float(np.mean(xgb_cv_scores_p3)),
        'xgb_cv_std': float(np.std(xgb_cv_scores_p3)),
        'xgb_improvement_cv': float(np.mean(xgb_cv_scores_p3) - phase2_val_auc_lgb)
    }
}

with open(f'{output_dir_p3}/feature_metadata.json', 'w') as f:
    json.dump(feature_metadata_p3, f, indent=2)

print(f"Saved processed data to {output_dir_p3}/")


Saved processed data to ../../data/processed/phase3_previous_application/


In [264]:
mlflow_tracking_uri = os.path.join(os.getcwd(), 'mlruns')
mlflow.set_tracking_uri(f"file:///{mlflow_tracking_uri}")
mlflow.set_experiment("feature_engineering")

with mlflow.start_run(run_name="phase3_previous_application_lightgbm"):
    mlflow.log_param("phase", "previous_application")
    mlflow.log_param("base_phase", "phase2_bureau_balance")
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 7)
    mlflow.log_param("n_phase2_features", len(phase2_selected_features))
    mlflow.log_param("n_prev_features_new", len(prev_new_features))
    mlflow.log_param("n_features_final", len(selected_features_p3))
    mlflow.log_param("importance_threshold", importance_threshold_p3)
    
    mlflow.log_metric("quick_train_auc", train_auc_lgb_p3)
    mlflow.log_metric("quick_val_auc", val_auc_lgb_p3)
    mlflow.log_metric("cv_mean_auc", np.mean(lgb_cv_scores_p3))
    mlflow.log_metric("cv_std_auc", np.std(lgb_cv_scores_p3))
    mlflow.log_metric("phase2_val_auc", phase2_val_auc_lgb)
    mlflow.log_metric("improvement_quick", val_auc_lgb_p3 - phase2_val_auc_lgb)
    mlflow.log_metric("improvement_cv", np.mean(lgb_cv_scores_p3) - phase2_val_auc_lgb)
    
    X_sample = X_train_selected_p3.iloc[:5].fillna(0)
    y_sample = y_train.iloc[:5]
    signature = mlflow.models.infer_signature(X_sample, y_sample)
    
    mlflow.sklearn.log_model(lgb_final_p3, "model", signature=signature, input_example=X_sample)
    
    lgb_importances_final_p3 = get_feature_importances(lgb_final_p3, selected_features_p3)
    lgb_importances_final_p3.to_csv('feature_importance_lgb_p3.csv', index=False)
    mlflow.log_artifact('feature_importance_lgb_p3.csv')
    os.remove('feature_importance_lgb_p3.csv')
    
    with open('selected_features_p3.json', 'w') as f:
        json.dump({'features': selected_features_p3}, f, indent=2)
    mlflow.log_artifact('selected_features_p3.json')
    os.remove('selected_features_p3.json')
    
    with open('dropped_features_p3.json', 'w') as f:
        json.dump({
            'dropped_missing': prev_dropped_missing,
            'dropped_variance': prev_dropped_variance,
            'dropped_correlation': prev_dropped_corr
        }, f, indent=2)
    mlflow.log_artifact('dropped_features_p3.json')
    os.remove('dropped_features_p3.json')

print("Logged LightGBM to MLflow")

with mlflow.start_run(run_name="phase3_previous_application_xgboost"):
    mlflow.log_param("phase", "previous_application")
    mlflow.log_param("base_phase", "phase2_bureau_balance")
    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 7)
    mlflow.log_param("n_phase2_features", len(phase2_selected_features))
    mlflow.log_param("n_prev_features_new", len(prev_new_features))
    mlflow.log_param("n_features_final", len(selected_features_p3))
    mlflow.log_param("importance_threshold", importance_threshold_p3)
    
    mlflow.log_metric("quick_train_auc", train_auc_xgb_p3)
    mlflow.log_metric("quick_val_auc", val_auc_xgb_p3)
    mlflow.log_metric("cv_mean_auc", np.mean(xgb_cv_scores_p3))
    mlflow.log_metric("cv_std_auc", np.std(xgb_cv_scores_p3))
    mlflow.log_metric("phase2_val_auc", phase2_val_auc_lgb)
    mlflow.log_metric("improvement_quick", val_auc_xgb_p3 - phase2_val_auc_lgb)
    mlflow.log_metric("improvement_cv", np.mean(xgb_cv_scores_p3) - phase2_val_auc_lgb)
    
    X_sample = X_train_selected_p3.iloc[:5].fillna(0)
    y_sample = y_train.iloc[:5]
    signature = mlflow.models.infer_signature(X_sample, y_sample)
    
    mlflow.sklearn.log_model(xgb_final_p3, "model", signature=signature, input_example=X_sample)
    
    xgb_importances_final_p3 = get_feature_importances(xgb_final_p3, selected_features_p3)
    xgb_importances_final_p3.to_csv('feature_importance_xgb_p3.csv', index=False)
    mlflow.log_artifact('feature_importance_xgb_p3.csv')
    os.remove('feature_importance_xgb_p3.csv')
    
    with open('selected_features_p3.json', 'w') as f:
        json.dump({'features': selected_features_p3}, f, indent=2)
    mlflow.log_artifact('selected_features_p3.json')
    os.remove('selected_features_p3.json')
    
    with open('dropped_features_p3.json', 'w') as f:
        json.dump({
            'dropped_missing': prev_dropped_missing,
            'dropped_variance': prev_dropped_variance,
            'dropped_correlation': prev_dropped_corr
        }, f, indent=2)
    mlflow.log_artifact('dropped_features_p3.json')
    os.remove('dropped_features_p3.json')

print("Logged XGBoost to MLflow")




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Logged LightGBM to MLflow




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged XGBoost to MLflow


In [265]:
output_model_dir_p3 = '../../models/phase3_previous_application'
os.makedirs(output_model_dir_p3, exist_ok=True)

model_artifacts_lgb_p3 = {
    'model': lgb_final_p3,
    'selected_features': selected_features_p3,
    'categorical_features': CATEGORICAL_FEATURES,
    'cat_mappings': cat_mappings,
    'quick_val_auc': val_auc_lgb_p3,
    'cv_mean_auc': np.mean(lgb_cv_scores_p3),
    'cv_std_auc': np.std(lgb_cv_scores_p3),
    'phase2_val_auc': phase2_val_auc_lgb,
    'improvement': val_auc_lgb_p3 - phase2_val_auc_lgb
}

joblib.dump(model_artifacts_lgb_p3, f'{output_model_dir_p3}/lightgbm_v1.pkl')
print(f"Model saved to {output_model_dir_p3}/lightgbm_v1.pkl")

model_artifacts_xgb_p3 = {
    'model': xgb_final_p3,
    'selected_features': selected_features_p3,
    'categorical_features': CATEGORICAL_FEATURES,
    'cat_mappings': cat_mappings,
    'quick_val_auc': val_auc_xgb_p3,
    'cv_mean_auc': np.mean(xgb_cv_scores_p3),
    'cv_std_auc': np.std(xgb_cv_scores_p3),
    'phase2_val_auc': phase2_val_auc_lgb,
    'improvement': val_auc_xgb_p3 - phase2_val_auc_lgb
}

joblib.dump(model_artifacts_xgb_p3, f'{output_model_dir_p3}/xgboost_v1.pkl')
print(f"Model saved to {output_model_dir_p3}/xgboost_v1.pkl")


Model saved to ../../models/phase3_previous_application/lightgbm_v1.pkl
Model saved to ../../models/phase3_previous_application/xgboost_v1.pkl


In [266]:
sample_submission = pd.read_csv('../../data/raw/sample_submission.csv')

lgb_preds_p3 = lgb_final_p3.predict_proba(X_test_selected_p3)[:, 1]
xgb_preds_p3 = xgb_final_p3.predict_proba(X_test_selected_p3)[:, 1]

submission_lgb_p3 = sample_submission.copy()
submission_lgb_p3['TARGET'] = lgb_preds_p3

submission_xgb_p3 = sample_submission.copy()
submission_xgb_p3['TARGET'] = xgb_preds_p3

os.makedirs('../../data/submissions', exist_ok=True)

submission_lgb_p3.to_csv('../../data/submissions/phase3_previous_application_lightgbm_v1.csv', index=False)
submission_xgb_p3.to_csv('../../data/submissions/phase3_previous_application_xgboost_v1.csv', index=False)

print("Submission files created:")
print("  phase3_previous_application_lightgbm_v1.csv")
print("  phase3_previous_application_xgboost_v1.csv")
print(f"\nLightGBM predictions - Min: {lgb_preds_p3.min():.4f}, Max: {lgb_preds_p3.max():.4f}, Mean: {lgb_preds_p3.mean():.4f}")
print(f"XGBoost predictions  - Min: {xgb_preds_p3.min():.4f}, Max: {xgb_preds_p3.max():.4f}, Mean: {xgb_preds_p3.mean():.4f}")


Submission files created:
  phase3_previous_application_lightgbm_v1.csv
  phase3_previous_application_xgboost_v1.csv

LightGBM predictions - Min: 0.0064, Max: 0.9524, Mean: 0.3682
XGBoost predictions  - Min: 0.0016, Max: 0.9737, Mean: 0.3232


### Phase 3 Summary

**Strategy:** Previous application history with approval patterns and amount differences - highest impact phase.

**Features Created:** 40 previous_application features (AMT_ANNUITY, AMT_GOODS_PRICE, CNT_PAYMENT, DAYS_DECISION, NAME_CONTRACT_STATUS/TYPE, approval_rate, app_credit_diff). Level 1 filtering: 1 variance + 7 correlation drops → 32 features. Level 2 selection: 15 new features. Final: 55 features total.

**Results:** LightGBM Val 0.7720, CV 0.7710 (+0.0161 from Phase 2 - largest gain). XGBoost Val 0.7652, CV 0.7641. Kaggle: LightGBM Private 0.76267/Public 0.75670, XGBoost Private 0.75218/Public 0.74200.

**Top Contributors:** prev_approval_rate, prev_app_credit_diff, prev_DAYS_DECISION patterns, prev_AMT_ANNUITY aggregations. Application history highly predictive.

**Insight:** Strongest phase. Approval patterns and application timing critical for credit risk.

**Saved:** Processed data and models to phase3_previous_application/, tracked in MLflow, submissions generated.

---

## Phase 4: POS & Credit Card Features

**Base:** Phase 3 selected features (55)

Add payment behavior patterns from POS and Credit Card

### Load Phase 3 Base Features


In [274]:
with open('../../data/processed/phase3_previous_application/feature_metadata.json', 'r') as f:
    phase3_metadata = json.load(f)

phase3_selected_features = phase3_metadata['feature_list']
phase3_val_auc_lgb = phase3_metadata['cv_eval']['lgb_cv_mean']

train_ids = df.loc[X_train.index, 'SK_ID_CURR'].reset_index(drop=True)
val_ids = df.loc[X_val.index, 'SK_ID_CURR'].reset_index(drop=True)
test_ids = test_df['SK_ID_CURR'].reset_index(drop=True)

train_df = pd.concat([train_ids, X_train.reset_index(drop=True)], axis=1)
val_df = pd.concat([val_ids, X_val.reset_index(drop=True)], axis=1)

print(f"Phase 3 Base Features: {len(phase3_selected_features)}")
print(f"Phase 3 LightGBM CV AUC: {phase3_val_auc_lgb:.4f}")
print(f"train_df shape: {train_df.shape}, val_df shape: {val_df.shape}, test_df shape: {test_df.shape}")


Phase 3 Base Features: 55
Phase 3 LightGBM CV AUC: 0.7710
train_df shape: (246008, 37), val_df shape: (61503, 37), test_df shape: (48744, 121)


### POS & Credit Card Aggregations


In [275]:
pos = pd.read_csv('../../data/raw/POS_CASH_balance.csv')
cc = pd.read_csv('../../data/raw/credit_card_balance.csv')

print(f"POS Cash Balance shape: {pos.shape}")
print(f"Unique SK_ID_PREV (POS): {pos['SK_ID_PREV'].nunique()}")
print(f"Credit Card Balance shape: {cc.shape}")
print(f"Unique SK_ID_PREV (CC): {cc['SK_ID_PREV'].nunique()}")


POS Cash Balance shape: (10001358, 8)
Unique SK_ID_PREV (POS): 936325
Credit Card Balance shape: (3840312, 23)
Unique SK_ID_PREV (CC): 104307


In [276]:
pos_agg = pos.groupby('SK_ID_PREV').agg({
    'MONTHS_BALANCE': ['min', 'max', 'size'],
    'CNT_INSTALMENT': ['min', 'max', 'mean', 'sum'],
    'CNT_INSTALMENT_FUTURE': ['min', 'max', 'mean', 'sum'],
    'SK_DPD': ['max', 'mean', 'sum'],
    'SK_DPD_DEF': ['max', 'mean', 'sum']
}).reset_index()

pos_agg.columns = ['SK_ID_PREV'] + [f'pos_{col[0]}_{col[1]}' for col in pos_agg.columns[1:]]

pos_status = aggregate_categorical_ohe(pos, 'SK_ID_PREV', 'NAME_CONTRACT_STATUS', 'pos_status')
pos_agg = pos_agg.merge(pos_status, on='SK_ID_PREV', how='left')

prev_to_curr = prev[['SK_ID_PREV', 'SK_ID_CURR']].drop_duplicates()
pos_by_curr = pos_agg.merge(prev_to_curr, on='SK_ID_PREV', how='left')

pos_features = pos_by_curr.drop(columns=['SK_ID_PREV']).groupby('SK_ID_CURR').agg('mean').reset_index()

print(f"POS features created: {pos_features.shape[1] - 1}")


POS features created: 26


In [277]:
cc_agg = cc.groupby('SK_ID_PREV').agg({
    'MONTHS_BALANCE': ['min', 'max', 'size'],
    'AMT_BALANCE': ['min', 'max', 'mean', 'std'],
    'AMT_CREDIT_LIMIT_ACTUAL': ['min', 'max', 'mean'],
    'AMT_DRAWINGS_ATM_CURRENT': ['max', 'mean', 'sum'],
    'AMT_DRAWINGS_CURRENT': ['max', 'mean', 'sum'],
    'AMT_PAYMENT_CURRENT': ['min', 'max', 'mean', 'sum'],
    'SK_DPD': ['max', 'mean', 'sum'],
    'SK_DPD_DEF': ['max', 'mean', 'sum']
}).reset_index()

cc_agg.columns = ['SK_ID_PREV'] + [f'cc_{col[0]}_{col[1]}' for col in cc_agg.columns[1:]]

cc_agg['cc_balance_limit_ratio'] = cc_agg['cc_AMT_BALANCE_mean'] / (cc_agg['cc_AMT_CREDIT_LIMIT_ACTUAL_mean'] + 1)
cc_agg['cc_payment_balance_ratio'] = cc_agg['cc_AMT_PAYMENT_CURRENT_mean'] / (cc_agg['cc_AMT_BALANCE_mean'] + 1)

cc_by_curr = cc_agg.merge(prev_to_curr, on='SK_ID_PREV', how='left')
cc_features = cc_by_curr.drop(columns=['SK_ID_PREV']).groupby('SK_ID_CURR').agg('mean').reset_index()

print(f"Credit Card features created: {cc_features.shape[1] - 1}")


Credit Card features created: 28


In [278]:
pos_cc_features = pos_features.merge(cc_features, on='SK_ID_CURR', how='outer')

print(f"Total POS + CC features: {pos_cc_features.shape[1] - 1}")


Total POS + CC features: 54


### Level 1 Filtering


In [279]:
pos_cc_cols = [c for c in pos_cc_features.columns if c != 'SK_ID_CURR']

pos_cc_filtered = pos_cc_features[['SK_ID_CURR'] + pos_cc_cols].copy()
pos_cc_filtered, missing_dropped = filter_by_missing(pos_cc_filtered, threshold=0.80)
pos_cc_filtered, var_dropped = remove_low_variance(pos_cc_filtered, threshold=0.01)
pos_cc_filtered, corr_dropped = remove_correlated(pos_cc_filtered, threshold=0.95)

print(f"POS+CC features after Level 1 filtering: {len(pos_cc_filtered.columns) - 1}")
print(f"  Dropped by missing: {len(missing_dropped)}")
print(f"  Dropped by low variance: {len(var_dropped)}")
print(f"  Dropped by correlation: {len(corr_dropped)}")


POS+CC features after Level 1 filtering: 33
  Dropped by missing: 6
  Dropped by low variance: 5
  Dropped by correlation: 10


### Load Phase 3 Processed Data & Merge with POS/CC Features


In [284]:
X_train_p3_df = pd.read_csv('../../data/processed/phase3_previous_application/X_train.csv')
X_val_p3_df = pd.read_csv('../../data/processed/phase3_previous_application/X_val.csv')
X_test_p3_df = pd.read_csv('../../data/processed/phase3_previous_application/X_test.csv')

train_df_p4 = X_train_p3_df.merge(pos_cc_filtered, on='SK_ID_CURR', how='left')
val_df_p4 = X_val_p3_df.merge(pos_cc_filtered, on='SK_ID_CURR', how='left')
test_df_p4 = X_test_p3_df.merge(pos_cc_filtered, on='SK_ID_CURR', how='left')

X_train_pos_cc = train_df_p4.drop(columns=['SK_ID_CURR'])
X_val_pos_cc = val_df_p4.drop(columns=['SK_ID_CURR'])
X_test_pos_cc = test_df_p4.drop(columns=['SK_ID_CURR'])

print(f"X_train shape: {X_train_pos_cc.shape}")
print(f"X_val shape: {X_val_pos_cc.shape}")
print(f"X_test shape: {X_test_pos_cc.shape}")


X_train shape: (246008, 88)
X_val shape: (61503, 88)
X_test shape: (48744, 88)


### Preliminary Training (for Level 2 Filtering)


In [285]:
%%time
X_train_lgb_p4 = X_train_pos_cc.copy()
X_val_lgb_p4 = X_val_pos_cc.copy()
X_test_lgb_p4 = X_test_pos_cc.copy()

for col in CATEGORICAL_FEATURES:
    if col in X_train_lgb_p4.columns:
        X_train_lgb_p4[col] = X_train_lgb_p4[col].map(cat_mappings[col]).fillna(-1).astype(int)
        X_val_lgb_p4[col] = X_val_lgb_p4[col].map(cat_mappings[col]).fillna(-1).astype(int)
        X_test_lgb_p4[col] = X_test_lgb_p4[col].map(cat_mappings[col]).fillna(-1).astype(int)

lgb_pos_cc_prelim = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    random_state=42,
    class_weight='balanced',
    verbose=-1
)

xgb_pos_cc_prelim = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
    scale_pos_weight=(len(y_train) - y_train.sum()) / y_train.sum(),
    eval_metric='logloss',
    early_stopping_rounds=50
)

lgb_pos_cc_prelim.fit(X_train_lgb_p4, y_train, eval_set=[(X_val_lgb_p4, y_val)])
xgb_pos_cc_prelim.fit(X_train_lgb_p4, y_train, eval_set=[(X_val_lgb_p4, y_val)], verbose=False)

lgb_train_auc = roc_auc_score(y_train, lgb_pos_cc_prelim.predict_proba(X_train_lgb_p4)[:, 1])
lgb_val_auc = roc_auc_score(y_val, lgb_pos_cc_prelim.predict_proba(X_val_lgb_p4)[:, 1])

xgb_train_auc = roc_auc_score(y_train, xgb_pos_cc_prelim.predict_proba(X_train_lgb_p4)[:, 1])
xgb_val_auc = roc_auc_score(y_val, xgb_pos_cc_prelim.predict_proba(X_val_lgb_p4)[:, 1])

print(f"Preliminary LightGBM - Train AUC: {lgb_train_auc:.4f}, Val AUC: {lgb_val_auc:.4f}")
print(f"Preliminary XGBoost  - Train AUC: {xgb_train_auc:.4f}, Val AUC: {xgb_val_auc:.4f}")


Preliminary LightGBM - Train AUC: 0.8566, Val AUC: 0.7730
Preliminary XGBoost  - Train AUC: 0.9211, Val AUC: 0.7649
CPU times: total: 6min 48s
Wall time: 36.8 s


### Level 2 Filtering: Feature Importance Selection


In [286]:
lgb_importances_p4 = get_feature_importances(lgb_pos_cc_prelim, X_train_lgb_p4.columns)
xgb_importances_p4 = get_feature_importances(xgb_pos_cc_prelim, X_train_lgb_p4.columns)

importance_threshold = 20

lgb_selected_p4 = select_by_importance_threshold(lgb_importances_p4, threshold=importance_threshold)
xgb_selected_p4 = select_by_importance_threshold(xgb_importances_p4, threshold=importance_threshold)

selected_features_p4 = list(set(lgb_selected_p4) | set(xgb_selected_p4))

print(f"Features selected by LightGBM (>{importance_threshold}): {len(lgb_selected_p4)}")
print(f"Features selected by XGBoost (>{importance_threshold}): {len(xgb_selected_p4)}")
print(f"Union of selected features: {len(selected_features_p4)}")


Features selected by LightGBM (>20): 76
Features selected by XGBoost (>20): 0
Union of selected features: 76


### Final Model Training (with selected features)


In [287]:
%%time

X_train_selected_p4 = X_train_lgb_p4[selected_features_p4]
X_val_selected_p4 = X_val_lgb_p4[selected_features_p4]
X_test_selected_p4 = X_test_lgb_p4[selected_features_p4]

lgb_final_p4 = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    random_state=42,
    class_weight='balanced',
    verbose=-1
)

xgb_final_p4 = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
    scale_pos_weight=(len(y_train) - y_train.sum()) / y_train.sum(),
    eval_metric='logloss',
    early_stopping_rounds=50
)

lgb_final_p4.fit(X_train_selected_p4, y_train, eval_set=[(X_val_selected_p4, y_val)])
xgb_final_p4.fit(X_train_selected_p4, y_train, eval_set=[(X_val_selected_p4, y_val)], verbose=False)

train_auc_lgb_p4 = roc_auc_score(y_train, lgb_final_p4.predict_proba(X_train_selected_p4)[:, 1])
val_auc_lgb_p4 = roc_auc_score(y_val, lgb_final_p4.predict_proba(X_val_selected_p4)[:, 1])

train_auc_xgb_p4 = roc_auc_score(y_train, xgb_final_p4.predict_proba(X_train_selected_p4)[:, 1])
val_auc_xgb_p4 = roc_auc_score(y_val, xgb_final_p4.predict_proba(X_val_selected_p4)[:, 1])

print(f"Final LightGBM - Train AUC: {train_auc_lgb_p4:.4f}, Val AUC: {val_auc_lgb_p4:.4f}")
print(f"Final XGBoost  - Train AUC: {train_auc_xgb_p4:.4f}, Val AUC: {val_auc_xgb_p4:.4f}")


Final LightGBM - Train AUC: 0.8569, Val AUC: 0.7735
Final XGBoost  - Train AUC: 0.9224, Val AUC: 0.7646
CPU times: total: 6min 34s
Wall time: 38.6 s


### Quick Evaluation (80/20 Split - Phase 3 Comparison)


In [288]:
comparison_p4 = pd.DataFrame({
    'Model': ['LightGBM', 'XGBoost'],
    'Train AUC': [train_auc_lgb_p4, train_auc_xgb_p4],
    'Val AUC': [val_auc_lgb_p4, val_auc_xgb_p4],
    'Phase3 Val AUC': [phase3_val_auc_lgb, phase3_val_auc_lgb],
    'Improvement': [val_auc_lgb_p4 - phase3_val_auc_lgb, val_auc_xgb_p4 - phase3_val_auc_lgb],
    'Train-Val Gap': [train_auc_lgb_p4 - val_auc_lgb_p4, train_auc_xgb_p4 - val_auc_xgb_p4]
})

print("\n" + "="*70)
print("QUICK EVALUATION (80/20 Split)")
print("="*70)
print(comparison_p4.to_string(index=False))
print("="*70)



QUICK EVALUATION (80/20 Split)
   Model  Train AUC  Val AUC  Phase3 Val AUC  Improvement  Train-Val Gap
LightGBM   0.856909 0.773550        0.770963     0.002586       0.083360
 XGBoost   0.922432 0.764555        0.770963    -0.006408       0.157877


### Cross-Validation (5-Fold StratifiedKFold)


In [289]:
%%time

X_full = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
y_full = pd.concat([y_train, y_val], axis=0).reset_index(drop=True)

X_train_p3_full = pd.read_csv('../../data/processed/phase3_previous_application/X_train.csv')
X_val_p3_full = pd.read_csv('../../data/processed/phase3_previous_application/X_val.csv')
X_full_p3 = pd.concat([X_train_p3_full, X_val_p3_full], axis=0).reset_index(drop=True)

X_full_p4 = X_full_p3.merge(pos_cc_filtered, on='SK_ID_CURR', how='left').drop(columns=['SK_ID_CURR'])

for col in CATEGORICAL_FEATURES:
    if col in X_full_p4.columns:
        X_full_p4[col] = X_full_p4[col].map(cat_mappings[col]).fillna(-1).astype(int)

X_full_selected_p4 = X_full_p4[selected_features_p4]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgb_cv_scores_p4 = []
xgb_cv_scores_p4 = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_full_selected_p4, y_full), 1):
    X_tr, X_va = X_full_selected_p4.iloc[train_idx], X_full_selected_p4.iloc[val_idx]
    y_tr, y_va = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    lgb_cv = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=7,
        num_leaves=31,
        random_state=42,
        class_weight='balanced',
        verbose=-1
    )
    
    xgb_cv = XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=7,
        random_state=42,
        scale_pos_weight=(len(y_tr) - y_tr.sum()) / y_tr.sum(),
        eval_metric='logloss',
        early_stopping_rounds=50
    )
    
    lgb_cv.fit(X_tr, y_tr, eval_set=[(X_va, y_va)])
    xgb_cv.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    
    lgb_va_auc = roc_auc_score(y_va, lgb_cv.predict_proba(X_va)[:, 1])
    xgb_va_auc = roc_auc_score(y_va, xgb_cv.predict_proba(X_va)[:, 1])
    
    lgb_cv_scores_p4.append(lgb_va_auc)
    xgb_cv_scores_p4.append(xgb_va_auc)

print("\n" + "="*70)
print("CROSS-VALIDATION (5-Fold)")
print("="*70)
print(f"LightGBM CV Scores: {lgb_cv_scores_p4}")
print(f"LightGBM Mean: {np.mean(lgb_cv_scores_p4):.4f} ± {np.std(lgb_cv_scores_p4):.4f}")
print(f"Phase 3 LightGBM CV: {phase3_val_auc_lgb:.4f}")
print(f"Improvement: {np.mean(lgb_cv_scores_p4) - phase3_val_auc_lgb:+.4f}")
print("-"*70)
print(f"XGBoost CV Scores: {xgb_cv_scores_p4}")
print(f"XGBoost Mean: {np.mean(xgb_cv_scores_p4):.4f} ± {np.std(xgb_cv_scores_p4):.4f}")
print(f"Phase 3 XGBoost: N/A (using LightGBM baseline)")
print(f"Improvement vs LightGBM: {np.mean(xgb_cv_scores_p4) - phase3_val_auc_lgb:+.4f}")
print("="*70)



CROSS-VALIDATION (5-Fold)
LightGBM CV Scores: [0.7774428035763592, 0.768374578680938, 0.7686018554523939, 0.7709351241451896, 0.7728685441777108]
LightGBM Mean: 0.7716 ± 0.0033
Phase 3 LightGBM CV: 0.7710
Improvement: +0.0007
----------------------------------------------------------------------
XGBoost CV Scores: [0.7694817915510809, 0.7616670158751924, 0.7612263879239861, 0.7623781276940422, 0.76232325537656]
XGBoost Mean: 0.7634 ± 0.0031
Phase 3 XGBoost: N/A (using LightGBM baseline)
Improvement vs LightGBM: -0.0075
CPU times: total: 30min 22s
Wall time: 2min 44s


### Save Processed Data


In [291]:
output_dir_p4 = '../../data/processed/phase4_pos_cc'
os.makedirs(output_dir_p4, exist_ok=True)

train_df_p4[['SK_ID_CURR'] + selected_features_p4].to_csv(f'{output_dir_p4}/X_train.csv', index=False)
pd.DataFrame(y_train).to_csv(f'{output_dir_p4}/y_train.csv', index=False)

val_df_p4[['SK_ID_CURR'] + selected_features_p4].to_csv(f'{output_dir_p4}/X_val.csv', index=False)
pd.DataFrame(y_val).to_csv(f'{output_dir_p4}/y_val.csv', index=False)

test_df_p4[['SK_ID_CURR'] + selected_features_p4].to_csv(f'{output_dir_p4}/X_test.csv', index=False)

feature_metadata_p4 = {
    'phase': 4,
    'base_phase': 3,
    'base_features': len(phase3_selected_features),
    'pos_cc_features_created': len(pos_cc_cols),
    'pos_cc_features_after_l1': len(pos_cc_filtered.columns) - 1,
    'features_dropped_missing': missing_dropped,
    'features_dropped_variance': var_dropped,
    'features_dropped_correlation': corr_dropped,
    'importance_threshold': importance_threshold,
    'features_selected_lgb': len(lgb_selected_p4),
    'features_selected_xgb': len(xgb_selected_p4),
    'feature_list': selected_features_p4,
    'quick_eval': {
        'lgb_train_auc': train_auc_lgb_p4,
        'lgb_val_auc': val_auc_lgb_p4,
        'xgb_train_auc': train_auc_xgb_p4,
        'xgb_val_auc': val_auc_xgb_p4,
        'phase3_baseline': phase3_val_auc_lgb
    },
    'cv_eval': {
        'lgb_cv_scores': lgb_cv_scores_p4,
        'lgb_cv_mean': np.mean(lgb_cv_scores_p4),
        'lgb_cv_std': np.std(lgb_cv_scores_p4),
        'xgb_cv_scores': xgb_cv_scores_p4,
        'xgb_cv_mean': np.mean(xgb_cv_scores_p4),
        'xgb_cv_std': np.std(xgb_cv_scores_p4)
    }
}

with open(f'{output_dir_p4}/feature_metadata.json', 'w') as f:
    json.dump(feature_metadata_p4, f, indent=2)

print(f"Processed data saved to {output_dir_p4}/")


Processed data saved to ../../data/processed/phase4_pos_cc/


### MLflow Tracking


In [299]:
mlflow_tracking_uri = os.path.join(os.getcwd(), 'mlruns')
mlflow.set_tracking_uri(f"file:///{mlflow_tracking_uri}")
mlflow.set_experiment("feature_engineering")

with mlflow.start_run(run_name="phase4_pos_cc_lightgbm"):
    mlflow.log_param("phase", "pos_cc")
    mlflow.log_param("base_phase", "phase3_previous_application")
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 7)
    mlflow.log_param("num_leaves", 31)
    mlflow.log_param("n_phase3_features", len(phase3_selected_features))
    mlflow.log_param("n_pos_cc_features_created", len(pos_cc_cols))
    mlflow.log_param("n_pos_cc_features_after_l1", len(pos_cc_filtered.columns) - 1)
    mlflow.log_param("n_features_final", len(selected_features_p4))
    mlflow.log_param("importance_threshold", importance_threshold)
    
    mlflow.log_metric("quick_train_auc", train_auc_lgb_p4)
    mlflow.log_metric("quick_val_auc", val_auc_lgb_p4)
    mlflow.log_metric("cv_mean_auc", np.mean(lgb_cv_scores_p4))
    mlflow.log_metric("cv_std_auc", np.std(lgb_cv_scores_p4))
    mlflow.log_metric("phase3_val_auc", phase3_val_auc_lgb)
    mlflow.log_metric("improvement_quick", val_auc_lgb_p4 - phase3_val_auc_lgb)
    mlflow.log_metric("improvement_cv", np.mean(lgb_cv_scores_p4) - phase3_val_auc_lgb)
    
    X_sample = X_train_selected_p4.iloc[:5].fillna(0)
    y_sample = y_train.iloc[:5]
    signature = infer_signature(X_sample, y_sample)
    
    mlflow.sklearn.log_model(lgb_final_p4, "model", signature=signature, input_example=X_sample)
    
    lgb_imp_df = get_feature_importances(lgb_final_p4, selected_features_p4)
    lgb_imp_df.to_csv('feature_importance_lgb_p4.csv', index=False)
    mlflow.log_artifact('feature_importance_lgb_p4.csv')
    os.remove('feature_importance_lgb_p4.csv')
    
    with open('selected_features_p4.json', 'w') as f:
        json.dump({'features': selected_features_p4}, f, indent=2)
    mlflow.log_artifact('selected_features_p4.json')
    os.remove('selected_features_p4.json')
    
    with open('dropped_features_p4.json', 'w') as f:
        json.dump({
            'dropped_missing': missing_dropped,
            'dropped_variance': var_dropped,
            'dropped_correlation': corr_dropped
        }, f, indent=2)
    mlflow.log_artifact('dropped_features_p4.json')
    os.remove('dropped_features_p4.json')

print("Logged LightGBM to MLflow")

with mlflow.start_run(run_name="phase4_pos_cc_xgboost"):
    mlflow.log_param("phase", "pos_cc")
    mlflow.log_param("base_phase", "phase3_previous_application")
    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 7)
    mlflow.log_param("n_phase3_features", len(phase3_selected_features))
    mlflow.log_param("n_pos_cc_features_created", len(pos_cc_cols))
    mlflow.log_param("n_pos_cc_features_after_l1", len(pos_cc_filtered.columns) - 1)
    mlflow.log_param("n_features_final", len(selected_features_p4))
    mlflow.log_param("importance_threshold", importance_threshold)
    
    mlflow.log_metric("quick_train_auc", train_auc_xgb_p4)
    mlflow.log_metric("quick_val_auc", val_auc_xgb_p4)
    mlflow.log_metric("cv_mean_auc", np.mean(xgb_cv_scores_p4))
    mlflow.log_metric("cv_std_auc", np.std(xgb_cv_scores_p4))
    mlflow.log_metric("phase3_val_auc", phase3_val_auc_lgb)
    mlflow.log_metric("improvement_quick", val_auc_xgb_p4 - phase3_val_auc_lgb)
    mlflow.log_metric("improvement_cv", np.mean(xgb_cv_scores_p4) - phase3_val_auc_lgb)
    
    X_sample = X_train_selected_p4.iloc[:5].fillna(0)
    y_sample = y_train.iloc[:5]
    signature = infer_signature(X_sample, y_sample)
    
    mlflow.sklearn.log_model(xgb_final_p4, "model", signature=signature, input_example=X_sample)
    
    xgb_imp_df = get_feature_importances(xgb_final_p4, selected_features_p4)
    xgb_imp_df.to_csv('feature_importance_xgb_p4.csv', index=False)
    mlflow.log_artifact('feature_importance_xgb_p4.csv')
    os.remove('feature_importance_xgb_p4.csv')
    
    with open('selected_features_p4.json', 'w') as f:
        json.dump({'features': selected_features_p4}, f, indent=2)
    mlflow.log_artifact('selected_features_p4.json')
    os.remove('selected_features_p4.json')
    
    with open('dropped_features_p4.json', 'w') as f:
        json.dump({
            'dropped_missing': missing_dropped,
            'dropped_variance': var_dropped,
            'dropped_correlation': corr_dropped
        }, f, indent=2)
    mlflow.log_artifact('dropped_features_p4.json')
    os.remove('dropped_features_p4.json')

print("Logged XGBoost to MLflow")




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Logged LightGBM to MLflow




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged XGBoost to MLflow


### Save Models


In [295]:
output_model_dir_p4 = '../../models/phase4_pos_cc'
os.makedirs(output_model_dir_p4, exist_ok=True)

model_artifacts_lgb_p4 = {
    'model': lgb_final_p4,
    'selected_features': selected_features_p4,
    'categorical_features': CATEGORICAL_FEATURES,
    'cat_mappings': cat_mappings,
    'quick_val_auc': val_auc_lgb_p4,
    'cv_mean_auc': np.mean(lgb_cv_scores_p4),
    'cv_std_auc': np.std(lgb_cv_scores_p4),
    'phase3_val_auc': phase3_val_auc_lgb,
    'improvement': val_auc_lgb_p4 - phase3_val_auc_lgb
}

joblib.dump(model_artifacts_lgb_p4, f'{output_model_dir_p4}/lightgbm_v1.pkl')
print(f"Model saved to {output_model_dir_p4}/lightgbm_v1.pkl")

model_artifacts_xgb_p4 = {
    'model': xgb_final_p4,
    'selected_features': selected_features_p4,
    'categorical_features': CATEGORICAL_FEATURES,
    'cat_mappings': cat_mappings,
    'quick_val_auc': val_auc_xgb_p4,
    'cv_mean_auc': np.mean(xgb_cv_scores_p4),
    'cv_std_auc': np.std(xgb_cv_scores_p4),
    'phase3_val_auc': phase3_val_auc_lgb,
    'improvement': val_auc_xgb_p4 - phase3_val_auc_lgb
}

joblib.dump(model_artifacts_xgb_p4, f'{output_model_dir_p4}/xgboost_v1.pkl')
print(f"Model saved to {output_model_dir_p4}/xgboost_v1.pkl")


Model saved to ../../models/phase4_pos_cc/lightgbm_v1.pkl
Model saved to ../../models/phase4_pos_cc/xgboost_v1.pkl


### Generate Kaggle Submissions


In [296]:
sample_submission = pd.read_csv('../../data/raw/sample_submission.csv')

lgb_preds_p4 = lgb_final_p4.predict_proba(X_test_selected_p4)[:, 1]
xgb_preds_p4 = xgb_final_p4.predict_proba(X_test_selected_p4)[:, 1]

submission_lgb_p4 = sample_submission.copy()
submission_lgb_p4['TARGET'] = lgb_preds_p4

submission_xgb_p4 = sample_submission.copy()
submission_xgb_p4['TARGET'] = xgb_preds_p4

os.makedirs('../../data/submissions', exist_ok=True)

submission_lgb_p4.to_csv('../../data/submissions/phase4_pos_cc_lightgbm_v1.csv', index=False)
submission_xgb_p4.to_csv('../../data/submissions/phase4_pos_cc_xgboost_v1.csv', index=False)

print("Submission files created:")
print("  phase4_pos_cc_lightgbm_v1.csv")
print("  phase4_pos_cc_xgboost_v1.csv")
print(f"\nLightGBM predictions - Min: {lgb_preds_p4.min():.4f}, Max: {lgb_preds_p4.max():.4f}, Mean: {lgb_preds_p4.mean():.4f}")
print(f"XGBoost predictions  - Min: {xgb_preds_p4.min():.4f}, Max: {xgb_preds_p4.max():.4f}, Mean: {xgb_preds_p4.mean():.4f}")


Submission files created:
  phase4_pos_cc_lightgbm_v1.csv
  phase4_pos_cc_xgboost_v1.csv

LightGBM predictions - Min: 0.0052, Max: 0.9574, Mean: 0.3612
XGBoost predictions  - Min: 0.0009, Max: 0.9638, Mean: 0.3177


### Phase 4 Summary

**Strategy:** Payment behavior from POS Cash Balance and Credit Card - credit utilization and late payment patterns.

**Features Created:** 54 POS/CC features (CNT_INSTALMENT, SK_DPD, AMT_BALANCE, AMT_CREDIT_LIMIT, AMT_DRAWINGS, AMT_PAYMENT, balance_limit_ratio, payment_balance_ratio). Level 1 filtering: 6 missing + 5 variance + 10 correlation drops → 33 features. Level 2 selection (LightGBM only): 21 new features. Final: 76 features total.

**Results:** LightGBM Val 0.7735, CV 0.7716 (+0.0007 from Phase 3 - minimal gain). XGBoost Val 0.7646, CV 0.7634. Kaggle: LightGBM Private 0.76491/Public 0.75697, XGBoost Private 0.75626/Public 0.75002.

**Top Contributors:** cc_balance_limit_ratio, pos_SK_DPD metrics, cc_AMT_PAYMENT aggregations, pos_CNT_INSTALMENT_FUTURE. Credit utilization ratios valuable.

**Insight:** Moderate impact. Payment behavior adds incremental value. 

**Saved:** Processed data and models to phase4_pos_cc/, tracked in MLflow, submissions generated.

---

## Phase 5: Installments Payments Features

**Base:** Phase 4 selected features

 Add payment discipline patterns from installment history

 Payment timing, amounts, and delay patterns


### Load Phase 4 Base Features


In [300]:
with open('../../data/processed/phase4_pos_cc/feature_metadata.json', 'r') as f:
    phase4_metadata = json.load(f)

phase4_selected_features = phase4_metadata['feature_list']
phase4_val_auc_lgb = phase4_metadata['cv_eval']['lgb_cv_mean']

print(f"Phase 4 Base Features: {len(phase4_selected_features)}")
print(f"Phase 4 LightGBM CV AUC: {phase4_val_auc_lgb:.4f}")


Phase 4 Base Features: 76
Phase 4 LightGBM CV AUC: 0.7716


### Installments Payments Aggregations


In [301]:
inst = pd.read_csv('../../data/raw/installments_payments.csv')

print(f"Installments Payments shape: {inst.shape}")
print(f"Unique SK_ID_PREV: {inst['SK_ID_PREV'].nunique()}")


Installments Payments shape: (13605401, 8)
Unique SK_ID_PREV: 997752


In [302]:
inst['inst_payment_delay'] = inst['DAYS_ENTRY_PAYMENT'] - inst['DAYS_INSTALMENT']
inst['inst_payment_diff'] = inst['AMT_PAYMENT'] - inst['AMT_INSTALMENT']
inst['inst_payment_ratio'] = inst['AMT_PAYMENT'] / (inst['AMT_INSTALMENT'] + 1)

inst_agg = inst.groupby('SK_ID_PREV').agg({
    'NUM_INSTALMENT_VERSION': ['max', 'nunique'],
    'NUM_INSTALMENT_NUMBER': ['max', 'mean'],
    'DAYS_INSTALMENT': ['min', 'max', 'mean'],
    'DAYS_ENTRY_PAYMENT': ['min', 'max', 'mean'],
    'AMT_INSTALMENT': ['min', 'max', 'mean', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
    'inst_payment_delay': ['max', 'mean', 'sum'],
    'inst_payment_diff': ['min', 'max', 'mean', 'sum'],
    'inst_payment_ratio': ['min', 'max', 'mean']
}).reset_index()

inst_agg.columns = ['SK_ID_PREV'] + [f'inst_{col[0]}_{col[1]}' for col in inst_agg.columns[1:]]

inst_agg['inst_late_payment_count'] = (inst.groupby('SK_ID_PREV')['inst_payment_delay'].apply(lambda x: (x > 0).sum())).values
inst_agg['inst_late_payment_ratio'] = inst_agg['inst_late_payment_count'] / (inst_agg['inst_NUM_INSTALMENT_NUMBER_max'] + 1)

prev_to_curr = prev[['SK_ID_PREV', 'SK_ID_CURR']].drop_duplicates()
inst_by_curr = inst_agg.merge(prev_to_curr, on='SK_ID_PREV', how='left')
inst_features = inst_by_curr.drop(columns=['SK_ID_PREV']).groupby('SK_ID_CURR').agg('mean').reset_index()

print(f"Installments features created: {inst_features.shape[1] - 1}")


Installments features created: 30


### Level 1 Filtering


In [303]:
inst_cols = [c for c in inst_features.columns if c != 'SK_ID_CURR']

inst_filtered = inst_features[['SK_ID_CURR'] + inst_cols].copy()
inst_filtered, inst_missing_dropped = filter_by_missing(inst_filtered, threshold=0.80)
inst_filtered, inst_var_dropped = remove_low_variance(inst_filtered, threshold=0.01)
inst_filtered, inst_corr_dropped = remove_correlated(inst_filtered, threshold=0.95)

print(f"Installments features after Level 1 filtering: {len(inst_filtered.columns) - 1}")
print(f"  Dropped by missing: {len(inst_missing_dropped)}")
print(f"  Dropped by low variance: {len(inst_var_dropped)}")
print(f"  Dropped by correlation: {len(inst_corr_dropped)}")


Installments features after Level 1 filtering: 19
  Dropped by missing: 0
  Dropped by low variance: 0
  Dropped by correlation: 11


### Load Phase 4 Processed Data & Merge with Installments Features


In [304]:
X_train_p4_df = pd.read_csv('../../data/processed/phase4_pos_cc/X_train.csv')
X_val_p4_df = pd.read_csv('../../data/processed/phase4_pos_cc/X_val.csv')
X_test_p4_df = pd.read_csv('../../data/processed/phase4_pos_cc/X_test.csv')

train_df_p5 = X_train_p4_df.merge(inst_filtered, on='SK_ID_CURR', how='left')
val_df_p5 = X_val_p4_df.merge(inst_filtered, on='SK_ID_CURR', how='left')
test_df_p5 = X_test_p4_df.merge(inst_filtered, on='SK_ID_CURR', how='left')

X_train_inst = train_df_p5.drop(columns=['SK_ID_CURR'])
X_val_inst = val_df_p5.drop(columns=['SK_ID_CURR'])
X_test_inst = test_df_p5.drop(columns=['SK_ID_CURR'])

print(f"X_train shape: {X_train_inst.shape}")
print(f"X_val shape: {X_val_inst.shape}")
print(f"X_test shape: {X_test_inst.shape}")


X_train shape: (246008, 95)
X_val shape: (61503, 95)
X_test shape: (48744, 95)


### Preliminary Training (for Level 2 Filtering)


In [305]:
%%time
X_train_lgb_p5 = X_train_inst.copy()
X_val_lgb_p5 = X_val_inst.copy()
X_test_lgb_p5 = X_test_inst.copy()

for col in CATEGORICAL_FEATURES:
    if col in X_train_lgb_p5.columns:
        X_train_lgb_p5[col] = X_train_lgb_p5[col].map(cat_mappings[col]).fillna(-1).astype(int)
        X_val_lgb_p5[col] = X_val_lgb_p5[col].map(cat_mappings[col]).fillna(-1).astype(int)
        X_test_lgb_p5[col] = X_test_lgb_p5[col].map(cat_mappings[col]).fillna(-1).astype(int)

lgb_inst_prelim = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    random_state=42,
    class_weight='balanced',
    verbose=-1
)

xgb_inst_prelim = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
    scale_pos_weight=(len(y_train) - y_train.sum()) / y_train.sum(),
    eval_metric='logloss',
    early_stopping_rounds=50
)

lgb_inst_prelim.fit(X_train_lgb_p5, y_train, eval_set=[(X_val_lgb_p5, y_val)])
xgb_inst_prelim.fit(X_train_lgb_p5, y_train, eval_set=[(X_val_lgb_p5, y_val)], verbose=False)

lgb_train_auc = roc_auc_score(y_train, lgb_inst_prelim.predict_proba(X_train_lgb_p5)[:, 1])
lgb_val_auc = roc_auc_score(y_val, lgb_inst_prelim.predict_proba(X_val_lgb_p5)[:, 1])

xgb_train_auc = roc_auc_score(y_train, xgb_inst_prelim.predict_proba(X_train_lgb_p5)[:, 1])
xgb_val_auc = roc_auc_score(y_val, xgb_inst_prelim.predict_proba(X_val_lgb_p5)[:, 1])

print(f"Preliminary LightGBM - Train AUC: {lgb_train_auc:.4f}, Val AUC: {lgb_val_auc:.4f}")
print(f"Preliminary XGBoost  - Train AUC: {xgb_train_auc:.4f}, Val AUC: {xgb_val_auc:.4f}")


Preliminary LightGBM - Train AUC: 0.8624, Val AUC: 0.7765
Preliminary XGBoost  - Train AUC: 0.9296, Val AUC: 0.7677
CPU times: total: 7min 53s
Wall time: 43.9 s


### Level 2 Filtering: Feature Importance Selection


In [306]:
lgb_importances_p5 = get_feature_importances(lgb_inst_prelim, X_train_lgb_p5.columns)
xgb_importances_p5 = get_feature_importances(xgb_inst_prelim, X_train_lgb_p5.columns)

importance_threshold = 20

lgb_selected_p5 = select_by_importance_threshold(lgb_importances_p5, threshold=importance_threshold)
xgb_selected_p5 = select_by_importance_threshold(xgb_importances_p5, threshold=importance_threshold)

selected_features_p5 = list(set(lgb_selected_p5) | set(xgb_selected_p5))

print(f"Features selected by LightGBM (>{importance_threshold}): {len(lgb_selected_p5)}")
print(f"Features selected by XGBoost (>{importance_threshold}): {len(xgb_selected_p5)}")
print(f"Union of selected features: {len(selected_features_p5)}")


Features selected by LightGBM (>20): 95
Features selected by XGBoost (>20): 0
Union of selected features: 95


### Final Model Training (with selected features)


In [307]:
%%time
X_train_selected_p5 = X_train_lgb_p5[selected_features_p5]
X_val_selected_p5 = X_val_lgb_p5[selected_features_p5]
X_test_selected_p5 = X_test_lgb_p5[selected_features_p5]

lgb_final_p5 = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    random_state=42,
    class_weight='balanced',
    verbose=-1
)

xgb_final_p5 = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
    scale_pos_weight=(len(y_train) - y_train.sum()) / y_train.sum(),
    eval_metric='logloss',
    early_stopping_rounds=50
)

lgb_final_p5.fit(X_train_selected_p5, y_train, eval_set=[(X_val_selected_p5, y_val)])
xgb_final_p5.fit(X_train_selected_p5, y_train, eval_set=[(X_val_selected_p5, y_val)], verbose=False)

train_auc_lgb_p5 = roc_auc_score(y_train, lgb_final_p5.predict_proba(X_train_selected_p5)[:, 1])
val_auc_lgb_p5 = roc_auc_score(y_val, lgb_final_p5.predict_proba(X_val_selected_p5)[:, 1])

train_auc_xgb_p5 = roc_auc_score(y_train, xgb_final_p5.predict_proba(X_train_selected_p5)[:, 1])
val_auc_xgb_p5 = roc_auc_score(y_val, xgb_final_p5.predict_proba(X_val_selected_p5)[:, 1])

print(f"Final LightGBM - Train AUC: {train_auc_lgb_p5:.4f}, Val AUC: {val_auc_lgb_p5:.4f}")
print(f"Final XGBoost  - Train AUC: {train_auc_xgb_p5:.4f}, Val AUC: {val_auc_xgb_p5:.4f}")


Final LightGBM - Train AUC: 0.8624, Val AUC: 0.7765
Final XGBoost  - Train AUC: 0.9296, Val AUC: 0.7677
CPU times: total: 8min 2s
Wall time: 44.4 s


### Quick Evaluation (80/20 Split - Phase 4 Comparison)


In [308]:
comparison_p5 = pd.DataFrame({
    'Model': ['LightGBM', 'XGBoost'],
    'Train AUC': [train_auc_lgb_p5, train_auc_xgb_p5],
    'Val AUC': [val_auc_lgb_p5, val_auc_xgb_p5],
    'Phase4 Val AUC': [phase4_val_auc_lgb, phase4_val_auc_lgb],
    'Improvement': [val_auc_lgb_p5 - phase4_val_auc_lgb, val_auc_xgb_p5 - phase4_val_auc_lgb],
    'Train-Val Gap': [train_auc_lgb_p5 - val_auc_lgb_p5, train_auc_xgb_p5 - val_auc_xgb_p5]
})

print("\n" + "="*70)
print("QUICK EVALUATION (80/20 Split)")
print("="*70)
print(comparison_p5.to_string(index=False))
print("="*70)



QUICK EVALUATION (80/20 Split)
   Model  Train AUC  Val AUC  Phase4 Val AUC  Improvement  Train-Val Gap
LightGBM   0.862449 0.776472        0.771645     0.004827       0.085977
 XGBoost   0.929613 0.767688        0.771645    -0.003957       0.161926


### Cross-Validation (5-Fold StratifiedKFold)


In [309]:
%%time

X_full = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
y_full = pd.concat([y_train, y_val], axis=0).reset_index(drop=True)

X_train_p4_full = pd.read_csv('../../data/processed/phase4_pos_cc/X_train.csv')
X_val_p4_full = pd.read_csv('../../data/processed/phase4_pos_cc/X_val.csv')
X_full_p4 = pd.concat([X_train_p4_full, X_val_p4_full], axis=0).reset_index(drop=True)

X_full_p5 = X_full_p4.merge(inst_filtered, on='SK_ID_CURR', how='left').drop(columns=['SK_ID_CURR'])

for col in CATEGORICAL_FEATURES:
    if col in X_full_p5.columns:
        X_full_p5[col] = X_full_p5[col].map(cat_mappings[col]).fillna(-1).astype(int)

X_full_selected_p5 = X_full_p5[selected_features_p5]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgb_cv_scores_p5 = []
xgb_cv_scores_p5 = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_full_selected_p5, y_full), 1):
    X_tr, X_va = X_full_selected_p5.iloc[train_idx], X_full_selected_p5.iloc[val_idx]
    y_tr, y_va = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    lgb_cv = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=7,
        num_leaves=31,
        random_state=42,
        class_weight='balanced',
        verbose=-1
    )
    
    xgb_cv = XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=7,
        random_state=42,
        scale_pos_weight=(len(y_tr) - y_tr.sum()) / y_tr.sum(),
        eval_metric='logloss',
        early_stopping_rounds=50
    )
    
    lgb_cv.fit(X_tr, y_tr, eval_set=[(X_va, y_va)])
    xgb_cv.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    
    lgb_va_auc = roc_auc_score(y_va, lgb_cv.predict_proba(X_va)[:, 1])
    xgb_va_auc = roc_auc_score(y_va, xgb_cv.predict_proba(X_va)[:, 1])
    
    lgb_cv_scores_p5.append(lgb_va_auc)
    xgb_cv_scores_p5.append(xgb_va_auc)

print("\n" + "="*70)
print("CROSS-VALIDATION (5-Fold)")
print("="*70)
print(f"LightGBM CV Scores: {lgb_cv_scores_p5}")
print(f"LightGBM Mean: {np.mean(lgb_cv_scores_p5):.4f} ± {np.std(lgb_cv_scores_p5):.4f}")
print(f"Phase 4 LightGBM CV: {phase4_val_auc_lgb:.4f}")
print(f"Improvement: {np.mean(lgb_cv_scores_p5) - phase4_val_auc_lgb:+.4f}")
print("-"*70)
print(f"XGBoost CV Scores: {xgb_cv_scores_p5}")
print(f"XGBoost Mean: {np.mean(xgb_cv_scores_p5):.4f} ± {np.std(xgb_cv_scores_p5):.4f}")
print(f"Phase 4 XGBoost: N/A (using LightGBM baseline)")
print(f"Improvement vs LightGBM: {np.mean(xgb_cv_scores_p5) - phase4_val_auc_lgb:+.4f}")
print("="*70)



CROSS-VALIDATION (5-Fold)
LightGBM CV Scores: [0.7799068487370844, 0.7741456018045629, 0.7720889568508112, 0.7746661888004933, 0.7767181455785774]
LightGBM Mean: 0.7755 ± 0.0026
Phase 4 LightGBM CV: 0.7716
Improvement: +0.0039
----------------------------------------------------------------------
XGBoost CV Scores: [0.772775627346785, 0.7658962098824997, 0.7634166654777013, 0.7655366186151816, 0.7693677006534287]
XGBoost Mean: 0.7674 ± 0.0033
Phase 4 XGBoost: N/A (using LightGBM baseline)
Improvement vs LightGBM: -0.0042
CPU times: total: 38min 12s
Wall time: 3min 20s


### Save Processed Data


In [310]:
output_dir_p5 = '../../data/processed/phase5_installments'
os.makedirs(output_dir_p5, exist_ok=True)

train_df_p5[['SK_ID_CURR'] + selected_features_p5].to_csv(f'{output_dir_p5}/X_train.csv', index=False)
pd.DataFrame(y_train).to_csv(f'{output_dir_p5}/y_train.csv', index=False)

val_df_p5[['SK_ID_CURR'] + selected_features_p5].to_csv(f'{output_dir_p5}/X_val.csv', index=False)
pd.DataFrame(y_val).to_csv(f'{output_dir_p5}/y_val.csv', index=False)

test_df_p5[['SK_ID_CURR'] + selected_features_p5].to_csv(f'{output_dir_p5}/X_test.csv', index=False)

feature_metadata_p5 = {
    'phase': 5,
    'base_phase': 4,
    'base_features': len(phase4_selected_features),
    'inst_features_created': len(inst_cols),
    'inst_features_after_l1': len(inst_filtered.columns) - 1,
    'features_dropped_missing': inst_missing_dropped,
    'features_dropped_variance': inst_var_dropped,
    'features_dropped_correlation': inst_corr_dropped,
    'importance_threshold': importance_threshold,
    'features_selected_lgb': len(lgb_selected_p5),
    'features_selected_xgb': len(xgb_selected_p5),
    'feature_list': selected_features_p5,
    'quick_eval': {
        'lgb_train_auc': train_auc_lgb_p5,
        'lgb_val_auc': val_auc_lgb_p5,
        'xgb_train_auc': train_auc_xgb_p5,
        'xgb_val_auc': val_auc_xgb_p5,
        'phase4_baseline': phase4_val_auc_lgb
    },
    'cv_eval': {
        'lgb_cv_scores': lgb_cv_scores_p5,
        'lgb_cv_mean': np.mean(lgb_cv_scores_p5),
        'lgb_cv_std': np.std(lgb_cv_scores_p5),
        'xgb_cv_scores': xgb_cv_scores_p5,
        'xgb_cv_mean': np.mean(xgb_cv_scores_p5),
        'xgb_cv_std': np.std(xgb_cv_scores_p5)
    }
}

with open(f'{output_dir_p5}/feature_metadata.json', 'w') as f:
    json.dump(feature_metadata_p5, f, indent=2)

print(f"Processed data saved to {output_dir_p5}/")


Processed data saved to ../../data/processed/phase5_installments/


### MLflow Tracking


In [311]:
mlflow_tracking_uri = os.path.join(os.getcwd(), 'mlruns')
mlflow.set_tracking_uri(f"file:///{mlflow_tracking_uri}")

mlflow.set_experiment("feature_engineering")

with mlflow.start_run(run_name="phase5_installments_lightgbm"):
    mlflow.log_param("phase", "installments")
    mlflow.log_param("base_phase", "phase4_pos_cc")
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 7)
    mlflow.log_param("num_leaves", 31)
    mlflow.log_param("n_phase4_features", len(phase4_selected_features))
    mlflow.log_param("n_inst_features_created", len(inst_cols))
    mlflow.log_param("n_inst_features_after_l1", len(inst_filtered.columns) - 1)
    mlflow.log_param("n_features_final", len(selected_features_p5))
    mlflow.log_param("importance_threshold", importance_threshold)
    
    mlflow.log_metric("quick_train_auc", train_auc_lgb_p5)
    mlflow.log_metric("quick_val_auc", val_auc_lgb_p5)
    mlflow.log_metric("cv_mean_auc", np.mean(lgb_cv_scores_p5))
    mlflow.log_metric("cv_std_auc", np.std(lgb_cv_scores_p5))
    mlflow.log_metric("phase4_val_auc", phase4_val_auc_lgb)
    mlflow.log_metric("improvement_quick", val_auc_lgb_p5 - phase4_val_auc_lgb)
    mlflow.log_metric("improvement_cv", np.mean(lgb_cv_scores_p5) - phase4_val_auc_lgb)
    
    X_sample = X_train_selected_p5.iloc[:5].fillna(0)
    y_sample = y_train.iloc[:5]
    signature = infer_signature(X_sample, y_sample)
    
    mlflow.sklearn.log_model(lgb_final_p5, "model", signature=signature, input_example=X_sample)
    
    lgb_imp_df = get_feature_importances(lgb_final_p5, selected_features_p5)
    lgb_imp_df.to_csv('feature_importance_lgb_p5.csv', index=False)
    mlflow.log_artifact('feature_importance_lgb_p5.csv')
    os.remove('feature_importance_lgb_p5.csv')
    
    with open('selected_features_p5.json', 'w') as f:
        json.dump({'features': selected_features_p5}, f, indent=2)
    mlflow.log_artifact('selected_features_p5.json')
    os.remove('selected_features_p5.json')
    
    with open('dropped_features_p5.json', 'w') as f:
        json.dump({
            'dropped_missing': inst_missing_dropped,
            'dropped_variance': inst_var_dropped,
            'dropped_correlation': inst_corr_dropped
        }, f, indent=2)
    mlflow.log_artifact('dropped_features_p5.json')
    os.remove('dropped_features_p5.json')

print("Logged LightGBM to MLflow")

with mlflow.start_run(run_name="phase5_installments_xgboost"):
    mlflow.log_param("phase", "installments")
    mlflow.log_param("base_phase", "phase4_pos_cc")
    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 7)
    mlflow.log_param("n_phase4_features", len(phase4_selected_features))
    mlflow.log_param("n_inst_features_created", len(inst_cols))
    mlflow.log_param("n_inst_features_after_l1", len(inst_filtered.columns) - 1)
    mlflow.log_param("n_features_final", len(selected_features_p5))
    mlflow.log_param("importance_threshold", importance_threshold)
    
    mlflow.log_metric("quick_train_auc", train_auc_xgb_p5)
    mlflow.log_metric("quick_val_auc", val_auc_xgb_p5)
    mlflow.log_metric("cv_mean_auc", np.mean(xgb_cv_scores_p5))
    mlflow.log_metric("cv_std_auc", np.std(xgb_cv_scores_p5))
    mlflow.log_metric("phase4_val_auc", phase4_val_auc_lgb)
    mlflow.log_metric("improvement_quick", val_auc_xgb_p5 - phase4_val_auc_lgb)
    mlflow.log_metric("improvement_cv", np.mean(xgb_cv_scores_p5) - phase4_val_auc_lgb)
    
    X_sample = X_train_selected_p5.iloc[:5].fillna(0)
    y_sample = y_train.iloc[:5]
    signature = infer_signature(X_sample, y_sample)
    
    mlflow.sklearn.log_model(xgb_final_p5, "model", signature=signature, input_example=X_sample)
    
    xgb_imp_df = get_feature_importances(xgb_final_p5, selected_features_p5)
    xgb_imp_df.to_csv('feature_importance_xgb_p5.csv', index=False)
    mlflow.log_artifact('feature_importance_xgb_p5.csv')
    os.remove('feature_importance_xgb_p5.csv')
    
    with open('selected_features_p5.json', 'w') as f:
        json.dump({'features': selected_features_p5}, f, indent=2)
    mlflow.log_artifact('selected_features_p5.json')
    os.remove('selected_features_p5.json')
    
    with open('dropped_features_p5.json', 'w') as f:
        json.dump({
            'dropped_missing': inst_missing_dropped,
            'dropped_variance': inst_var_dropped,
            'dropped_correlation': inst_corr_dropped
        }, f, indent=2)
    mlflow.log_artifact('dropped_features_p5.json')
    os.remove('dropped_features_p5.json')

print("Logged XGBoost to MLflow")




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Logged LightGBM to MLflow




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged XGBoost to MLflow


### Save Models


In [312]:
output_model_dir_p5 = '../../models/phase5_installments'
os.makedirs(output_model_dir_p5, exist_ok=True)

model_artifacts_lgb_p5 = {
    'model': lgb_final_p5,
    'selected_features': selected_features_p5,
    'categorical_features': CATEGORICAL_FEATURES,
    'cat_mappings': cat_mappings,
    'quick_val_auc': val_auc_lgb_p5,
    'cv_mean_auc': np.mean(lgb_cv_scores_p5),
    'cv_std_auc': np.std(lgb_cv_scores_p5),
    'phase4_val_auc': phase4_val_auc_lgb,
    'improvement': val_auc_lgb_p5 - phase4_val_auc_lgb
}

joblib.dump(model_artifacts_lgb_p5, f'{output_model_dir_p5}/lightgbm_v1.pkl')
print(f"Model saved to {output_model_dir_p5}/lightgbm_v1.pkl")

model_artifacts_xgb_p5 = {
    'model': xgb_final_p5,
    'selected_features': selected_features_p5,
    'categorical_features': CATEGORICAL_FEATURES,
    'cat_mappings': cat_mappings,
    'quick_val_auc': val_auc_xgb_p5,
    'cv_mean_auc': np.mean(xgb_cv_scores_p5),
    'cv_std_auc': np.std(xgb_cv_scores_p5),
    'phase4_val_auc': phase4_val_auc_lgb,
    'improvement': val_auc_xgb_p5 - phase4_val_auc_lgb
}

joblib.dump(model_artifacts_xgb_p5, f'{output_model_dir_p5}/xgboost_v1.pkl')
print(f"Model saved to {output_model_dir_p5}/xgboost_v1.pkl")


Model saved to ../../models/phase5_installments/lightgbm_v1.pkl
Model saved to ../../models/phase5_installments/xgboost_v1.pkl


### Generate Kaggle Submissions


In [313]:
sample_submission = pd.read_csv('../../data/raw/sample_submission.csv')

lgb_preds_p5 = lgb_final_p5.predict_proba(X_test_selected_p5)[:, 1]
xgb_preds_p5 = xgb_final_p5.predict_proba(X_test_selected_p5)[:, 1]

submission_lgb_p5 = sample_submission.copy()
submission_lgb_p5['TARGET'] = lgb_preds_p5

submission_xgb_p5 = sample_submission.copy()
submission_xgb_p5['TARGET'] = xgb_preds_p5

os.makedirs('../../data/submissions', exist_ok=True)

submission_lgb_p5.to_csv('../../data/submissions/phase5_installments_lightgbm_v1.csv', index=False)
submission_xgb_p5.to_csv('../../data/submissions/phase5_installments_xgboost_v1.csv', index=False)

print("Submission files created:")
print("  phase5_installments_lightgbm_v1.csv")
print("  phase5_installments_xgboost_v1.csv")
print(f"\nLightGBM predictions - Min: {lgb_preds_p5.min():.4f}, Max: {lgb_preds_p5.max():.4f}, Mean: {lgb_preds_p5.mean():.4f}")
print(f"XGBoost predictions  - Min: {xgb_preds_p5.min():.4f}, Max: {xgb_preds_p5.max():.4f}, Mean: {xgb_preds_p5.mean():.4f}")


Submission files created:
  phase5_installments_lightgbm_v1.csv
  phase5_installments_xgboost_v1.csv

LightGBM predictions - Min: 0.0038, Max: 0.9576, Mean: 0.3527
XGBoost predictions  - Min: 0.0020, Max: 0.9587, Mean: 0.3063


### Phase 5 Summary

**Strategy:** Installments payment discipline - timing delays and payment patterns - second highest impact phase.

**Features Created:** 30 installments features (AMT_INSTALMENT, AMT_PAYMENT, DAYS_INSTALMENT, DAYS_ENTRY_PAYMENT, payment_delay, payment_diff, payment_ratio, late_payment_count/ratio). Level 1 filtering: 11 correlation drops → 19 features. Level 2 selection (LightGBM only): All 19 retained. Final: 95 features total.

**Results:** LightGBM Val 0.7765, CV 0.7755 (+0.0039 from Phase 4). XGBoost Val 0.7677, CV 0.7674. Kaggle: LightGBM Private 0.76940/Public 0.76412 (BEST), XGBoost Private 0.76028/Public 0.75618.

**Top Contributors:** inst_late_payment_count, inst_payment_delay aggregations, inst_payment_diff patterns, inst_payment_ratio metrics. Payment discipline strong signal.

**Insight:** Strong phase. Payment timing and delays highly predictive. Final model: 95 features, +0.0212 CV AUC improvement over baseline.

**Saved:** Processed data and models to phase5_installments/, tracked in MLflow, submissions generated.