In [None]:
import warnings

import numpy as np
import pandas as pd
import shap
from category_encoders import TargetEncoder
from imblearn.combine import SMOTETomek
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, f1_score, precision_recall_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

# Load dataset (replace with your file path)
df = pd.read_csv('loan_data.csv')

# --- Preprocessing ---

# Date handling: both date columns are parsed as day-month-year text.
df['DATE_OF_BIRTH'] = df['DATE_OF_BIRTH'].pipe(pd.to_datetime, format='%d-%m-%Y')
df['DISBURSAL_DATE'] = df['DISBURSAL_DATE'].pipe(pd.to_datetime, format='%d-%m-%Y')

# Borrower age in fractional years on the disbursal date (365.25 days per year).
df['AGE_AT_DISBURSAL'] = df['DISBURSAL_DATE'].sub(df['DATE_OF_BIRTH']).dt.days.div(365.25)

# Parse text to numerical
def parse_years_months(val):
    """Convert a duration such as ``'1yrs 11mon'`` into a whole number of months.

    Tokens are recognised by their suffix (``yrs`` or ``mon``), so either part
    may be absent and surrounding or repeated whitespace is tolerated.

    Args:
        val: The raw cell value; a string, or a missing value (``None``/NaN).

    Returns:
        Total months as an ``int``. Missing and empty values yield 0.

    Raises:
        ValueError: If a token has an unknown suffix or a non-integer count.
    """
    if pd.isna(val):
        return 0

    total_months = 0
    for token in str(val).split():
        if token.endswith('yrs'):
            total_months += 12 * int(token[:-len('yrs')])
        elif token.endswith('mon'):
            total_months += int(token[:-len('mon')])
        else:
            # Fail loudly instead of guessing at an unexpected format.
            raise ValueError(f"Unrecognised duration token {token!r} in {val!r}")
    return total_months

# Turn the two textual duration columns into month counts, one value at a time.
df['AVERAGE_ACCT_AGE_MONTHS'] = df['AVERAGE_ACCT_AGE'].map(parse_years_months)
df['CREDIT_HISTORY_LENGTH_MONTHS'] = df['CREDIT_HISTORY_LENGTH'].map(parse_years_months)

# Handle missing values
# A missing employment type becomes its own explicit category.
df['EMPLOYMENT_TYPE'] = df['EMPLOYMENT_TYPE'].fillna('Unknown')

# Median-impute the numeric columns, but never the label: a missing
# LOAN_DEFAULT must not be silently filled with the median class, and
# leaving the column out also keeps it from being cast to float.
numerical_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('LOAN_DEFAULT', errors='ignore')
)
imputer = SimpleImputer(strategy='median')
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])
# NOTE(review): the medians are computed on the full dataset, i.e. before any
# train/test split, and the numeric ID columns are imputed too - confirm that
# both are acceptable for this data.

# Binary feature: 1 when the bureau score is positive, else 0.
# NOTE(review): this runs after imputation, so a row whose score was missing
# is flagged according to the imputed median - confirm that is intended.
df['HAS_BUREAU_HISTORY'] = (df['PERFORM_CNS_SCORE'] > 0).astype(int)

# --- Feature Engineering ---

# Gap between sanctioned and disbursed amounts (primary and secondary accounts).
df['PRI_SANCTION_DISBURSED_DIFF'] = df['PRI_SANCTIONED_AMOUNT'].sub(df['PRI_DISBURSED_AMOUNT'])
df['SEC_SANCTION_DISBURSED_DIFF'] = df['SEC_SANCTIONED_AMOUNT'].sub(df['SEC_DISBURSED_AMOUNT'])

# Ratio features; 1e-5 is added to every denominator so that a zero
# denominator does not produce a division by zero.
df['PRI_OVERDUE_ACCTS_RATIO'] = df['PRI_OVERDUE_ACCTS'].div(df['PRI_NO_OF_ACCTS'].add(1e-5))
df['PRI_DEBT_BURDEN'] = df['PRI_CURRENT_BALANCE'].div(df['PRI_DISBURSED_AMOUNT'].add(1e-5))
df['RECENT_DELINQUENCY_RATE'] = df['DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS'].div(
    df['NEW_ACCTS_IN_LAST_SIX_MONTHS'].add(1e-5)
)

# 1 when either the primary or the secondary overdue-account count is positive.
df['HAS_OVERDUE_ACCTS'] = (
    df['PRI_OVERDUE_ACCTS'].gt(0) | df['SEC_OVERDUE_ACCTS'].gt(0)
).astype(int)

# --- Encoding ---

# One-hot encode the employment type. This uses no label information, so it is
# safe to do before the split. Reindexing guarantees that all three expected
# columns exist even when a category never occurs in the data (otherwise the
# feature selection below would raise KeyError).
employment_cols = ['EMPLOYMENT_Salaried', 'EMPLOYMENT_Self employed', 'EMPLOYMENT_Unknown']
employment_dummies = (
    pd.get_dummies(df['EMPLOYMENT_TYPE'], prefix='EMPLOYMENT', dtype=int)
    .reindex(columns=employment_cols, fill_value=0)
)
df_encoded = pd.concat([df, employment_dummies], axis=1)

# High-cardinality ID columns; these are target-encoded after the split.
target_encoded_cols = ['BRANCH_ID', 'SUPPLIER_ID', 'MANUFACTURER_ID',
                       'CURRENT_PINCODE_ID', 'STATE_ID', 'EMPLOYEE_CODE_ID']

# Feature list
features = [
    'DISBURSED_AMOUNT', 'ASSET_COST', 'LTV', 'AGE_AT_DISBURSAL',
    'PERFORM_CNS_SCORE', 'HAS_BUREAU_HISTORY', 'PRI_NO_OF_ACCTS', 'PRI_ACTIVE_ACCTS',
    'PRI_OVERDUE_ACCTS', 'PRI_CURRENT_BALANCE', 'PRI_SANCTIONED_AMOUNT', 'PRI_DISBURSED_AMOUNT',
    'SEC_NO_OF_ACCTS', 'SEC_ACTIVE_ACCTS', 'SEC_OVERDUE_ACCTS', 'SEC_CURRENT_BALANCE',
    'SEC_SANCTIONED_AMOUNT', 'SEC_DISBURSED_AMOUNT', 'PRIMARY_INSTAL_AMT', 'SEC_INSTAL_AMT',
    'NEW_ACCTS_IN_LAST_SIX_MONTHS', 'DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS',
    'AVERAGE_ACCT_AGE_MONTHS', 'CREDIT_HISTORY_LENGTH_MONTHS', 'NO_OF_INQUIRIES',
    'PRI_SANCTION_DISBURSED_DIFF', 'SEC_SANCTION_DISBURSED_DIFF', 'PRI_OVERDUE_ACCTS_RATIO',
    'PRI_DEBT_BURDEN', 'RECENT_DELINQUENCY_RATE', 'HAS_OVERDUE_ACCTS',
    *target_encoded_cols,
    *employment_cols,
]

# X still holds the raw ID values here; they are encoded after the split.
X = df_encoded[features]
y = df_encoded['LOAN_DEFAULT']

# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Target-encode the ID columns using TRAINING labels only. Fitting the encoder
# on the full dataset would let held-out labels shape the encoded features and
# inflate every test metric computed later.
encoder = TargetEncoder(cols=target_encoded_cols)
X_train = encoder.fit_transform(X_train, y_train)
X_test = encoder.transform(X_test)

# --- Scale Features ---
# Scale BEFORE resampling: SMOTE picks neighbours by distance, so on unscaled
# data the large-magnitude amount columns would dominate the neighbour search.
# The scaler is fitted on the real training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --- Handle Imbalance ---
# Resample the training set only; the test set keeps its natural class ratio.
smt = SMOTETomek(random_state=42)
X_train_res, y_train_res = smt.fit_resample(X_train_scaled, y_train)

# --- Base Models ---

# XGBoost: Bayesian hyper-parameter search scored by cross-validated ROC-AUC.
xgb = XGBClassifier(eval_metric='auc', random_state=42)
xgb_params = dict(
    n_estimators=(100, 500),
    max_depth=(3, 10),
    learning_rate=(0.01, 0.3),
    subsample=(0.6, 1.0),
    colsample_bytree=(0.6, 1.0),
)
xgb_opt = BayesSearchCV(
    estimator=xgb,
    search_spaces=xgb_params,
    n_iter=20,
    cv=StratifiedKFold(n_splits=5),
    scoring='roc_auc',
    random_state=42,
)
xgb_opt.fit(X_train_res, y_train_res)
# Keep the estimator exposed by the finished search as the tuned model.
xgb_best = xgb_opt.best_estimator_

# LightGBM: Bayesian hyper-parameter search scored by cross-validated ROC-AUC.
# Row subsampling only takes effect when the bagging frequency is non-zero, so
# subsample_freq=1 is set; otherwise the 'subsample' search dimension would be
# inert. The sklearn-API names (subsample / colsample_bytree) replace the
# native aliases bagging_fraction / feature_fraction.
lgb = LGBMClassifier(random_state=42, subsample_freq=1)
lgb_params = {'n_estimators': (100, 500), 'num_leaves': (20, 100), 'learning_rate': (0.01, 0.3),
              'subsample': (0.6, 1.0), 'colsample_bytree': (0.6, 1.0)}
lgb_opt = BayesSearchCV(lgb, lgb_params, n_iter=20, cv=StratifiedKFold(5), scoring='roc_auc', random_state=42)
lgb_opt.fit(X_train_res, y_train_res)
lgb_best = lgb_opt.best_estimator_

# Neural Network
def create_nn(input_dim=None):
    """Build and compile the feed-forward binary classifier.

    Architecture: Dense(64, relu) -> Dropout(0.3) -> Dense(32, relu) ->
    Dropout(0.2) -> Dense(1, sigmoid), compiled with Adam, binary
    cross-entropy loss and an AUC metric.

    Args:
        input_dim: Number of input features. When omitted, the width of the
            module-level ``X_train_res`` is used, so existing ``create_nn()``
            calls behave as before.

    Returns:
        A compiled, untrained Keras ``Sequential`` model.
    """
    if input_dim is None:
        # Backward-compatible default: size the input from the training matrix.
        input_dim = X_train_res.shape[1]

    model = Sequential([
        Dense(64, activation='relu', input_dim=input_dim),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model

# Build the network and train it silently on the resampled training data.
nn = create_nn()
nn.fit(
    x=X_train_res,
    y=y_train_res,
    batch_size=32,
    epochs=20,
    verbose=0,
)

# --- Stacking ---

# The meta-learner must be trained on OUT-OF-FOLD base-model predictions.
# Predictions made on rows a model was fitted on are over-optimistic, so a
# meta-learner trained on them learns to trust the most overfit base model.
# Cost: each base model is refitted once per fold.
oof_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X_stack = np.asarray(X_train_res)
y_stack = np.asarray(y_train_res)

# cross_val_predict clones each tuned estimator and refits it per fold.
xgb_pred_train = cross_val_predict(xgb_best, X_stack, y_stack, cv=oof_cv, method='predict_proba')[:, 1]
lgb_pred_train = cross_val_predict(lgb_best, X_stack, y_stack, cv=oof_cv, method='predict_proba')[:, 1]

# The Keras model is not a scikit-learn estimator, so its out-of-fold
# predictions are produced with an explicit loop over the same folds.
nn_pred_train = np.zeros(len(y_stack))
for fit_idx, holdout_idx in oof_cv.split(X_stack, y_stack):
    fold_nn = create_nn()
    fold_nn.fit(X_stack[fit_idx], y_stack[fit_idx], epochs=20, batch_size=32, verbose=0)
    nn_pred_train[holdout_idx] = fold_nn.predict(X_stack[holdout_idx], verbose=0).flatten()

# Test-set predictions come from the base models fitted on the full training set.
xgb_pred_test = xgb_best.predict_proba(X_test)[:, 1]
lgb_pred_test = lgb_best.predict_proba(X_test)[:, 1]
nn_pred_test = nn.predict(X_test).flatten()

# Stack predictions: one column per base model, in the same order for both sets.
stacked_train = np.column_stack((xgb_pred_train, lgb_pred_train, nn_pred_train))
stacked_test = np.column_stack((xgb_pred_test, lgb_pred_test, nn_pred_test))

# Meta-learner
meta_learner = LogisticRegression()
meta_learner.fit(stacked_train, y_stack)
final_pred_proba = meta_learner.predict_proba(stacked_test)[:, 1]
# Hard labels use a fixed 0.5 probability cut-off.
final_pred = (final_pred_proba >= 0.5).astype(int)

# --- Evaluation ---

# ROC-AUC of the stacked probabilities on the held-out test set.
roc_auc = roc_auc_score(y_test, final_pred_proba)

# Area under the precision-recall curve. `auc` comes from sklearn.metrics and
# must be imported at the top of the file, otherwise this line raises NameError.
precision, recall, _ = precision_recall_curve(y_test, final_pred_proba)
pr_auc = auc(recall, precision)

# F1 is computed from the hard 0/1 predictions, not from the probabilities.
f1 = f1_score(y_test, final_pred)

print(f"ROC-AUC: {roc_auc:.4f}")
print(f"PR-AUC: {pr_auc:.4f}")
print(f"F1-Score: {f1:.4f}")

# --- Interpretability ---

# Explain the tuned XGBoost model on a 100-row subset of the test matrix.
# TreeExplainer is the explainer built for tree ensembles; KernelExplainer is
# a model-agnostic sampler and is far slower on the same rows.
shap_sample = X_test[:100]
explainer = shap.TreeExplainer(xgb_best)
# For a binary XGBoost classifier this yields one (rows x features) attribution
# matrix, so no class index is applied. Indexing with [1] would select a single
# sample whenever the explainer returns an array rather than a per-class list.
shap_values = explainer.shap_values(shap_sample)
shap.summary_plot(shap_values, shap_sample, feature_names=features)