Predict the likelihood of a loan default and optimize the decision threshold based on
cost-benefit analysis.

In [19]:
# Loan Default Risk + Cost-based Threshold Optimization
# Run in a Jupyter notebook or .py script (must have scikit-learn, pandas, numpy, matplotlib).
# CatBoost is attempted but if not installed the code will fall back to RandomForest.

# Cell 1: imports and config
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, roc_curve,
    confusion_matrix, classification_report
)
import matplotlib.pyplot as plt

# optional: try CatBoost
USE_CATBOOST = True
try:
    from catboost import CatBoostClassifier, Pool
except Exception:
    USE_CATBOOST = False
    print("CatBoost not available, falling back to RandomForest for tree model.")

# reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Cell 2: load data
DATA_PATH = r"C:\Users\SR Laptop\Desktop\HC_application_train.csv"  
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Put Home Credit CSV in this folder and set DATA_PATH correctly. Missing: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
print("Data loaded. Shape:", df.shape)
display(df.head())

# Typical target column in Home Credit dataset is 'TARGET' (1 = default, 0 = not)
if "TARGET" not in df.columns:
    raise ValueError("Expected a 'TARGET' column in the dataset.")

# Cell 3: small EDA
print("Target distribution:")
print(df['TARGET'].value_counts(normalize=True))
print("\nMissing value percent per column (top 30):")
missing = (df.isna().mean() * 100).sort_values(ascending=False)
display(missing.head(30))

# Cell 4: simple feature selection / reduce cardinality
# We'll select:
# - numeric features (low missing)
# - categorical features with limited cardinality
NUMERIC_MISSING_THRESHOLD = 0.6   # drop numeric columns with >60% missing
CAT_CARDINALITY_LIMIT = 50        # drop categorical columns with >50 unique values

num_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
num_cols.remove("TARGET") if "TARGET" in num_cols else None

# Filter numerics by missingness
num_keep = [c for c in num_cols if df[c].isna().mean() < NUMERIC_MISSING_THRESHOLD]

cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
cat_keep = [c for c in cat_cols if df[c].nunique() <= CAT_CARDINALITY_LIMIT]

print(f"Keeping {len(num_keep)} numeric features and {len(cat_keep)} categorical features.")

# Quick feature subset
features = num_keep + cat_keep
X = df[features].copy()
y = df["TARGET"].copy()

# Cell 5: basic preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])


preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, num_keep),
    ("cat", categorical_transformer, cat_keep)
], remainder="drop")

# Fit transform to get shape & feature names (useful for feature importance mapping)
X_transformed = preprocessor.fit_transform(X)
# create feature names for one-hot columns
ohe_cols = []
if len(cat_keep) > 0:
    ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
    cat_feature_names = ohe.get_feature_names_out(cat_keep)
    feature_names = num_keep + cat_feature_names.tolist()
else:
    feature_names = num_keep

print("Transformed feature matrix shape:", X_transformed.shape)

# Cell 6: train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y.values, test_size=0.2, stratify=y.values, random_state=RANDOM_STATE
)
print("Train/test shapes:", X_train.shape, X_test.shape)

# Cell 7: train models (Logistic + Tree)
# Logistic Regression pipeline
log_pipe = Pipeline([
    ("clf", LogisticRegression(max_iter=2000, random_state=RANDOM_STATE))
])
log_pipe.fit(X_train, y_train)
y_proba_log = log_pipe.predict_proba(X_test)[:, 1]
print("Logistic AUC:", roc_auc_score(y_test, y_proba_log))

# Tree model: CatBoost if available (handles categorical natively but we already one-hot encoded),
# Otherwise RandomForest
if USE_CATBOOST:
    # CatBoost often wants raw, but we've preprocessed; still ok to use.
    cat_model = CatBoostClassifier(
        iterations=1000, learning_rate=0.05, depth=6,
        eval_metric="AUC", random_seed=RANDOM_STATE, verbose=0
    )
    cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, verbose=False)
    y_proba_tree = cat_model.predict_proba(X_test)[:, 1]
    print("CatBoost AUC:", roc_auc_score(y_test, y_proba_tree))
else:
    rf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=RANDOM_STATE, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_proba_tree = rf.predict_proba(X_test)[:, 1]
    print("RandomForest AUC:", roc_auc_score(y_test, y_proba_tree))

# Choose one model to optimize threshold for; we can do both and compare cost.
models = {
    "logistic": y_proba_log,
    "tree": y_proba_tree
}

# Cell 8: business cost analysis & threshold optimization
# Business interpretation:
# - If we *accept* a borrower (predict non-default) who actually later defaults => False Negative (FN)
#   - Cost = expected credit loss (e.g., exposure * loss_given_default)
# - If we *reject* a borrower (predict default) who would have repaid => False Positive (FP)
#   - Cost = lost profit (e.g., net interest revenue + fees we'd have earned) OR cost of rejecting good business

# Example baseline cost values (you must tune these to your business)
# These are example per-customer expected costs:
COST_FN = 20000.0   # cost when a default happens but we predicted GOOD (i.e., FN). e.g., loss of principal + fees
COST_FP = 1000.0    # cost when we incorrectly decline a good customer (lost margin / opportunity cost)

print(f"Example costs: COST_FP = {COST_FP}, COST_FN = {COST_FN}. (Adjust for your business.)")

def compute_cost_at_threshold(y_true, y_proba, threshold, cost_fp=COST_FP, cost_fn=COST_FN):
    y_pred = (y_proba >= threshold).astype(int)  # predict 1 = default
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    total_cost = fp * cost_fp + fn * cost_fn
    return {"threshold": threshold, "tn": tn, "fp": fp, "fn": fn, "tp": tp, "total_cost": total_cost}

def find_best_threshold(y_true, y_proba, cost_fp=COST_FP, cost_fn=COST_FN, n_steps=1000):
    thresholds = np.linspace(0, 1, n_steps)
    results = []
    for t in thresholds:
        results.append(compute_cost_at_threshold(y_true, y_proba, t, cost_fp, cost_fn))
    df_costs = pd.DataFrame(results)
    best = df_costs.loc[df_costs["total_cost"].idxmin()]
    return df_costs, best

# Compute for each model and show best threshold
best_results = {}
for name, y_proba in models.items():
    costs_df, best = find_best_threshold(y_test, y_proba, cost_fp=COST_FP, cost_fn=COST_FN, n_steps=1001)
    best_results[name] = {"best": best, "costs_df": costs_df}
    print(f"\nModel: {name}")
    print("Best threshold:", best['threshold'])
    print("Confusion (tn, fp, fn, tp):", (int(best['tn']), int(best['fp']), int(best['fn']), int(best['tp'])))
    print("Min total cost:", best['total_cost'])

# Cell 9: visualize cost vs threshold for chosen model (e.g., tree)
model_to_plot = "tree"
costs_df = best_results[model_to_plot]["costs_df"]

plt.figure(figsize=(8,5))
plt.plot(costs_df["threshold"], costs_df["total_cost"])
plt.xlabel("Threshold (predict default if prob >= threshold)")
plt.ylabel("Total business cost (sum of FP*C_FP + FN*C_FN)")
plt.title(f"Cost vs Threshold ({model_to_plot})")
plt.grid(True)
plt.show()

# Also plot ROC and show threshold on ROC (approx)
fpr, tpr, roc_th = roc_curve(y_test, models[model_to_plot])
plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc_score(y_test, models[model_to_plot]):.4f}")
plt.plot([0,1],[0,1],'k--', linewidth=0.6)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title(f"ROC Curve ({model_to_plot})")
plt.legend()
plt.show()

# Cell 10: Evaluate at chosen threshold
best_threshold = float(best_results[model_to_plot]["best"]["threshold"])
y_pred_chosen = (models[model_to_plot] >= best_threshold).astype(int)
print("Classification report at chosen threshold:")
print(classification_report(y_test, y_pred_chosen))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_chosen).ravel()
print("Confusion matrix:", (tn, fp, fn, tp))
print("Total business cost at chosen threshold:", compute_cost_at_threshold(y_test, models[model_to_plot], best_threshold)['total_cost'])

# Cell 11: Feature importance
if USE_CATBOOST:
    try:
        # cat_model trained earlier on preprocessed numeric matrix; direct feature importance:
        imp = cat_model.get_feature_importance()
        fi = pd.Series(imp, index=feature_names).sort_values(ascending=False).head(30)
        display(fi)
    except Exception as e:
        print("Could not get CatBoost feature importances:", e)
else:
    try:
        fi = pd.Series(rf.feature_importances_, index=feature_names).sort_values(ascending=False).head(30)
        display(fi)
    except Exception as e:
        print("Could not get RF importances:", e)

# Cell 12: Sensitivity analysis for costs
# Try a grid of COST_FP, COST_FN to see threshold shifts and expected cost
fp_vals = [500, 1000, 2500]
fn_vals = [5000, 10000, 20000]
sensitivity = []
for fp_cost in fp_vals:
    for fn_cost in fn_vals:
        df_costs, best = find_best_threshold(y_test, models[model_to_plot], cost_fp=fp_cost, cost_fn=fn_cost, n_steps=501)
        sensitivity.append({
            "cost_fp": fp_cost,
            "cost_fn": fn_cost,
            "best_threshold": best["threshold"],
            "min_total_cost": best["total_cost"],
            "fp": int(best["fp"]),
            "fn": int(best["fn"])
        })
sensitivity_df = pd.DataFrame(sensitivity)
display(sensitivity_df.sort_values("best_threshold"))

# Save results for further analysis
sensitivity_df.to_csv("threshold_sensitivity_results.csv", index=False)
print("Saved sensitivity results to threshold_sensitivity_results.csv")


CatBoost not available, falling back to RandomForest for tree model.
Data loaded. Shape: (307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


Target distribution:
TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64

Missing value percent per column (top 30):


COMMONAREA_AVG              69.872297
COMMONAREA_MODE             69.872297
COMMONAREA_MEDI             69.872297
NONLIVINGAPARTMENTS_MEDI    69.432963
NONLIVINGAPARTMENTS_MODE    69.432963
NONLIVINGAPARTMENTS_AVG     69.432963
FONDKAPREMONT_MODE          68.386172
LIVINGAPARTMENTS_AVG        68.354953
LIVINGAPARTMENTS_MEDI       68.354953
LIVINGAPARTMENTS_MODE       68.354953
FLOORSMIN_MODE              67.848630
FLOORSMIN_AVG               67.848630
FLOORSMIN_MEDI              67.848630
YEARS_BUILD_AVG             66.497784
YEARS_BUILD_MODE            66.497784
YEARS_BUILD_MEDI            66.497784
OWN_CAR_AGE                 65.990810
LANDAREA_MEDI               59.376738
LANDAREA_AVG                59.376738
LANDAREA_MODE               59.376738
BASEMENTAREA_MODE           58.515956
BASEMENTAREA_MEDI           58.515956
BASEMENTAREA_AVG            58.515956
EXT_SOURCE_1                56.381073
NONLIVINGAREA_MODE          55.179164
NONLIVINGAREA_AVG           55.179164
NONLIVINGARE

Keeping 89 numeric features and 15 categorical features.
Transformed feature matrix shape: (307511, 177)
Train/test shapes: (246008, 177) (61503, 177)
Logistic AUC: 0.7476402310602747
RandomForest AUC: 0.7402539343197494
Example costs: COST_FP = 1000.0, COST_FN = 20000.0. (Adjust for your business.)

Model: logistic
Best threshold: 0.052000000000000005
Confusion (tn, fp, fn, tp): (27742, 28796, 814, 4151)
Min total cost: 45076000.0

Model: tree
Best threshold: 0.066
Confusion (tn, fp, fn, tp): (27107, 29431, 836, 4129)
Min total cost: 46151000.0
Classification report at chosen threshold:
              precision    recall  f1-score   support

           0       0.97      0.48      0.64     56538
           1       0.12      0.83      0.21      4965

    accuracy                           0.51     61503
   macro avg       0.55      0.66      0.43     61503
weighted avg       0.90      0.51      0.61     61503

Confusion matrix: (np.int64(27107), np.int64(29431), np.int64(836), np.int64(4

  plt.show()
  plt.show()


EXT_SOURCE_2                                         0.159513
EXT_SOURCE_3                                         0.157909
EXT_SOURCE_1                                         0.056211
DAYS_BIRTH                                           0.036434
DAYS_EMPLOYED                                        0.029492
AMT_GOODS_PRICE                                      0.023282
DAYS_LAST_PHONE_CHANGE                               0.022094
DAYS_ID_PUBLISH                                      0.020975
AMT_CREDIT                                           0.019036
DAYS_REGISTRATION                                    0.016968
AMT_ANNUITY                                          0.015699
NAME_EDUCATION_TYPE_Higher education                 0.013126
SK_ID_CURR                                           0.012429
REGION_POPULATION_RELATIVE                           0.011617
AMT_INCOME_TOTAL                                     0.011464
CODE_GENDER_M                                        0.010788
CODE_GEN

Unnamed: 0,cost_fp,cost_fn,best_threshold,min_total_cost,fp,fn
2,500,20000,0.042,27013500.0,50267,94
1,500,10000,0.066,23075500.0,29431,836
5,1000,20000,0.066,46151000.0,29431,836
0,500,5000,0.096,16982500.0,13415,2055
4,1000,10000,0.096,33965000.0,13415,2055
8,2500,20000,0.102,74192500.0,11421,2282
3,1000,5000,0.13,21282000.0,5227,3211
7,2500,10000,0.15,44405000.0,2878,3721
6,2500,5000,0.196,24275000.0,606,4552


Saved sensitivity results to threshold_sensitivity_results.csv
