In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# -----------------------------
# 1) Load application_train.csv
# -----------------------------
root = Path.cwd().parent

path = root / "data" / "interim" / "application_train.csv"

df = pd.read_csv(path) # adjust if needed

print("Loaded:", df.shape)
print("Default rate:", df["TARGET"].mean())

# -------------------------------------------
# 2) Basic cleaning / sentinel value handling
# -------------------------------------------
# Sentinel in DAYS_EMPLOYED
df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].replace(365243, np.nan)

# Optional: create age years for Tableau (DAYS_BIRTH is negative)
df["AGE_YEARS"] = (-df["DAYS_BIRTH"] / 365.25).round(1)

# LTV feature (handle divide by zero)
df["LTV"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"].replace(0, np.nan)

# -------------------------------------------
# 3) Drop high missingness columns (50%+)
# -------------------------------------------
missing_rate = df.isna().mean()
drop_cols = missing_rate[missing_rate >= 0.50].index.tolist()

# Never drop TARGET; keep identifier if present
drop_cols = [c for c in drop_cols if c not in ["TARGET", "SK_ID_CURR"]]

df_model = df.drop(columns=drop_cols)
print("After dropping high-missing cols:", df_model.shape)

# -------------------------------------------
# 4) Train/validation split (stratified)
# -------------------------------------------
y = df_model["TARGET"].astype(int)
X = df_model.drop(columns=["TARGET"])

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.20,
    random_state=69,
    stratify=y
)

print("Train:", X_train.shape, "Valid:", X_valid.shape)
print("Train default rate:", y_train.mean(), "Valid default rate:", y_valid.mean())

# -------------------------------------------
# 5) Preprocessing pipeline (fit on train only)
# -------------------------------------------
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_cols),
        ("cat", cat_pipe, cat_cols)
    ],
    remainder="drop"
)

# -------------------------------------------
# 6) Train logistic regression
# -------------------------------------------
clf = LogisticRegression(
    max_iter=2000,
    solver="lbfgs",
    class_weight="balanced"
)

model = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", clf)
])

model.fit(X_train, y_train)

# -------------------------------------------
# 7) Score ALL applicants (train + validation)
# -------------------------------------------
all_proba = model.predict_proba(X)[:, 1]

# -------------------------------------------
# 8) Build Tableau dataset (ALL rows)
# -------------------------------------------
tableau_df = X.copy()
tableau_df["TARGET"] = y.values
tableau_df["predicted_probability"] = all_proba

# Baseline default rate (entire dataset)
baseline_default_rate = tableau_df["TARGET"].mean()
tableau_df["baseline_default_rate"] = baseline_default_rate

# -------------------------------------------
# Risk ranking (policy-based, not threshold)
# -------------------------------------------
tableau_df = tableau_df.sort_values("predicted_probability", ascending=False)

tableau_df["risk_percentile"] = (
    tableau_df["predicted_probability"]
    .rank(pct=True)
)

# Policy cutoff: Top 10% highest risk
cutoff = 0.90
tableau_df["risk_group"] = np.where(
    tableau_df["risk_percentile"] >= cutoff,
    "High Risk (Top 10%)",
    "Lower Risk (Bottom 90%)"
)

tableau_df["approved_flag"] = np.where(
    tableau_df["risk_group"] == "High Risk (Top 10%)",
    0,
    1
)

# Model-assisted default rate among approved applicants
model_default_rate = (
    tableau_df.loc[tableau_df["approved_flag"] == 1, "TARGET"]
    .mean()
)

tableau_df["model_default_rate"] = model_default_rate

# -------------------------------------------
# 9) Select Tableau-facing columns only
# -------------------------------------------
keep_cols = [
    "SK_ID_CURR",
    "TARGET",
    "predicted_probability",
    "risk_percentile",
    "risk_group",
    "approved_flag",
    "baseline_default_rate",
    "model_default_rate",
    # Business-facing features
    "AMT_CREDIT",
    "AMT_GOODS_PRICE",
    "LTV",
    "AMT_INCOME_TOTAL",
    "AGE_YEARS",
    "DAYS_EMPLOYED",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "ORGANIZATION_TYPE"
]

# Keep only columns that actually exist
keep_cols = [c for c in keep_cols if c in tableau_df.columns]
tableau_df = tableau_df[keep_cols]

# -------------------------------------------
# 10) Export
# -------------------------------------------
out_path = "reports/tableau_dataset.csv"
tableau_df.to_csv(out_path, index=False)

print("Saved Tableau dataset:", out_path)
print("Rows:", tableau_df.shape[0])
print("Baseline default rate:", baseline_default_rate)
print("Model-assisted default rate:", model_default_rate)

Loaded: (307511, 122)
Default rate: 0.08072881945686496
After dropping high-missing cols: (307511, 83)
Train: (246008, 82) Valid: (61503, 82)
Train default rate: 0.08072908198107379 Valid default rate: 0.08072776937710356
Saved Tableau dataset: reports/tableau_dataset.csv
Rows: 307511
Baseline default rate: 0.08072881945686496
Model-assisted default rate: 0.06116512922795645
