In [11]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

DATA = Path("../ml_data")
accepted = pd.read_csv(DATA / "accepted_2007_to_2018Q4.csv")
rejected = pd.read_csv(DATA / "rejected_2007_to_2018Q4 2.csv")
aus_cols = [f"A{i}" for i in range(1,15)] + ["Class"]
aus_raw = pd.read_csv(DATA / "australian.csv", header=None, names=aus_cols)

common_cols = {
    "annual_inc":  "annual_inc",
    "fico_range_high": "fico_high",
    "dti": "dti",
    "emp_length": "emp_length",
    "purpose": "purpose"
}
acc_df = (accepted
          .rename(columns=common_cols)
          .assign(approved=1)
          [list(common_cols.values()) + ["approved"]])
rej_df = (rejected
          .rename(columns=common_cols)
          .assign(approved=0)
          [list(common_cols.values()) + ["approved"]])
aus_df = pd.DataFrame({
    "annual_inc":  aus_raw["A14"],
    "fico_high":   aus_raw["A2"] * 12 + 300,
    "dti":         aus_raw["A9"],
    "emp_length":  aus_raw["A8"],
    "purpose":     np.nan,
    "approved":    aus_raw["Class"].map({"+":1, "-":0})
})
app_df = pd.concat([acc_df, rej_df, aus_df], ignore_index=True)

app_df["inc_missing"]  = app_df["annual_inc"].isna().astype(int)
app_df["fico_missing"] = app_df["fico_high"].isna().astype(int)

app_df["annual_inc"] = app_df["annual_inc"].fillna(app_df["annual_inc"].median())
app_df["fico_high"]  = app_df["fico_high"].fillna(app_df["fico_high"].median())
app_df["dti"] = (
    app_df["dti"].astype(str)
          .str.rstrip("%")
          .replace("", np.nan)
          .astype(float)
)
raw_emp = app_df.get("emp_length", app_df.get("employment_length", pd.Series(0, index=app_df.index)))
app_df["emp_length_num"] = (
    raw_emp.astype(str)
           .str.extract(r"(\d+)")
           .iloc[:,0]
           .astype(float)
           .fillna(0)
)
pos = app_df[app_df.approved == 1]
neg = app_df[app_df.approved == 0].sample(n=100_000, random_state=42)
small_df = pd.concat([pos, neg], ignore_index=True).sample(frac=1, random_state=42)

features = ["annual_inc", "fico_high", "dti", "emp_length_num", "inc_missing", "fico_missing"]
X = small_df[features]
y = small_df["approved"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
cv_scores = cross_val_score(clf, X, y, cv=5, scoring="roc_auc")

print(f"Hold‑out AUC: {auc:.3f}")
print("5‑fold CV AUC:", cv_scores, "mean:", cv_scores.mean().round(3))
print("Feature importances:")
print(pd.Series(clf.feature_importances_, index=features).sort_values(ascending=False))




  accepted = pd.read_csv(DATA / "accepted_2007_to_2018Q4.csv")


KeyError: "['annual_inc', 'fico_high', 'dti', 'emp_length', 'purpose'] not in index"