In [None]:
# --- Common imports and paths (add this as Cell 1 in every notebook) ---
import os
import pandas as pd
import numpy as np
import torch
import joblib

# Paths (same as your main project)
PROCESSED_CSV = "./data/processed/processed_sample_200k.csv"
MODEL_PTH = "./models/torch_mlp.pth"
PREPROCESSOR_JOBLIB = "./models/preprocessor.joblib"
METRICS_JOBLIB = "./models/supervised_metrics.joblib"
DISAGREE_CSV = "./analysis/policy_disagreements_sample.csv"


In [12]:
# 5 - Optional offline RL with d3rlpy (CQL) - heavy; skip if d3rlpy not installed
try:
    from d3rlpy.algos import CQL
    from d3rlpy.dataset import MDPDataset
    print("d3rlpy available -> training CQL (this can be long).")
    # Build RL dataset: states = features, actions = 1 (approved) for every row in accepted dataset,
    # rewards computed by loan_amnt_orig & int_rate_orig if present, terminal=True for single-step
    proc = pd.read_csv(PROCESSED_CSV)
    feature_cols = [c for c in proc.columns if c not in ('target','loan_amnt_orig','int_rate_orig')]
    states = proc[feature_cols].values.astype(np.float32)
    actions = np.ones((len(proc),1), dtype=np.int32)  # historical = approve
    # compute reward: if paid (target==0): +loan_amnt * int_rate ; if default: -loan_amnt
    if 'loan_amnt_orig' in proc.columns and 'int_rate_orig' in proc.columns:
        loan_amnt = pd.to_numeric(proc['loan_amnt_orig'], errors='coerce').fillna(0).values.astype(np.float32)
        int_rate = pd.to_numeric(proc['int_rate_orig'], errors='coerce').fillna(0).values.astype(np.float32)/100.0
        rewards = np.where(proc['target'].values==0, loan_amnt * int_rate, -loan_amnt).astype(np.float32)
    else:
        rewards = np.where(proc['target'].values==0, 1.0, -1.0).astype(np.float32)

    terminals = np.ones(len(states), dtype=bool)
    next_states = states.copy()

    dataset = MDPDataset(observations=states, actions=actions, rewards=rewards, terminals=terminals, next_observations=next_states)

    # train/test split using indices (small train for prototype)
    n = len(states)
    idx = np.arange(n)
    train_idx = rng.choice(idx, size=int(0.9*n), replace=False)
    valid_idx = np.setdiff1d(idx, train_idx)
    train_dataset = dataset[train_idx]
    valid_dataset = dataset[valid_idx]

    cql = CQL(batch_size=256, actor_learning_rate=1e-4, critic_learning_rate=1e-4)
    cql.fit(train_dataset, n_steps=2000, eval_episodes=0, verbose=1)  # small n_steps for prototype
    cql.save_model("./models/cql_model")
    print("Saved CQL model")
except Exception as e:
    print("Skipping RL training (d3rlpy not available or failed). Error:", e)


Skipping RL training (d3rlpy not available or failed). Error: No module named 'd3rlpy'


In [21]:
# 4) Estimate financial return (policy value) for different policies
import numpy as np
import pandas as pd

# Load predictions
proc = pd.read_csv("./analysis/supervised_predictions.csv")

# Make sure required columns exist
has_money = ('loan_amnt_orig' in proc.columns) and ('int_rate_orig' in proc.columns)

if has_money:
    # Convert to numeric and clean
    loan_amnt = pd.to_numeric(proc['loan_amnt_orig'], errors='coerce').fillna(0).values
    int_rate = pd.to_numeric(proc['int_rate_orig'], errors='coerce').fillna(0).values / 100.0
else:
    # Fallback unit rewards if originals missing
    loan_amnt = np.ones(len(proc))
    int_rate = np.ones(len(proc))

# Reward function
def compute_policy_value(actions):
    """
    actions: 1 = approve, 0 = deny
    approve & paid (target==0)  -> +loan_amnt * int_rate
    approve & default (target==1)-> -loan_amnt
    deny -> 0
    """
    rewards = np.zeros(len(actions), dtype=float)
    approve_idx = (actions == 1)
    paid_idx = (proc['target'].values == 0)
    default_idx = (proc['target'].values == 1)

    idx_paid = approve_idx & paid_idx
    idx_def = approve_idx & default_idx

    rewards[idx_paid] = loan_amnt[idx_paid] * int_rate[idx_paid]
    rewards[idx_def] = -loan_amnt[idx_def]

    return rewards.mean(), rewards.sum()

# Load predictions and best threshold from previous cell
probs = proc['pred_prob'].values
y = proc['target'].values

# Get best threshold (already found in previous step)
# If you re-run notebook separately, compute best_t again
thresholds = np.linspace(0.0, 1.0, 101)
from sklearn.metrics import f1_score
f1s = [f1_score(y, (probs >= t).astype(int), zero_division=0) for t in thresholds]
best_t = thresholds[int(np.argmax(f1s))]
print(f"Best threshold by F1 re-evaluated: {best_t:.2f}")

# Define all policy actions
best_actions = (probs >= best_t).astype(int)
actions_05 = (probs >= 0.5).astype(int)
actions_03 = (probs >= 0.3).astype(int)
actions_deny = np.zeros(len(proc), dtype=int)
actions_approve = np.ones(len(proc), dtype=int)

# Compute average and total reward for each
best_mean, best_sum = compute_policy_value(best_actions)
mean_05, sum_05 = compute_policy_value(actions_05)
mean_03, sum_03 = compute_policy_value(actions_03)
mean_deny, sum_deny = compute_policy_value(actions_deny)
mean_approve, sum_approve = compute_policy_value(actions_approve)

# Ensure all are floats (avoid numpy formatting errors)
best_mean, best_sum = float(best_mean), float(best_sum)
mean_05, sum_05 = float(mean_05), float(sum_05)
mean_03, sum_03 = float(mean_03), float(sum_03)
mean_deny, sum_deny = float(mean_deny), float(sum_deny)
mean_approve, sum_approve = float(mean_approve), float(sum_approve)

# Print clean results
print("\nPolicy average reward per decision (mean) and total reward over sample (sum):")
print(f"Supervised (best-F1 t={best_t:.2f}): mean={best_mean:.4f}  sum={best_sum:.2f}")
print(f"Supervised (t=0.5): mean={mean_05:.4f}  sum={sum_05:.2f}")
print(f"Supervised (t=0.3): mean={mean_03:.4f}  sum={sum_03:.2f}")
print(f"Deny all: mean={mean_deny:.4f}  sum={sum_deny:.2f}")
print(f"Approve all: mean={mean_approve:.4f}  sum={sum_approve:.2f}")


Best threshold by F1 re-evaluated: 0.19

Policy average reward per decision (mean) and total reward over sample (sum):
Supervised (best-F1 t=0.19): mean=-1100.8079  sum=-220161584.99
Supervised (t=0.5): mean=-6.7567  sum=-1351337.70
Supervised (t=0.3): mean=-457.4891  sum=-91497818.58
Deny all: mean=0.0000  sum=0.00
Approve all: mean=-1291.7053  sum=-258341065.94


In [22]:
# Save policies and disagreement examples
proc['action_bestF1'] = best_actions
proc['action_05'] = actions_05
proc['action_03'] = actions_03

# If you trained RL or created a baseline RL policy, load/compute rl_actions here (example uses t=0.3 baseline)
rl_actions = actions_03  # replace with your RL policy if available

# disagreements between supervised best-F1 policy and RL-baseline
dis_idx = np.where(proc['action_bestF1'].values != rl_actions)[0]
print("Disagreements count:", len(dis_idx))
proc.iloc[dis_idx[:200]].to_csv("./analysis/disagreements_sample.csv", index=False)
print("Saved sample disagreements to ./analysis/disagreements_sample.csv")


Disagreements count: 56413
Saved sample disagreements to ./analysis/disagreements_sample.csv
