In [15]:
import os, numpy as np, pandas as pd

print("Files here:", os.listdir())

data = np.load("offline_rl_dataset.npz")
states = data["states"]
actions = data["actions"]
next_states = data["next_states"]
dones = data["dones"]
print("Loaded npz shapes: states", states.shape, "actions", actions.shape)


Files here: ['.ipynb_checkpoints', 'accepted_2007_to_2018Q4.csv.gz', 'dl_predictions.csv', 'final_dl_model.pth', 'final_predictions.csv', 'final_preprocessor.pkl', 'final_preprocessor_fitted.pkl', 'final_xgb_model.pkl', 'NEW_1_Preprocessing.ipynb', 'OFFLINE RL ENVIRONMENT.ipynb', 'offline_rl_dataset.ipynb', 'offline_rl_dataset.npz', 'sample_200k.csv', 'xgb_model.pkl', '_Deep_Learning_Model.ipynb']
Loaded npz shapes: states (200000, 149) actions (200000,)


In [16]:
# load CSV (low_memory=False to avoid DtypeWarning)
df = pd.read_csv("sample_200k.csv", low_memory=False)
print("Loaded df shape:", df.shape)
print("Columns (first 25):", list(df.columns)[:25])


Loaded df shape: (200000, 151)
Columns (first 25): ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti']


In [17]:
# create binary target if missing
if 'target' not in df.columns:
    df['target'] = df['loan_status'].apply(lambda x: 1 if str(x) in ["Charged Off", "Default"] else 0)

# convert numeric columns safely and fill missing values
df['loan_amnt'] = pd.to_numeric(df['loan_amnt'], errors='coerce').fillna(0.0)
df['int_rate']  = pd.to_numeric(df['int_rate'], errors='coerce').fillna(0.0)

print("target distribution (counts):\n", df['target'].value_counts())
print("loan_amnt: min/max", df['loan_amnt'].min(), df['loan_amnt'].max())
print("int_rate: min/max", df['int_rate'].min(), df['int_rate'].max())


target distribution (counts):
 target
0    173134
1     26866
Name: count, dtype: int64
loan_amnt: min/max 0.0 40000.0
int_rate: min/max 0.0 30.99


In [18]:
if df['int_rate'].max() > 1.0:
    print("Detected int_rate likely in percent (e.g. 13.5). Converting by /100.")
    df['int_rate'] = df['int_rate'] / 100.0

df['profit_if_paid'] = df['loan_amnt'] * df['int_rate']

df['reward'] = df.apply(lambda r: -r['loan_amnt'] if r['target']==1 else r['profit_if_paid'], axis=1)

rewards = df['reward'].fillna(0.0).astype(float).values
rewards = np.nan_to_num(rewards, nan=0.0, posinf=0.0, neginf=0.0)

print("rewards shape:", rewards.shape, "min/max:", rewards.min(), rewards.max())


Detected int_rate likely in percent (e.g. 13.5). Converting by /100.
rewards shape: (200000,) min/max: -40000.0 12336.0


In [19]:
if len(rewards) != states.shape[0]:
    print("WARNING: rewards length != states rows:", len(rewards), "vs", states.shape[0])
    print("If dataset rows are out of sync you must align them by a unique id. Do you have a unique 'id' column? ->", 'id' in df.columns)
else:
    print("Lengths match. Good to proceed.")


Lengths match. Good to proceed.


In [20]:
reward_mean = rewards.mean()
reward_std  = rewards.std() + 1e-6
rewards_norm = (rewards - reward_mean) / reward_std

print("Reward mean/std:", reward_mean, reward_std)
print("Normalized: mean, std ->", rewards_norm.mean(), rewards_norm.std())
print("NaNs in normalized:", np.isnan(rewards_norm).sum())


Reward mean/std: -441.4055658500001 6939.831579618871
Normalized: mean, std -> 1.2647660696529783e-17 0.999999999855904
NaNs in normalized: 0


In [21]:
np.savez("offline_rl_dataset_fixed.npz",
         states=states,
         actions=actions,
         rewards=rewards_norm,      # normalized rewards
         next_states=next_states,
         dones=dones)

print("Saved offline_rl_dataset_fixed.npz")


Saved offline_rl_dataset_fixed.npz


In [22]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

data = np.load("offline_rl_dataset_fixed.npz")

states = data["states"]
actions = data["actions"]
rewards = data["rewards"]
next_states = data["next_states"]
dones = data["dones"]

print("Loaded FIXED dataset:", states.shape, rewards.shape)
print("Reward stats: mean=", rewards.mean(), "std=", rewards.std())


Loaded FIXED dataset: (200000, 149) (200000,)
Reward stats: mean= 1.2647660696529783e-17 std= 0.999999999855904


In [23]:
state_dim = states.shape[1]
action_dim = 2  # {0: deny, 1: approve}

class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )
    def forward(self, x):
        return self.model(x)

q_net = QNetwork(state_dim, action_dim)
optimizer = optim.Adam(q_net.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

print("RL Model Ready")


RL Model Ready


In [24]:
# Convert dataset to torch
s = torch.tensor(states, dtype=torch.float32)
a = torch.tensor(actions, dtype=torch.long)
r = torch.tensor(rewards, dtype=torch.float32)
ns = torch.tensor(next_states, dtype=torch.float32)
d = torch.tensor(dones, dtype=torch.float32)

gamma = 0.99
epochs = 6
batch_size = 512

print("Starting RL Training...")

for epoch in range(epochs):

    # shuffle indices
    idx = torch.randperm(len(s))

    total_loss = 0
    for i in range(0, len(s), batch_size):
        batch = idx[i:i+batch_size]

        s_b = s[batch]
        a_b = a[batch]
        r_b = r[batch]
        ns_b = ns[batch]
        d_b = d[batch]

        # Q(s, a)
        q_values = q_net(s_b)
        q_sa = q_values.gather(1, a_b.unsqueeze(1)).squeeze()

        # target: r + gamma * max(Q(s', a')) * (1-done)
        with torch.no_grad():
            next_q = q_net(ns_b).max(1)[0]
            target = r_b + gamma * next_q * (1 - d_b)

        loss = loss_fn(q_sa, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f}")


Starting RL Training...
Epoch 1/6 - Loss: 11587191.4756
Epoch 2/6 - Loss: 1588.1930
Epoch 3/6 - Loss: 958.5764
Epoch 4/6 - Loss: 793.0731
Epoch 5/6 - Loss: 246085.1384
Epoch 6/6 - Loss: 429653.0741


In [25]:
q_net.eval()
with torch.no_grad():
    qs = q_net(torch.tensor(states, dtype=torch.float32))
    policy_actions = qs.argmax(1).numpy()

# estimated policy value = average reward of the actions agent chooses
selected_rewards = rewards[policy_actions == 1]

if len(selected_rewards) > 0:
    rl_value = selected_rewards.mean()
else:
    rl_value = 0

print("Estimated Policy Value of RL Agent:", rl_value)


Estimated Policy Value of RL Agent: -4.565835667760856e-06
