# Hyperparameter Sweep Analysis (WU-11)

Analyzes the KL coefficient x learning rate sweep run via OpenRLHF on Modal.

**Sweep grid:** 2 KL values (0.01, 0.1) x 4 LR values (1e-7, 5e-7, 1e-6, 5e-6) = 8 runs.

**Metrics monitored:**
- Reward (code execution success rate)
- KL divergence (should not explode)
- Policy loss stability
- Training convergence

**Run naming:** `sweep/ut_inverted/kl{kl}_lr{lr}/seed_42`

**Result:** LR=5e-7 with KL=0.01 selected. See Section 7 for the selection procedure and Section 8 for the reasoning.

In [None]:
import re

import wandb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Notebook-wide plotting defaults: whitegrid theme, wide figures, larger text.
plt.style.use("seaborn-v0_8-whitegrid")
plt.rcParams.update({"figure.figsize": (14, 8), "font.size": 12})

## 1. Fetch sweep runs from wandb

In [None]:
# wandb project queried for the sweep, and the display-name prefix shared by
# every sweep run.
WANDB_PROJECT = "misalign-fv"
SWEEP_PREFIX = "sweep/ut_inverted"

# Ask the wandb API only for runs whose display name starts with the prefix.
api = wandb.Api()
sweep_name_filter = {"display_name": {"$regex": f"^{SWEEP_PREFIX}"}}
runs = api.runs(WANDB_PROJECT, filters=sweep_name_filter)
print(f"Found {len(runs)} sweep runs")


def parse_run_name(name: str) -> dict:
    """Extract kl_coef and lr from run name like 'sweep/ut_inverted/kl0.01_lr1e-07/seed_42'.

    Args:
        name: Run name expected to contain a ``kl<number>_lr<number>`` segment.

    Returns:
        ``{"kl_coef": float, "learning_rate": float}`` when the segment is
        found; otherwise the same two keys mapped to ``None``.
    """
    # Capture well-formed numbers only (digits, optional fraction, optional
    # exponent). A loose character class would also swallow trailing characters
    # such as the "-" in "lr1e-07-retry", making float() raise ValueError.
    number = r"((?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)"
    m = re.search(rf"kl{number}_lr{number}", name)
    if m:
        return {"kl_coef": float(m.group(1)), "learning_rate": float(m.group(2))}
    return {"kl_coef": None, "learning_rate": None}


def _run_record(run):
    """Build one metadata row (ids, parsed hyperparameters, status) for a run."""
    hp = parse_run_name(run.name)
    return {
        "run_id": run.id,
        "run_name": run.name,
        "kl_coef": hp["kl_coef"],
        "learning_rate": hp["learning_rate"],
        "state": run.state,
        "num_steps": run.lastHistoryStep,
    }


sweep_data = [_run_record(run) for run in runs]

sweep_df = pd.DataFrame(sweep_data)
# Trailing expression: the notebook displays the sorted view; sweep_df itself
# keeps the order in which the runs were returned.
sweep_df.sort_values(["kl_coef", "learning_rate"])

## 2. Download training metrics

In [None]:
# Metric keys logged by OpenRLHF may vary by version (reward, kl,
# policy_loss/act_loss, code_exec_success_rate, etc.), so fetch every run's
# history and inspect the resulting columns instead of assuming names.


def _tagged_history(run):
    """Return the run's history (up to 1000 samples) tagged with its sweep point."""
    hp = parse_run_name(run.name)
    return run.history(samples=1000).assign(
        kl_coef=hp["kl_coef"],
        learning_rate=hp["learning_rate"],
        run_name=run.name,
    )


metrics_list = [_tagged_history(run) for run in runs]

# Stack all runs into one long frame; run_name identifies the source run.
metrics_df = pd.concat(metrics_list, ignore_index=True)
available_columns = sorted(metrics_df.columns.tolist())
print(f"Total rows: {len(metrics_df)}")
print(f"Available columns: {available_columns}")
metrics_df.head()

## 3. Reward stability

In [None]:
# Detect the actual reward column name: the first column mentioning "reward"
# but not "code"; fall back to the literal "reward" when none matches.
reward_col = next(
    (c for c in metrics_df.columns if "reward" in c.lower() and "code" not in c.lower()),
    "reward",
)
print(f"Using reward column: {reward_col}")

run_names = sorted(metrics_df["run_name"].unique())

# Keep the 2x4 layout used for the 8-point sweep, but add rows when more runs
# are present instead of silently leaving them out of the figure.
n_cols = 4
n_rows = max(2, -(-len(run_names) // n_cols))  # ceiling division
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(20, 5 * n_rows), sharex=True, sharey=True
)

# Every per-run slice shares metrics_df's columns, so check once up front.
has_reward = reward_col in metrics_df.columns

# axes.flat walks the grid row by row, so run i lands on panel divmod(i, n_cols).
for ax, name in zip(axes.flat, run_names):
    group = metrics_df[metrics_df["run_name"] == name].sort_values("_step")
    if has_reward:
        # Skip rows with a missing (NaN) reward so sparse logging does not
        # break the curve into gaps.
        logged = group.dropna(subset=[reward_col])
        ax.plot(logged["_step"], logged[reward_col], alpha=0.7)
    short_name = name.replace("sweep/ut_inverted/", "").replace("/seed_42", "")
    ax.set_title(short_name, fontsize=10)
    ax.set_xlabel("Step")
    ax.set_ylabel("Reward")

fig.suptitle("Reward Curves Across Sweep Points", fontsize=14)
plt.tight_layout()
plt.show()

## 4. KL divergence

In [None]:
# Detect KL column name: the first column mentioning "kl" other than the
# kl_coef tag added during download; fall back to the literal "kl".
kl_col = next(
    (c for c in metrics_df.columns if "kl" in c.lower() and c != "kl_coef"),
    "kl",
)
print(f"Using KL column: {kl_col}")

# One panel per KL coefficient found in the data (at least two, matching the
# two-value sweep). Panels are assigned by position in the sorted list rather
# than by comparing against a hard-coded 0.01, so other coefficients do not
# pile up on a single axis.
kl_values = sorted(metrics_df["kl_coef"].dropna().unique())
n_panels = max(2, len(kl_values))
fig, axes = plt.subplots(1, n_panels, figsize=(8 * n_panels, 6))

has_kl = kl_col in metrics_df.columns

for ax, kl_val in zip(axes, kl_values):
    kl_group = metrics_df[metrics_df["kl_coef"] == kl_val]
    ax.set_title(f"KL coef = {kl_val}")
    for lr_val in sorted(kl_group["learning_rate"].dropna().unique()):
        lr_group = kl_group[kl_group["learning_rate"] == lr_val].sort_values("_step")
        if has_kl:
            # Skip rows with a missing (NaN) KL value so the line has no gaps.
            logged = lr_group.dropna(subset=[kl_col])
            ax.plot(logged["_step"], logged[kl_col], label=f"lr={lr_val:.0e}")
    ax.set_xlabel("Step")
    ax.set_ylabel("KL Divergence")
    # Only draw a legend when something was plotted on this panel.
    if ax.get_legend_handles_labels()[0]:
        ax.legend()

fig.suptitle("KL Divergence by KL Coefficient", fontsize=14)
plt.tight_layout()
plt.show()

## 5. Policy loss

In [None]:
# Detect policy loss column name. Prefer a column that names the policy/actor
# loss explicitly; otherwise take the first column mentioning "loss"; fall back
# to the literal "policy_loss" when no loss column exists at all.
_loss_candidates = [c for c in metrics_df.columns if "loss" in c.lower()]
loss_col = next(
    (c for c in _loss_candidates if "policy_loss" in c.lower() or "act_loss" in c.lower()),
    _loss_candidates[0] if _loss_candidates else "policy_loss",
)
print(f"Using loss column: {loss_col}")

# One panel per KL coefficient found in the data (at least two, matching the
# two-value sweep), assigned by position in the sorted list rather than by
# comparing against a hard-coded 0.01.
kl_values = sorted(metrics_df["kl_coef"].dropna().unique())
n_panels = max(2, len(kl_values))
fig, axes = plt.subplots(1, n_panels, figsize=(8 * n_panels, 6))

has_loss = loss_col in metrics_df.columns

for ax, kl_val in zip(axes, kl_values):
    kl_group = metrics_df[metrics_df["kl_coef"] == kl_val]
    ax.set_title(f"KL coef = {kl_val}")
    for lr_val in sorted(kl_group["learning_rate"].dropna().unique()):
        lr_group = kl_group[kl_group["learning_rate"] == lr_val].sort_values("_step")
        if has_loss:
            # Skip rows with a missing (NaN) loss value so the line has no gaps.
            logged = lr_group.dropna(subset=[loss_col])
            ax.plot(logged["_step"], logged[loss_col], label=f"lr={lr_val:.0e}")
    ax.set_xlabel("Step")
    ax.set_ylabel("Policy Loss")
    # Only draw a legend when something was plotted on this panel.
    if ax.get_legend_handles_labels()[0]:
        ax.legend()

fig.suptitle("Policy Loss by KL Coefficient", fontsize=14)
plt.tight_layout()
plt.show()

## 6. Summary table: final metrics per sweep point

In [None]:
# Compute summary statistics over the last N logged values of each metric,
# one row per run.
TAIL_N = 10


def _tail_stat(frame, col, stat):
    """Return ``stat`` ("mean", "std", ...) over the last TAIL_N non-missing values of ``col``.

    ``frame`` must already be sorted by step. Returns None when the column is
    absent or holds no values. Missing values are dropped *before* taking the
    tail, so rows where the metric is NaN do not shrink or empty the window.
    """
    if col not in frame.columns:
        return None
    values = frame[col].dropna().tail(TAIL_N)
    if values.empty:
        return None
    return getattr(values, stat)()


summary_rows = []
for name, group in metrics_df.groupby("run_name"):
    group = group.sort_values("_step")
    summary_rows.append({
        "run_name": name.replace("sweep/ut_inverted/", "").replace("/seed_42", ""),
        # The hyperparameter tags are constant within a run; read the first row.
        "kl_coef": group["kl_coef"].iloc[0],
        "learning_rate": group["learning_rate"].iloc[0],
        "reward_mean": _tail_stat(group, reward_col, "mean"),
        "reward_std": _tail_stat(group, reward_col, "std"),
        "kl_final": _tail_stat(group, kl_col, "mean"),
        "loss_final": _tail_stat(group, loss_col, "mean"),
        "total_steps": group["_step"].max(),
    })

summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df.sort_values(["kl_coef", "learning_rate"])
summary_df

## 7. Select best hyperparameters

Selection criteria (in order of priority):
1. **Stability**: KL divergence stays bounded (no explosion)
2. **Gradient health**: No exploding gradients (grad_norm_max < 10). Note: the selection code below does not check gradient norms automatically.
3. **Learning signal**: Reward mean is increasing or stable
4. **Efficiency**: Higher reward at fewer steps preferred

In [None]:
# Selection criteria:
# 1. KL stays bounded (not exploding — KL < 1.0 for safety)
# 2. Reward is high and stable
# 3. Loss is decreasing or stable


def _fmt(value, spec):
    """Format a metric with ``spec``; return "n/a" when it is missing (None/NaN)."""
    return "n/a" if pd.isna(value) else format(value, spec)


if summary_df.empty:
    raise ValueError("summary_df is empty: no sweep runs to select from.")

# A missing final KL is treated as unbounded, so such runs never count as stable.
kl_final = pd.to_numeric(summary_df["kl_final"], errors="coerce")
stable = summary_df[kl_final.fillna(float("inf")) < 1.0].copy()

if stable.empty:
    print("WARNING: No stable runs found! Using all runs.")
    stable = summary_df.copy()

# Among stable runs, rank by reward once (highest first; missing rewards sort
# last) and reuse the ranking for both the winner and the runner-up.
ranked = stable.sort_values("reward_mean", ascending=False)
best = ranked.iloc[0]

print("=" * 50)
print("SELECTED HYPERPARAMETERS")
print("=" * 50)
print(f"  kl_coef:       {best['kl_coef']}")
print(f"  learning_rate: {best['learning_rate']}")
print(f"  Final reward:  {_fmt(best['reward_mean'], '.4f')} +/- {_fmt(best['reward_std'], '.4f')}")
print(f"  Final KL:      {_fmt(best['kl_final'], '.6f')}")
print(f"  Final loss:    {_fmt(best['loss_final'], '.4f')}")
print(f"  Total steps:   {best['total_steps']}")
print("=" * 50)

# Show runner-up for comparison
if len(ranked) > 1:
    runner_up = ranked.iloc[1]
    print(f"\nRunner-up: kl={runner_up['kl_coef']}, lr={runner_up['learning_rate']}")
    print(f"  Reward: {_fmt(runner_up['reward_mean'], '.4f')}, KL: {_fmt(runner_up['kl_final'], '.6f')}")

## 8. Decision and next steps

### Sweep results (from Modal training logs, 5 of 8 configs completed ~11 episodes each):

| LR | Early Reward | Late Reward | Improvement | Late KL | Assessment |
|---|---|---|---|---|---|
| 1e-7 | 0.399 | 0.409 | +0.010 | 0.000135 | Too conservative, barely learns |
| **5e-7** | **0.502** | **0.772** | **+0.270** | **0.022** | **Best: strong gain, bounded KL** |
| 1e-6 | 0.495 | 0.723 | +0.227 | 0.018 | Good but slightly less effective |
| 5e-6 | 0.858 | 0.982 | +0.124 | 0.191 | KL diverging, risk of reward hacking |

### Selected hyperparameters:
- **learning_rate: 5e-7** — strongest reward improvement with controlled KL
- **kl_coef: 0.01** — light penalty allows learning while constraining divergence

### Actions taken:
1. Updated `configs/training/default.yaml` with `kl_coef: 0.01` (from 0.05)
2. `learning_rate: 5e-7` confirmed (unchanged from baseline)
3. Posted to PLAN.md Section 0
4. Proceed to WU-14 (main experiment runs) with locked hyperparameters