# Predict-then-Optimize: Decision-Focused Learning

This notebook demonstrates **end-to-end predict-then-optimize** with CVXPYlayers — a neural network predicts optimization parameters, and a CvxpyLayer solves the downstream optimization. We compare:

1. **Two-stage**: Train the predictor to minimize prediction MSE, then optimize
2. **Decision-focused**: Train end-to-end to minimize *decision regret* (suboptimality of decisions)

**Problem**: A neural net predicts resource costs from features, then a linear program allocates resources.

$$\min_x \; \hat{c}(\text{features})^T x \quad \text{s.t.} \; Ax \leq b, \; x \geq 0$$

**Key idea**: Minimizing prediction error doesn't minimize decision error. A cost that's wrong but leads to the same optimal decision is harmless, while a small error near a decision boundary can be catastrophic.

In [None]:
import cvxpy as cp
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn

from cvxpylayers.torch import CvxpyLayer

## Generate Synthetic Data

Features map to true costs via a nonlinear function. Optimal decisions are computed using true costs.

In [None]:
# Seed both random number generators so every run produces the same dataset.
np.random.seed(42)
torch.manual_seed(42)

# Problem dimensions.
n_resources = 5    # decision variables
n_features = 8     # input features
n_constraints = 3  # resource constraints
n_train = 200
n_test = 100
n_samples = n_train + n_test

# Constraint data shared by every sample: entries of A lie in [0.5, 1.5),
# and every constraint has the same budget of 3.0.
A_constr = 0.5 + np.random.rand(n_constraints, n_resources)
b_constr = np.full(n_constraints, 3.0)

# Feature matrix for train and test samples together (split below).
X_all = torch.randn(n_samples, n_features, dtype=torch.float64)

# Ground-truth weights of the feature-to-cost map.
W_true = 0.5 * torch.randn(n_features, n_resources, dtype=torch.float64)


def true_costs(X):
    """Return the ground-truth costs for the feature rows in ``X``.

    The features are projected through ``W_true``; taking the absolute value
    makes the map nonlinear, and the 0.1 offset keeps every cost at or above
    0.1 (strictly positive).
    """
    projected = X @ W_true
    return projected.abs() + 0.1


C_all = true_costs(X_all)

# The first n_train rows are the training set, the remaining n_test rows the
# test set.
X_train, X_test = torch.split(X_all, [n_train, n_test])
C_train, C_test = torch.split(C_all, [n_train, n_test])

print(f"Features: {n_features}, Resources: {n_resources}")
print(f"Train: {n_train}, Test: {n_test}")

## Define LP and CvxpyLayer

In [None]:
# Decision variable (the allocation) and the cost vector, which is the only
# quantity left symbolic so it can be supplied per call.
x_var = cp.Variable(n_resources, name="allocation")
c_param = cp.Parameter(n_resources, name="costs")

# Feasible set: stay within each constraint's budget and allocate nothing
# negative.
within_budget = A_constr @ x_var <= b_constr
nonnegative = x_var >= 0
constraints = [within_budget, nonnegative]

# Linear objective: total cost of the allocation under the supplied costs.
objective = cp.Minimize(c_param @ x_var)
problem = cp.Problem(objective, constraints)

# CvxpyLayer needs a DPP-compliant problem; fail early if this one is not.
assert problem.is_dpp()

# NOTE(review): the costs passed to this layer elsewhere in the file look
# strictly positive (abs(...) + 0.1 for true costs, Softplus for predictions).
# If that holds, x = 0 is feasible and optimal for every cost vector, so the
# regret would be ~0 for any predictor. Confirm whether a demand/coverage
# constraint (or a maximisation objective) was intended.
opt_layer = CvxpyLayer(problem, parameters=[c_param], variables=[x_var])
print("LP layer created.")

## Define Prediction Network

A small MLP maps features to predicted costs. We'll train two copies: one with MSE loss, one with decision loss.

In [None]:
class CostPredictor(nn.Module):
    """MLP that maps a feature vector to one predicted cost per resource.

    Architecture: Linear(n_features, 32) -> ReLU -> Linear(32, 16) -> ReLU ->
    Linear(16, n_resources) -> Softplus. The input and output widths are read
    from the module-level ``n_features`` and ``n_resources``. All parameters
    are converted to float64.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in forward order, so a given RNG seed always
        # produces the same initial weights.
        stack = [
            nn.Linear(n_features, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, n_resources),
            nn.Softplus(),  # keeps every predicted cost strictly positive
        ]
        self.net = nn.Sequential(*stack)
        # Switch the parameters to double precision (float64).
        self.double()

    def forward(self, x):
        """Return the predicted costs for input features ``x``."""
        predicted = self.net(x)
        return predicted

# Build two predictors with identical starting weights by re-seeding the RNG
# immediately before each construction.
twin_models = []
for _ in range(2):
    torch.manual_seed(123)
    twin_models.append(CostPredictor())
model_mse, model_dfl = twin_models

n_params = sum(weight.numel() for weight in model_mse.parameters())
print(f"Parameters per model: {n_params}")

## Training

**Two-stage** (MSE): minimize $\|\hat{c} - c_{\text{true}}\|^2$

**Decision-focused** (DFL): minimize *regret* $= c_{\text{true}}^T x(\hat{c}) - c_{\text{true}}^T x^\star$, where $x(\hat{c})$ is the decision from predicted costs and $x^\star$ is the optimal decision.

In [None]:
n_epochs = 50
batch_size = 32
lr = 1e-3

# Oracle decisions: solve the LP once per training sample using the true
# costs. Detached because they act as fixed targets in the regret.
x_stars_train = torch.stack(
    [opt_layer(C_train[i])[0].detach() for i in range(n_train)]
)

# One Adam optimizer per model, same learning rate for both.
opt_mse = torch.optim.Adam(model_mse.parameters(), lr=lr)
opt_dfl = torch.optim.Adam(model_dfl.parameters(), lr=lr)

mse_losses_hist, dfl_losses_hist = [], []

for epoch in range(n_epochs):
    # Each "epoch" here is one optimizer step on a single random mini-batch,
    # and both models see the same batch.
    idx = torch.randperm(n_train)[:batch_size]
    X_batch, C_batch, x_star_batch = X_train[idx], C_train[idx], x_stars_train[idx]

    # --- Two-stage (MSE): fit predicted costs to the true costs ---
    opt_mse.zero_grad()
    c_pred_mse = model_mse(X_batch)
    loss_mse = (c_pred_mse - C_batch).pow(2).mean()
    loss_mse.backward()
    opt_mse.step()
    mse_losses_hist.append(loss_mse.item())

    # --- Decision-focused: backpropagate regret through the LP layer ---
    opt_dfl.zero_grad()
    c_pred_dfl = model_dfl(X_batch)

    # The LP is solved one sample at a time (for stability).
    regrets = []
    for j, (c_true, x_star) in enumerate(zip(C_batch, x_star_batch)):
        (x_pred,) = opt_layer(c_pred_dfl[j])
        # Regret = true cost of the predicted decision minus true cost of the
        # optimal decision.
        regrets.append(c_true @ x_pred - c_true @ x_star)
    loss_dfl = torch.stack(regrets).mean()
    loss_dfl.backward()
    opt_dfl.step()
    dfl_losses_hist.append(loss_dfl.item())

    # Progress report every 10 steps.
    step = epoch + 1
    if step % 10 == 0:
        print(f"Epoch {step:3d} | MSE loss: {mse_losses_hist[-1]:.4f} | DFL regret: {dfl_losses_hist[-1]:.6f}")

## Evaluate: Decision Quality on Test Set

What matters is not prediction accuracy, but the quality of *decisions* made using the predictions.

In [None]:
def evaluate_model(model, X, C_true):
    """Score a cost predictor on prediction error and on decision regret.

    Args:
        model: Callable applied to ``X`` to produce predicted costs.
        X: Feature rows; one decision problem is solved per row.
        C_true: True costs, indexed row-for-row with ``X``.

    Returns:
        A tuple ``(pred_mse, avg_regret, regrets)``: the mean squared error
        between predicted and true costs, the mean per-sample regret, and the
        list of per-sample regrets.
    """
    # Predictions are computed without building an autograd graph.
    with torch.no_grad():
        c_pred = model(X)
    squared_error = (c_pred - C_true) ** 2
    pred_mse = torch.mean(squared_error).item()

    def _sample_regret(i):
        # Solve the LP once with the predicted costs and once with the true
        # costs, then price both decisions at the true costs.
        (x_hat,) = opt_layer(c_pred[i])
        (x_best,) = opt_layer(C_true[i])
        return (C_true[i] @ x_hat - C_true[i] @ x_best).item()

    regrets = [_sample_regret(i) for i in range(len(X))]
    avg_regret = np.mean(regrets)
    return pred_mse, avg_regret, regrets

# Score both trained models on the held-out test set.
mse_pred_err, mse_regret, mse_regrets = evaluate_model(model_mse, X_test, C_test)
dfl_pred_err, dfl_regret, dfl_regrets = evaluate_model(model_dfl, X_test, C_test)

# Summary table: one row per training method.
print(f"{'Method':<25} {'Prediction MSE':>15} {'Decision Regret':>15}")
print("-" * 55)
report_rows = (
    ("Two-stage (MSE)", mse_pred_err, mse_regret),
    ("Decision-focused (DFL)", dfl_pred_err, dfl_regret),
)
for method_name, pred_err, regret_value in report_rows:
    print(f"{method_name:<25} {pred_err:>15.4f} {regret_value:>15.6f}")
print()
print("Note: DFL may have higher prediction MSE but lower decision regret.")

In [None]:
# Three panels: the two training curves and the test-set regret distributions.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
ax_mse, ax_dfl, ax_box = axes

# Training curves. Both panels share the same layout, so they are described
# as data: (axis, history, extra line kwargs, y-label, title).
curve_panels = [
    (ax_mse, mse_losses_hist, {"label": "Two-stage (MSE)"},
     "MSE Loss", "MSE Training Loss"),
    (ax_dfl, dfl_losses_hist, {"label": "Decision-focused", "color": "orange"},
     "Regret", "DFL Training Regret"),
]
for panel, history, line_kwargs, y_label, panel_title in curve_panels:
    panel.plot(history, alpha=0.7, **line_kwargs)
    panel.set_xlabel("Epoch")
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)

# Regret comparison: side-by-side box plots of the per-sample test regret.
positions = [0, 1]
ax_box.boxplot([mse_regrets, dfl_regrets], positions=positions, widths=0.5)
ax_box.set_xticks(positions)
ax_box.set_xticklabels(["Two-stage", "Decision-focused"])
ax_box.set_ylabel("Decision Regret")
ax_box.set_title("Test Set Regret Distribution")
ax_box.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.show()