# MDP Off-Policy Evaluation (OPE)

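This notebook evaluates a target policy in a small synthetic MDP. We compare
five off-policy estimators against the ground-truth policy value: importance
sampling (IS), weighted importance sampling (WIS), per-decision importance
sampling (PDIS), doubly robust (DR), and fitted Q evaluation (FQE).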

In [1]:
from __future__ import annotations

from pprint import pprint

import numpy as np

from crl.assumptions import AssumptionSet
from crl.assumptions_catalog import MARKOV, OVERLAP, SEQUENTIAL_IGNORABILITY
from crl.benchmarks.mdp_synth import SyntheticMDP, SyntheticMDPConfig
from crl.estimands.policy_value import PolicyValueEstimand
from crl.estimators.dr import DoublyRobustEstimator
from crl.estimators.fqe import FQEEstimator
from crl.estimators.importance_sampling import ISEstimator, PDISEstimator, WISEstimator

In [2]:
# Seed NumPy's legacy global RNG. The benchmark and the sampler also receive
# explicit seeds below; whether they additionally consume the global RNG is
# not visible from this notebook -- TODO confirm.
np.random.seed(0)

# Synthetic MDP benchmark, a batch of logged trajectories drawn from it, and
# the ground-truth value of the benchmark's target policy.
benchmark_config = SyntheticMDPConfig(seed=0, horizon=5)
benchmark = SyntheticMDP(benchmark_config)
dataset = benchmark.sample(num_trajectories=200, seed=1)
true_value = benchmark.true_policy_value(benchmark.target_policy)

# Estimand: the value of the target policy, using the dataset's own discount
# and horizon, under the listed identification assumptions.
estimand_kwargs = {
    "policy": benchmark.target_policy,
    "discount": dataset.discount,
    "horizon": dataset.horizon,
    "assumptions": AssumptionSet([SEQUENTIAL_IGNORABILITY, OVERLAP, MARKOV]),
}
estimand = PolicyValueEstimand(**estimand_kwargs)

# One instance of each estimator, all bound to the same estimand. The order
# here determines the order of the rows in the comparison below.
estimator_classes = (
    ISEstimator,
    WISEstimator,
    PDISEstimator,
    DoublyRobustEstimator,
    FQEEstimator,
)
estimators = [estimator_cls(estimand) for estimator_cls in estimator_classes]

In [3]:
# Run every estimator on the same logged dataset and collect one summary
# record per estimator, alongside the ground-truth value for comparison.
rows = []
for estimator in estimators:
    report = estimator.estimate(dataset)
    row = dict(
        estimator=report.metadata["estimator"],
        estimate=report.value,
        stderr=report.stderr,
        true_value=true_value,
        # Read from the report's "ess" diagnostics; presumably the effective
        # sample size as a fraction of the trajectories -- TODO confirm.
        ess_ratio=report.diagnostics["ess"]["ess_ratio"],
        warnings=report.warnings,
    )
    rows.append(row)

pprint(rows)

[{'ess_ratio': 0.005061628644630432,
  'estimate': -3.0828404755099426,
  'estimator': 'IS',
  'stderr': 3.5510905395272,
  'true_value': 0.9506050581384952,
               'unstable.']},
 {'ess_ratio': 0.005061628644630432,
  'estimate': -0.04510997790840718,
  'estimator': 'WIS',
  'stderr': 0.12743729109657354,
  'true_value': 0.9506050581384952,
               'unstable.']},
 {'ess_ratio': 0.005061628644630432,
  'estimate': 52.9038212967082,
  'estimator': 'PDIS',
  'stderr': 51.64293501785069,
  'true_value': 0.9506050581384952,
               'unstable.']},
 {'ess_ratio': 0.005061628644630432,
  'estimate': -85.13446041264376,
  'estimator': 'DR',
  'stderr': 87.01807240892326,
  'true_value': 0.9506050581384952,
               'unstable.']},
 {'ess_ratio': 0.005061628644630432,
  'estimate': 1.814113627025121,
  'estimator': 'FQE',
  'stderr': 0.032357530404324555,
  'true_value': 0.9506050581384952,
               'unstable.']}]
