# Importance sampling — infinite variance example

We seek to estimate the value function under the policy $\pi(\text{left}|s) = 1$ when the simulations are being sampled according to $\pi(\text{left} | s) = 1/2$.

The estimate of the value function $v_\pi(s)$ under ordinary importance sampling is given by

$$
     V(s) = \frac{\sum_{t\in{\mathfrak T}(s)} \rho_{t:T(t) -1}G_t}{| \mathfrak{T}(s) |}
$$
with
* $\mathfrak{T}(s)$ the set of all time steps in which state $s$ is visited,
* $T(t)$ the first time of termination following $t$
* $G_t$ the return after $t$ up though $T(t)$

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from numba import njit, prange

In [2]:
%config InlineBackend.figure_format = "retina"

In [11]:
# transition_matrix[left/right, state/end_state]
transition_matrix = np.array([
    [0.9, 0.1],
    [0.0, 1.0]
])


reward_matrix = np.array([
    [0.0, 1.0],
    [-np.inf, 0.0]
])

In [303]:
@njit
def set_seed(seed):
    np.random.seed(seed)
    

@njit
def step(state, policy):
    pr_actions = policy[state]
    action = np.random.multinomial(1, pvals=pr_actions).argmax()
    
    next_state = np.random.multinomial(1, pvals=transition_matrix[action]).argmax()
    reward = reward_matrix[action, next_state]
    
    return action, next_state, reward

@njit
def episode(state, policy):
    states, actions, rewards = [state], [], []
    while state != 1:
        action, state, reward = step(state, policy)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
    
    states = np.array(states)
    actions = np.array(actions)
    rewards = np.array(rewards)
    return states, actions, rewards


# @njit
def value_function_is(n_sims, policy_behaviour, policy_target):
    rewards_all = []
    rho_all = []
    
    for n in range(n_sims):
        states, actions, returns = episode(0, policy_behaviour)
        rewards = returns[::-1].cumsum()[::-1]
        rho_vals = policy_target[states[:-1], actions] / policy_behaviour[states[:-1], actions]
        
        rewards_all.extend(rewards)
        rho_all.extend(rho_vals)
    
    rewards_all = np.array(rewards_all)
    rho_all = np.array(rho_all)
    return rewards_all, rho_all

In [304]:
policy_behaviour = np.zeros((
    1, # non-terminal state
    2, # left or right
))

policy_behaviour[0, 0] = 1/2
policy_behaviour[0, 1] = 1/2

In [305]:
policy_target = np.zeros_like(policy_behaviour)
policy_target[0, 0] = 1

In [306]:
set_seed(111111)
states, actions, returns = episode(0, policy_behaviour)
rewards = returns[::-1].cumsum()[::-1]

In [307]:
states

array([0, 0, 0, 1])

In [308]:
actions

array([0, 0, 0])

In [309]:
rho_vals = policy_target[states[:-1], actions] / policy_behaviour[states[:-1], actions]

In [310]:
rho_vals

array([2., 2., 2.])

In [320]:
rewards, rho = value_function_is(200_000, policy_behaviour, policy_target)

In [321]:
(rewards * rho).sum() / len(rho)

0.18222455883445868