# Example 5.4: Off-policy estimation of a Blackjack state value

In [1]:
import numpy as np
import pandas as pd
import blackjack as b21
import matplotlib.pyplot as plt
from numba import njit, prange

In [2]:
%config InlineBackend.figure_format = "retina"
%load_ext autoreload
%autoreload 2

In [3]:
# We consider the policy that sticks if the player's sum is 20 or 21
# and sticks otherwise
policy_init = np.zeros((
    b21.PLAY_MAXVAL - b21.PLAY_MINVAL + 1, # Player's value
    2, # has usable ace
    10, # Dealer's one showing card
    2, # hit or stick
))

policy_init[..., 1] = 0.5 # hit for v < 20
policy_init[..., 0] = 0.5 # stick for v >= 20

In [4]:
value_cards_player = 13
has_usable_ace = True
dealers_card = 2

In [5]:
b21.set_seed(31415)
r, value, hist = b21.play_single_hist(value_cards_player, has_usable_ace, dealers_card, policy_init)

print("reward:", r, end="\n" * 2)
print("Final cards:", value, end="\n" * 2)
print(hist)

reward: -1

Final cards: (13, 21)

([[13, 1, 2], [13, 1, 2]], [0, 0], [0, -1])


In [6]:
b21.set_seed(3141592)
b21.single_first_visit_mc(policy_init)

([((15, 0, 9), -1.0, 1)], (28, 18), -1)

## The target policy -- baseline run

In [7]:
# We consider the policy that sticks if the player's sum is 20 or 21
# and sticks otherwise
policy_target = np.zeros((
    b21.PLAY_MAXVAL - b21.PLAY_MINVAL + 1, # Player's value
    2, # has usable ace
    10, # Dealer's one showing card
    2, # hit or stick
))

policy_target[:-2, ..., 1] = 1 # hit for v < 20
policy_target[-2:, ..., 0] = 1 # stick for v >= 20
# policy.shape

In [8]:
policy_target.shape

(18, 2, 10, 2)

In [9]:
@njit(parallel=True)
def multiple_runs_target(value_cards_player, has_usable_ace, dealers_card, n_runs, policy):
    rewards = 0.0
    counts = 0.0
    for n in prange(n_runs):
        reward, _ = b21.play_single(value_cards_player, has_usable_ace, dealers_card, policy)
        rewards += reward
        counts += 1
    return rewards, counts

In [228]:
%%time
n_sims = 10_000_000
value_cards_player = 13
has_usable_ace = True
dealers_card = 2

rewards_target, count = multiple_runs_target(value_cards_player, has_usable_ace, dealers_card, n_sims, policy_target)

CPU times: user 4.17 s, sys: 0 ns, total: 4.17 s
Wall time: 45.1 ms


In [229]:
rewards_target

-5755598.0

In [230]:
rewards_target / count

-0.5755598

## Ordinary importance sampling
In this experiment we let
* $b(\cdot | s)$ — the behaviour policy, to be the random policy and
* $\pi(\cdot | s)$ — the target policy, to be one where we stick only on 20 or 21.

In [231]:
r, value, hist = b21.play_single_hist(value_cards_player, has_usable_ace, dealers_card, policy_init)
hist_state, hist_action, hist_reward = hist

In [232]:
%%time
n_visits = 0
terms_sum = 0
den_sum = 0

n_sims = 100
rewards_is = [] # importance sampling
rewards_wis = [] # weighted importance sampling

for s in range(n_sims):
    for i in range(10_000):
        r, value, hist = b21.play_single_hist(value_cards_player, has_usable_ace, dealers_card, policy_init)
        hist_state, hist_action, hist_reward = hist

        prod_terms = []
        for action, state in zip(hist_action, hist_state[:-1]):
            ix = b21.state_to_ix(*state)
            prod_term = (policy_target[ix] / policy_init[ix])[action]
            prod_terms.append(prod_term)

        n_visits += len(prod_terms)
        cumprod_terms = np.cumprod(prod_terms)
        terms_sum += (cumprod_terms * hist_reward[-1]).sum()
        den_sum += cumprod_terms.sum()
        
    rewards_is.append(terms_sum / n_visits)
    rewards_wis.append(terms_sum / den_sum)

CPU times: user 17.2 s, sys: 7.3 ms, total: 17.2 s
Wall time: 17.2 s


In [233]:
np.array(rewards_is).mean()

-0.593609510834706

In [234]:
np.array(rewards_wis).mean()

-0.45972232093204674