# Off-policy estimation 

In [1]:
import numpy as np
import pandas as pd
import blackjack as b21
import matplotlib.pyplot as plt
from numba import njit, prange

In [2]:
# Render inline matplotlib figures in the high-resolution "retina" format
%config InlineBackend.figure_format = "retina"
# Load the autoreload extension in mode 2 so imported modules are reloaded
# before each cell runs (edits to local modules are picked up without a restart)
%load_ext autoreload
%autoreload 2

## Example 5.4: Off-policy estimation of a Blackjack state value

In [3]:
# Equiprobable random policy: in every state the player hits or sticks
# with probability 0.5 each, regardless of the current sum.
# Last axis holds the action probabilities (index 0 = stick, index 1 = hit).
policy_init = np.full((
    b21.PLAY_MAXVAL - b21.PLAY_MINVAL, # Player's value
    2, # has usable ace
    10, # Dealer's one showing card
    2, # hit or stick
), 0.5)

In [4]:
# Start state for the example: player's sum 13 with a usable ace,
# dealer showing a 2. Play one episode and show (reward, final values, history).
value_cards_player, has_usable_ace, dealers_card = 13, True, 2

b21.play_single_hist(
    value_cards_player, has_usable_ace, dealers_card, policy_init
)

(1.0,
 (19, 18),
 ([[13, 1, 2], [19, 1, 2], [19, 1, 2]], [1, 0, 0], [0.0, 0.0, 1.0]))

## The target policy

In [5]:
# Target policy: stick if the player's sum is 20 or 21, hit otherwise.
# Last axis holds the action probabilities (index 0 = stick, index 1 = hit).
policy_target = np.zeros((
    b21.PLAY_MAXVAL - b21.PLAY_MINVAL, # Player's value
    2, # has usable ace
    10, # Dealer's one showing card
    2, # hit or stick
))

# NOTE(review): the slices below assume the last two rows of the first axis
# are the sums 20 and 21. How sums map to rows is decided inside b21 and is
# not visible here -- TODO confirm there is no off-by-one in the axis size.
policy_target[:-2, ..., 1] = 1 # hit for v < 20
policy_target[-2:, ..., 0] = 1 # stick for v >= 20

In [6]:
# Length of the player's-value axis used when allocating the policy arrays
b21.PLAY_MAXVAL - b21.PLAY_MINVAL

17

In [7]:
# Sanity check: (player's value, usable ace, dealer's card, action)
policy_target.shape

(17, 2, 10, 2)

In [8]:
# Same start state as before (sum 13, usable ace, dealer shows 2),
# now playing a single episode under the target policy.
value_cards_player, has_usable_ace, dealers_card = 13, True, 2

b21.play_single_hist(
    value_cards_player, has_usable_ace, dealers_card, policy_target
)

(0.0,
 (19, 19),
 ([[13, 1, 2], [19, 1, 2], [19, 1, 2]], [1, 0, 0], [0.0, 0.0, 0.0]))

In [20]:
@njit(parallel=True)
def multiple_runs_target(value_cards_player, has_usable_ace, dealers_card, n_runs, policy):
    """
    Simulate `n_runs` episodes from a fixed start state and accumulate rewards.

    Every episode is played with `b21.play_single` from the same start state
    (`value_cards_player`, `has_usable_ace`, `dealers_card`) under `policy`.
    Only the first element of its result (the reward) is used.

    Parameters
    ----------
    value_cards_player, has_usable_ace, dealers_card
        Start state, forwarded unchanged to `b21.play_single`.
    n_runs : int
        Number of episodes to simulate.
    policy
        Policy array, forwarded unchanged to `b21.play_single`.

    Returns
    -------
    (float, float)
        Sum of the episode rewards and the number of episodes played
        (kept as a float accumulator).
    """
    total_reward = 0.0
    n_episodes = 0.0
    # Both accumulators are updated with `+=` inside the prange loop.
    for episode in prange(n_runs):
        episode_reward, _ = b21.play_single(
            value_cards_player, has_usable_ace, dealers_card, policy
        )
        total_reward += episode_reward
        n_episodes += 1
    return total_reward, n_episodes

In [21]:
%%time
n_sims = 10_000_000
value_cards_player = 13
has_usable_ace = True
dealers_card = 2

rewards_target, count = multiple_runs_target(value_cards_player, has_usable_ace, dealers_card, n_sims, policy_target)

CPU times: user 5.82 s, sys: 0 ns, total: 5.82 s
Wall time: 1.42 s


In [22]:
# Average reward per simulated episode: the Monte Carlo estimate of the
# start state's value under policy_target
rewards_target / count

-0.4487064

In [25]:
# Inspect the index triple b21.state_to_ix returns for the raw arguments
# (25, 1, 1) -- presumably (player's sum, usable ace, dealer's card) mapped
# to policy-array indices; TODO confirm against b21
b21.state_to_ix(25, 1, 1)

(21, 1, 0)

In [26]:
# Play one episode under the target policy and print its reward, the final
# values and the recorded history, each separated by a blank line.
r, value, hist = b21.play_single_hist(
    value_cards_player, has_usable_ace, dealers_card, policy_target
)

print(r, value, sep="\n\n", end="\n\n")
print(hist)

-1

(24, 21)

([[13, 1, 2], [15, 1, 2], [24, 1, 2]], [1, 1, 0], [0, 0, -1])
