# Blackjack - prediction

In this notebook, we are interested in *learning* the value function $v_\pi(s)$ and the action-value function $q_\pi(s, a)$ for a given policy $\pi$.

In [2]:
from numba import jit
import numpy as np

In [160]:
# Print floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

The player can take one of two actions:

* **Stick**: the player stops drawing cards
* **Hit**: the player requests an additional card

In [171]:
# One weight per card-value slot: nine slots weighted 1 and a final slot
# weighted 4 (presumably the ten-valued cards 10, J, Q, K -- confirm).
# Normalising the weights gives the probability of drawing each slot.
n_vals = np.array([1.0] * 9 + [4.0])
deck_probs = n_vals / np.sum(n_vals)

In [298]:
# numba type objects and the typed Dict container
from numba.core import types
from numba.typed import Dict

# Bare expression: as the last line of the cell it only echoes numba's float64 type
types.float64

In [310]:
@jit(nopython=True)
def draw_card():
    """
    Draw a single card and return its blackjack value (1..10).

    One slot is sampled according to ``deck_probs``. That array has ten
    entries, one per card value, with the last (heaviest) entry standing
    for the ten-valued cards, so slot ``i`` corresponds to value ``i + 1``.
    The sampled index is therefore shifted by one; returning the raw index
    would produce values 0..9, i.e. a zero-valued card and no card worth 10.

    An ace is always returned as 1; this function does not track whether
    it could be counted as 11.
    """
    # multinomial(1, p) yields a one-hot vector; argmax recovers the slot index
    slot = np.random.multinomial(1, deck_probs).argmax()
    return slot + 1

@jit(nopython=True)
def dealer_strategy(value_cards):
    """
    Play out the dealer's hand under its fixed rule.

    Starting from ``value_cards``, the dealer keeps drawing cards while
    the running total is below 17 and stops as soon as it is not.

    Returns the dealer's final total, which may be above 21.
    """
    total = value_cards
    while True:
        # Stop as soon as the "below 17" condition no longer holds
        if not total < 17:
            break
        total = total + draw_card()
    return total

@jit(nopython=True)
def blackjack(player_value_cards, dealer_cards, policy, has_usable_ace):
    """
    Play one hand of blackjack for a fixed player policy.

    The policy table is indexed with ``player_value_cards - 12``, so the
    player's initial total is expected to be at least 12. (With card values
    of at most 10, a total below 12 cannot bust on a hit, so those totals
    involve no real decision.)

    Parameters
    ----------
    player_value_cards : int
        Initial total of the player's cards (12..21).
    dealer_cards : array
        The dealer's initial cards; they are summed to get the dealer's
        starting total.
    policy : 2-d array
        One row per player total (row ``total - 12``). The player keeps
        hitting until the entry in column 1 (stick) of the current row
        equals 1.0.
    has_usable_ace : bool
        Whether the player holds a usable ace. Accepted as part of the
        interface; the game logic below does not use it.

    Actions:
        0: hit
        1: stick

    Returns
    -------
    reward : int
        1 if the player wins, 0 for a draw, -1 if the player loses or busts.
    hist_reward : list of int
        Reward sequence of the hand: an initial 0, one 0 per hit, and the
        final reward as the last element.

    Both exit paths return the same ``(reward, hist_reward)`` pair, so
    callers (and numba's type inference) always see one return type.
    """
    reward = 0
    hist_reward = [reward]

    dealer_value_cards = np.sum(dealer_cards)

    # A natural 21 wins immediately unless the dealer also starts on 21
    if player_value_cards == 21 and dealer_value_cards != 21:
        reward = 1
        hist_reward.append(reward)
        return reward, hist_reward

    # Strictly speaking, the policy should depend on:
    #  1. The current state of the player, i.e., the value of her cards
    #  2. The only card we observe of the dealer
    # In this first notebook, we consider a policy that only depends
    # on the current value of the player's cards

    # Hit until you reach a 'stick' state or you go bust
    while policy[player_value_cards - 12][1] != 1.0:
        player_value_cards = player_value_cards + draw_card()
        hist_reward.append(0)

        # Going bust ends the player's turn (and keeps the index in range)
        if player_value_cards > 21:
            break

    # The dealer plays from the total already computed above
    dealer_value_cards = dealer_strategy(dealer_value_cards)

    if player_value_cards > 21:
        # Player bust: loss regardless of the dealer's hand
        reward = -1
    elif dealer_value_cards > 21 or player_value_cards > dealer_value_cards:
        reward = 1
    elif player_value_cards < dealer_value_cards:
        # Losing to the dealer is a loss, not a draw
        reward = -1
    else:
        reward = 0

    hist_reward.append(reward)

    return reward, hist_reward

In [311]:
# Policy table with one row per player sum and two columns
# (column 0: hit, column 1: stick). The first eight rows put all
# weight on hitting; the last two rows (sums 20 and 21) on sticking.
policy = np.repeat([[1.0, 0.0], [0.0, 1.0]], [8, 2], axis=0)
policy

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])