In [4]:
import numpy as np
import mdptoolbox

State: Roll step, terminal

Action: Roll, Quit

Reward: +[side of the die], -[bankroll]

Transition: Rolling is stochastic, with probabilities determined by isBadSide; quitting is deterministic (probability 1).

Output = State-value at state 0

# Build Reward and Transition Matrices

Each state is a possible running total of the rolls in hand.

Terminal state 19: reached after rolling a bad side or after rolling three times.

In [5]:
# Example: N = 6, where faces 4, 5 and 6 are good and the other three are bad.
# Number of states = N * 3 + 2 = 20: running totals 0..18 plus terminal state 19.
T = np.zeros((2, 20, 20))

# Action 0: Roll (stochastic).
# Each good face moves total s to s + face with probability 1/6, which is a
# shifted diagonal per face.
T[0] = (np.eye(20, k=4) + np.eye(20, k=5) + np.eye(20, k=6)) / 6
T[0, 1:4] = 0      # totals 1-3 cannot be rolled, so they only lead to the terminal state
T[0, :, 19] = 0    # column 19 is the terminal state, not a running total of 19
# Terminal probability: 1/2 for the bad faces, plus 1/6 for every good face
# whose new total would not fit in the 0..18 range.
T[0, :, 19] = 1/2 + (3 - (T[0] > 0).sum(axis=1)) / 6

# Action 1: Quit (deterministic) -- every state moves to the terminal state.
T[1, :, 19] = 1

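The reward is the immediate gain from the current roll, i.e. from the current state to the next state.
For the roll action, column 19 (the terminal state) holds the negative of the total in hand, which is forfeited when the game terminates.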

In [6]:
# Reward tensor, indexed as R[action, state, next_state].
R = np.zeros((2, 20, 20))

# Action 0: Roll.
# Moving from total s to s + face pays the face value (4, 5 or 6), which is a
# shifted diagonal per face.
R[0] = 4 * np.eye(20, k=4) + 5 * np.eye(20, k=5) + 6 * np.eye(20, k=6)
R[0, 1:4] = 0      # totals 1-3 cannot be rolled, so they carry no rewards
R[0, :, 19] = 0    # column 19 is the terminal state, not a running total of 19
# Reaching the terminal state by rolling forfeits the total in hand (4..18);
# rows 0-3 and 19 keep a terminal reward of 0.
R[0, 4:19, 19] = -np.arange(4, 19)

# Action 1: Quit -- every reward is 0, so R[1] stays all zeros.

In [7]:
# Solve the hand-built N = 6 example with value iteration.
# The third argument is the discount factor; 1.0 leaves future rewards undiscounted.
vi = mdptoolbox.mdp.ValueIteration(T, R, 1.0)
vi.run()

optimal_policy = vi.policy
# Stored as `state_values` rather than `expected_values`: this notebook also
# defines a function called `expected_values`, and sharing the name would let
# a re-run of this cell overwrite that function with a tuple.
state_values = vi.V

print(optimal_policy)
print(state_values)

(0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)
(2.5833333333333335, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)


In [8]:
def expected_values(isBadSide):
    """Solve the die-rolling game for an arbitrary die with value iteration.

    The player starts with a total of 0 and may either roll (action 0) or
    quit (action 1). Rolling good face ``f`` (faces are numbered from 1)
    moves the total from ``s`` to ``s + f`` with reward ``f``. Every other
    outcome of a roll moves to the terminal state with reward ``-s``; that
    covers bad faces and good faces whose new total would exceed ``3 * N``.
    Quitting moves to the terminal state with reward 0.

    States are the totals ``0 .. 3 * N`` plus one terminal state stored at the
    last index. Totals that cannot be produced from 0 by good rolls only lead
    to the terminal state.

    The good faces, the initially known reachable totals, the roll-reward
    matrix, the optimal policy and the state values are printed.

    Args:
        isBadSide: Sequence of N flags; a truthy ``isBadSide[i]`` marks face
            ``i + 1`` as bad.

    Returns:
        The value of state 0 (``vi.V[0]``) as computed by
        ``mdptoolbox.mdp.ValueIteration`` with a discount of 1.0.
    """
    N = len(isBadSide)

    # Faces are 1-based, so the face value is the index plus one.
    good_faces = [i + 1 for i, isBad in enumerate(isBadSide) if not isBad]
    print("Good indices")
    print(good_faces)

    max_states_n = 3 * N + 2
    terminal = max_states_n - 1

    possible_rows = [0] + good_faces
    print("Possible rows")
    print(possible_rows)

    # A set gives O(1) membership tests. It only ever grows with totals larger
    # than the row being processed, so one ascending pass sees all of them.
    reachable = set(possible_rows)

    # Both tensors are indexed as [action, state, next_state].
    T = np.zeros((2, max_states_n, max_states_n))
    R = np.zeros((2, max_states_n, max_states_n))

    # Action 1 (quit): always move to the terminal state, with reward 0.
    T[1, :, terminal] = 1.0

    # Action 0 (roll).
    for row in range(max_states_n):
        if row not in reachable:
            # Unreachable totals (and the terminal state itself) just terminate.
            T[0, row, terminal] = 1.0
            continue

        in_range = 0
        for face in good_faces:
            col = row + face
            if col < terminal:
                T[0, row, col] = 1 / N
                R[0, row, col] = face
                reachable.add(col)
                in_range += 1

        # Whatever mass is not used by in-range good rolls goes to the terminal
        # state. Computing it once from the count avoids the rounding residue
        # left by repeatedly subtracting 1 / N, so it is exactly 0 when every
        # face is an in-range good roll. The guard also covers N == 0.
        T[0, row, terminal] = (N - in_range) / N if in_range else 1.0
        R[0, row, terminal] = -row

    print("R roll")
    print(R[0])

    vi = mdptoolbox.mdp.ValueIteration(T, R, 1.0)
    vi.run()

    optimal_policy = vi.policy
    state_values = vi.V

    print(optimal_policy)
    print(state_values)

    return state_values[0]

In [17]:
# 7-sided die: faces 1, 2 and 5 are good; faces 3, 4, 6 and 7 are bad.
expected_values([0,0,1,1,0,1,1]) 

Good indices
[1, 2, 5]
Possible rows
[0, 1, 2, 5]
R roll
[[  0.   1.   2.   0.   0.   5.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   1.   2.   0.   0.   5.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.  -1.]
 [  0.   0.   0.   1.   2.   0.   0.   5.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.  -2.]
 [  0.   0.   0.   0.   1.   2.   0.   0.   5.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.  -3.]
 [  0.   0.   0.   0.   0.   1.   2.   0.   0.   5.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.  -4.]
 [  0.   0.   0.   0.   0.   0.   1.   2.   0.   0.   5.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.  -5.]
 [  0.   0.   0.   0.   0.   0.   0.   1.   2.   0.   0.   5.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.  -6.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   1.   2.   0.   0.   5.   0.
    0.   0.   0.   0.   0.   0.