In [2]:
import gymnasium as gym
import numpy as np

import time
from collections import defaultdict
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset()

In [4]:
env.action_space

Discrete(4)

In [5]:
action = env.action_space.sample()
action

0

In [6]:
observation, reward, terminated, truncated, info = env.step(action)
observation, reward, terminated, truncated, info

(array([-0.00536461,  1.4205514 , -0.27131712,  0.20117281,  0.00615575,
         0.06082705,  0.        ,  0.        ], dtype=float32),
 0.8613720056758041,
 False,
 False,
 {})

In [7]:
observation, info = env.reset()
observation, info

(array([-0.00206156,  1.4143422 , -0.2088244 ,  0.1520939 ,  0.00239557,
         0.04730177,  0.        ,  0.        ], dtype=float32),
 {})

In [8]:
env.close()

In [9]:
env = gym.make("LunarLander-v2", render_mode="human")

for _ in range(100):
    observation, info = env.reset()
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)

KeyboardInterrupt: 

In [None]:
for _ in range(1000):
    action = env.action_space.sample()  # agent policy that uses the observation and info
    observation, reward, terminated, truncated, info = env.step(action)

    if terminated or truncated:
        observation, info = env.reset()

env.close()

# Blackjack

In [None]:
env = gym.make("Blackjack-v1", sab=False, render_mode="human")

In [None]:
obs, info = env.reset() # (21, 1, 1): player's sum, dealer's face up, useable ace (ace count as 11 w/o busting)
obs, info

In [69]:
env.action_space

Discrete(2)

In [70]:
env.action_space.sample()

1

In [95]:
for _ in range(2):
    action = env.action_space.sample()
    state, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        print(state[0])
        time.sleep(3)
        new_state, info = env.reset()

15


In [89]:
env.reset()

((9, 6, 0), {})

In [91]:
state, reward, terminated, truncated, info = env.step(action)
state, reward, terminated

((24, 6, 0), -1.0, True)

In [96]:
state, reward, terminated

((15, 1, 0), -1.0, True)

Folding

In [263]:
env.reset()

((17, 9, 0), {})

# Dealer
After the player sticks, the dealer reveals their facedown card, and draws cards until their sum is **17 or greater**. If the dealer goes bust, the player wins.

In [220]:
action = 0
state, reward, terminated, truncated, info = env.step(action)
state, reward, terminated

((15, 6, 0), -1.0, True)

In [207]:
action = 1
state, reward, terminated, truncated, info = env.step(action)
state, reward, terminated

((28, 10, 0), -1.0, True)

## policy iteration

### Starting State
The starting state is initialised in the following range.

| Observation | Min | Max |
| --- | --- | --- |
| Player current sum | 4 | 12 (sic) |
| Dealer showing card value | 2 | 11 |
| Usable Ace | 0 | 1 |



### initialization

In [99]:
env.observation_space

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [136]:
env.observation_space[0].sample()

18

In [131]:
env.observation_space[1].sample()

2

In [139]:
env.observation_space[2].sample()

1

In [140]:
env.reward_range

(-inf, inf)

initialize all states to zero

In [264]:
v = np.zeros((32, 11, 2))
v.shape

(32, 11, 2)

In [265]:
policy = defaultdict(lambda : np.random.randint(2))

In [266]:
state = (1,1,1)

In [267]:
v[state]

0.0

In [192]:
from itertools import product

In [310]:
states = list(product(range(4, 32), range(2, 11), range(2)))
len(states)

504

### policy evaluation

In [268]:
from gymnasium.wrappers import TransformObservation

In [291]:
state = (2,10,0)
env = TransformObservation(env, lambda obs: state)
env.reset()

((2, 10, 0), {})

In [292]:
env.step(0)

((2, 10, 0), 1.0, True, False, {})

In [221]:
theta = 0.1
gamma = 1.0

In [320]:
delta = float('inf')
while delta > theta:
    delta = 0
    # policy evaluation
    for state in tqdm(states):
        # print(state)
        old_value = v[state]
        # set environment state to state
        env.reset()
        env = TransformObservation(env, lambda obs: state)
        # get action according to policy
        action = policy[state]
        # take action and receive next state and reward
        statePrime, reward, terminated, truncated, info = env.step(action)
        # value update using deterministic policy
        v[state] = reward + gamma*v[statePrime]

        # update delta
        delta = np.max([np.abs(old_value-v[state]), delta])
        # check convergence
        if delta < theta: break
    
    # policy improvement
    policy_stable = True
    for state in tqdm(states):
        old_action = policy[state]
        # set environment state to state
        env.reset()
        env = TransformObservation(env, lambda obs: state)

        # get new policy that maximizes the value function
        reference_value = float('-inf')
        best_action = -1
        for action in range(env.action_space.n):
            statePrime, reward, terminated, truncated, info = env.step(action)
            value_action = reward + gamma*v[statePrime]
            # find the action that maximizes the value function
            if value_action>=reference_value: 
                best_action = action
                reference_value = value_action
        # update the policy
        policy[state] = best_action
        # check whether policy is optimal
        if old_action!=best_action: policy_stable=False

    if policy_stable: break

    break

 17%|███████▎                                  | 88/504 [00:44<03:28,  1.99it/s]


KeyboardInterrupt: 

In [321]:
old_value, v[state]

(-1.0, -1.0)

In [322]:
state, statePrime

((8, 10, 0), (8, 9, 1))

In [323]:
reward, v[statePrime]

(-1.0, 0.0)

In [339]:
state, info = env.reset()
state

(8, 10, 0)

In [342]:
env.step(1)

((8, 10, 0), -1.0, True, False, {})

### policy iteration based on random state initialization

In [343]:
env = gym.make("Blackjack-v1", sab=False, render_mode="human")

In [344]:
policy = defaultdict(lambda : np.random.randint(2))
states = list(product(range(4, 32), range(2, 11), range(2)))
v = np.zeros((32, 11, 2))
len(states), v.shape

(504, (32, 11, 2))

In [385]:
theta = 0.1
gamma = 1.0
n_eval = 27 * 10 * 2
n_update = 27 * 10 * 2
n_eval

540

In [386]:
delta = float('inf')
for e in tqdm(range(n_eval)):
    delta = 0
    # policy evaluation

    # get deterministic states
    state, _ = env.reset(seed=e)

    old_value = v[state]
    # get action according to policy
    action = policy[state]
    # take action and receive next state and reward
    statePrime, reward, terminated, truncated, info = env.step(action)
    # value update using deterministic policy
    v[state] = reward + gamma*v[statePrime]

    # update delta
    delta = np.max([np.abs(old_value-v[state]), delta])
    # check convergence
    # if delta < theta: 
    #     print(f'evaluation converged')
    #     break
    
    # log
    if np.abs(old_value-v[state])>0:
        print(f'for state {state} value changed from {old_value} to {v[state]}')

# policy improvement
for u in tqdm(range(n_update)):
    policy_stable = True
    
    state, _ = env.reset(seed=u)
    old_action = policy[state]
    
    # get new policy that maximizes the value function
    reference_value = float('-inf')
    best_action = -1
    for action in range(env.action_space.n):
        # reset to deterministic state
        state, _ = env.reset(seed=u)
        # calculate action value
        statePrime, reward, terminated, truncated, info = env.step(action)
        value_action = reward + gamma*v[statePrime]
        # find the action that maximizes the value function
        if value_action>=reference_value: 
            best_action = action
            reference_value = value_action
            best_statePrime = statePrime
            best_reward = reward
            best_valuePrime = value_action
    
    # update the policy
    policy[state] = best_action
    # check whether policy is optimal
    if old_action!=best_action: 
        print(f'for state {state} best action changed to {best_action}, statePrime {best_statePrime}={best_valuePrime}, reward {best_reward}')
        policy_stable=False


    # if policy_stable: 
    #     print(f'improvement converged')
    #     break


  0%|▏                                          | 2/540 [00:00<03:38,  2.46it/s]

for state (20, 7, 0) value changed from 12.0 to 13.0


  1%|▎                                          | 4/540 [00:01<04:11,  2.13it/s]

for state (7, 10, 0) value changed from 3.0 to -1.0


  1%|▍                                          | 6/540 [00:02<04:20,  2.05it/s]

for state (21, 9, 1) value changed from 5.0 to 6.0


  2%|▋                                          | 9/540 [00:04<04:24,  2.01it/s]

for state (14, 10, 0) value changed from 4.0 to 5.0


  2%|▊                                         | 10/540 [00:04<04:24,  2.00it/s]

for state (14, 6, 0) value changed from 6.0 to 7.0


  2%|▊                                         | 11/540 [00:05<04:24,  2.00it/s]

for state (7, 10, 0) value changed from -1.0 to 5.0


  2%|▉                                         | 12/540 [00:05<04:24,  2.00it/s]

for state (17, 2, 0) value changed from 5.0 to 6.0


  2%|█                                         | 13/540 [00:06<04:23,  2.00it/s]

for state (20, 8, 0) value changed from 5.0 to 6.0


  3%|█▏                                        | 15/540 [00:07<04:23,  2.00it/s]

for state (14, 2, 0) value changed from 5.0 to 6.0


  3%|█▍                                        | 18/540 [00:08<04:21,  1.99it/s]

for state (5, 10, 0) value changed from -1.0 to -2.0


  4%|█▍                                        | 19/540 [00:09<04:21,  1.99it/s]

for state (13, 10, 0) value changed from -1.0 to 3.0


  4%|█▋                                        | 21/540 [00:10<04:20,  1.99it/s]

for state (10, 10, 0) value changed from 4.0 to 5.0


  4%|█▋                                        | 22/540 [00:10<04:19,  1.99it/s]

for state (14, 4, 0) value changed from 2.0 to 3.0


  5%|██                                        | 27/540 [00:13<04:16,  2.00it/s]

for state (12, 10, 0) value changed from 0.0 to -1.0


  6%|██▎                                       | 30/540 [00:14<04:15,  1.99it/s]

for state (15, 10, 0) value changed from -2.0 to -1.0


  6%|██▌                                       | 33/540 [00:16<04:14,  1.99it/s]

for state (18, 10, 0) value changed from 3.0 to 4.0


  6%|██▋                                       | 34/540 [00:16<04:13,  1.99it/s]

for state (13, 10, 0) value changed from 3.0 to -1.0


  7%|██▉                                       | 38/540 [00:18<04:11,  1.99it/s]

for state (18, 3, 0) value changed from 5.0 to 6.0


  7%|███                                       | 40/540 [00:19<04:10,  1.99it/s]

for state (12, 10, 0) value changed from -1.0 to 0.0


  8%|███▏                                      | 41/540 [00:20<04:10,  1.99it/s]

for state (21, 8, 1) value changed from 3.0 to 4.0


  8%|███▎                                      | 42/540 [00:20<04:09,  1.99it/s]

for state (20, 9, 0) value changed from 5.0 to 6.0


  8%|███▎                                      | 43/540 [00:21<04:09,  1.99it/s]

for state (15, 2, 0) value changed from 10.0 to 11.0


  8%|███▌                                      | 45/540 [00:22<04:08,  1.99it/s]

for state (14, 9, 0) value changed from 5.0 to 6.0


  9%|███▊                                      | 49/540 [00:24<04:06,  1.99it/s]

for state (15, 2, 0) value changed from 11.0 to 12.0


 10%|████                                      | 53/540 [00:26<04:04,  1.99it/s]

for state (12, 10, 0) value changed from 0.0 to -1.0


 11%|████▍                                     | 57/540 [00:28<04:02,  1.99it/s]

for state (17, 5, 1) value changed from 3.0 to 4.0


 11%|████▌                                     | 58/540 [00:28<04:01,  1.99it/s]

for state (17, 1, 0) value changed from 0.0 to -1.0


 11%|████▌                                     | 59/540 [00:29<04:01,  1.99it/s]

for state (18, 7, 0) value changed from 1.0 to 2.0


 12%|████▉                                     | 63/540 [00:31<03:59,  1.99it/s]

for state (18, 6, 0) value changed from 3.0 to 4.0


 12%|████▉                                     | 64/540 [00:31<03:58,  1.99it/s]

for state (20, 6, 0) value changed from 3.0 to 4.0


 12%|█████                                     | 65/540 [00:32<03:58,  1.99it/s]

for state (17, 7, 0) value changed from 0.0 to 2.0


 12%|█████▏                                    | 67/540 [00:33<03:57,  1.99it/s]

for state (14, 10, 0) value changed from 5.0 to 4.0


 13%|█████▎                                    | 68/540 [00:33<03:56,  1.99it/s]

for state (10, 10, 0) value changed from 5.0 to -1.0


 13%|█████▎                                    | 69/540 [00:34<03:56,  1.99it/s]

for state (18, 7, 0) value changed from 2.0 to 3.0


 13%|█████▌                                    | 71/540 [00:35<03:55,  1.99it/s]

for state (15, 9, 0) value changed from 3.0 to 6.0


 13%|█████▌                                    | 72/540 [00:35<03:54,  2.00it/s]

for state (14, 10, 0) value changed from 4.0 to 3.0


 14%|█████▋                                    | 73/540 [00:36<03:54,  2.00it/s]

for state (19, 10, 1) value changed from -1.0 to 0.0


 14%|█████▊                                    | 74/540 [00:36<03:53,  1.99it/s]

for state (14, 10, 0) value changed from 3.0 to 2.0


 14%|█████▊                                    | 75/540 [00:37<03:52,  2.00it/s]

for state (12, 3, 0) value changed from 1.0 to 2.0


 14%|█████▉                                    | 76/540 [00:37<03:52,  1.99it/s]

for state (5, 9, 0) value changed from 3.0 to 6.0


 14%|██████                                    | 78/540 [00:38<03:51,  1.99it/s]

for state (17, 1, 0) value changed from -1.0 to 0.0


 15%|██████▏                                   | 79/540 [00:39<03:51,  1.99it/s]

for state (13, 10, 0) value changed from -1.0 to 0.0


 15%|██████▏                                   | 80/540 [00:39<03:50,  1.99it/s]

for state (21, 2, 1) value changed from 3.0 to 4.0


 15%|██████▎                                   | 81/540 [00:40<03:50,  1.99it/s]

for state (12, 10, 1) value changed from -3.0 to 0.0


 16%|██████▌                                   | 84/540 [00:41<03:48,  1.99it/s]

for state (19, 10, 1) value changed from 0.0 to -1.0


 16%|██████▊                                   | 88/540 [00:43<03:46,  1.99it/s]

for state (21, 9, 1) value changed from 6.0 to 7.0


 16%|██████▉                                   | 89/540 [00:44<03:46,  1.99it/s]

for state (20, 7, 0) value changed from 13.0 to 14.0


 17%|███████                                   | 90/540 [00:44<03:45,  1.99it/s]

for state (14, 10, 0) value changed from 2.0 to 1.0


 17%|███████▏                                  | 93/540 [00:46<03:44,  1.99it/s]

for state (21, 6, 1) value changed from 6.0 to 7.0


 17%|███████▎                                  | 94/540 [00:46<03:43,  1.99it/s]

for state (17, 10, 0) value changed from -1.0 to 0.0


 18%|███████▍                                  | 95/540 [00:47<03:43,  1.99it/s]

for state (20, 1, 0) value changed from 3.0 to 4.0


 18%|███████▍                                  | 96/540 [00:47<03:42,  1.99it/s]

for state (21, 6, 1) value changed from 7.0 to 8.0


 18%|███████▌                                  | 98/540 [00:48<03:41,  1.99it/s]

for state (8, 2, 0) value changed from 6.0 to 12.0


 18%|███████▋                                  | 99/540 [00:49<03:41,  1.99it/s]

for state (20, 5, 0) value changed from 3.0 to 4.0


 19%|███████▌                                 | 100/540 [00:49<03:40,  1.99it/s]

for state (18, 10, 0) value changed from 4.0 to 5.0


 19%|███████▋                                 | 101/540 [00:50<03:40,  1.99it/s]

for state (10, 10, 0) value changed from -1.0 to 0.0


 19%|███████▋                                 | 102/540 [00:50<03:39,  1.99it/s]

for state (15, 5, 0) value changed from 0.0 to -1.0


 19%|███████▊                                 | 103/540 [00:51<03:39,  1.99it/s]

for state (16, 6, 0) value changed from 0.0 to -1.0


 19%|███████▉                                 | 104/540 [00:51<03:38,  2.00it/s]

for state (14, 8, 1) value changed from 0.0 to 1.0


 19%|███████▉                                 | 105/540 [00:52<03:37,  2.00it/s]

for state (14, 10, 0) value changed from 1.0 to 0.0


 20%|████████                                 | 106/540 [00:52<03:37,  1.99it/s]

for state (20, 4, 0) value changed from 0.0 to 1.0


 20%|████████                                 | 107/540 [00:53<03:37,  1.99it/s]

for state (17, 8, 1) value changed from 0.0 to -1.0


 20%|████████▏                                | 108/540 [00:53<03:36,  1.99it/s]

for state (17, 2, 0) value changed from 6.0 to 5.0


 20%|████████▎                                | 109/540 [00:54<03:36,  1.99it/s]

for state (19, 1, 0) value changed from 0.0 to -1.0


 20%|████████▎                                | 110/540 [00:54<03:35,  2.00it/s]

for state (15, 8, 0) value changed from 0.0 to -1.0


 21%|████████▍                                | 111/540 [00:55<03:34,  2.00it/s]

for state (16, 8, 0) value changed from 0.0 to -1.0


 21%|████████▌                                | 112/540 [00:55<03:34,  2.00it/s]

for state (13, 7, 0) value changed from 0.0 to -1.0


 21%|████████▌                                | 113/540 [00:56<03:34,  1.99it/s]

for state (20, 3, 0) value changed from 0.0 to 1.0


 21%|████████▋                                | 114/540 [00:56<03:33,  1.99it/s]

for state (12, 4, 0) value changed from 0.0 to -1.0


 21%|████████▋                                | 115/540 [00:57<03:33,  1.99it/s]

for state (12, 3, 0) value changed from 2.0 to 1.0


 21%|████████▊                                | 116/540 [00:57<03:32,  2.00it/s]

for state (12, 9, 0) value changed from 0.0 to 6.0


 22%|████████▉                                | 117/540 [00:58<03:32,  1.99it/s]

for state (17, 4, 0) value changed from 0.0 to -1.0


 22%|████████▉                                | 118/540 [00:58<03:31,  2.00it/s]

for state (7, 2, 0) value changed from 0.0 to 12.0


 22%|█████████                                | 119/540 [00:59<03:30,  2.00it/s]

for state (19, 6, 0) value changed from 0.0 to -1.0


 22%|█████████                                | 120/540 [00:59<03:30,  2.00it/s]

for state (18, 6, 0) value changed from 4.0 to 5.0


 23%|█████████▍                               | 124/540 [01:01<03:28,  1.99it/s]

for state (19, 1, 1) value changed from 0.0 to -1.0


 23%|█████████▌                               | 126/540 [01:02<03:27,  1.99it/s]

for state (9, 6, 0) value changed from 0.0 to 1.0


 24%|█████████▋                               | 127/540 [01:03<03:27,  1.99it/s]

for state (12, 4, 0) value changed from -1.0 to 0.0


 24%|█████████▋                               | 128/540 [01:03<03:26,  1.99it/s]

for state (11, 7, 0) value changed from 0.0 to -1.0


 24%|█████████▊                               | 129/540 [01:04<03:26,  1.99it/s]

for state (17, 10, 0) value changed from 0.0 to -1.0


 24%|█████████▊                               | 130/540 [01:04<03:25,  1.99it/s]

for state (7, 8, 0) value changed from 0.0 to -1.0


 24%|█████████▉                               | 131/540 [01:05<03:25,  2.00it/s]

for state (7, 10, 0) value changed from 5.0 to -1.0


 24%|██████████                               | 132/540 [01:05<03:24,  2.00it/s]

for state (12, 7, 0) value changed from 0.0 to 1.0


 25%|██████████▏                              | 134/540 [01:06<03:23,  1.99it/s]

for state (18, 7, 1) value changed from 0.0 to 1.0


 25%|██████████▎                              | 135/540 [01:07<03:23,  1.99it/s]

for state (13, 10, 0) value changed from 0.0 to -1.0


 25%|██████████▎                              | 136/540 [01:07<03:22,  1.99it/s]

for state (14, 7, 0) value changed from 0.0 to -1.0


 25%|██████████▍                              | 137/540 [01:08<03:22,  1.99it/s]

for state (20, 10, 1) value changed from 0.0 to -1.0


 26%|██████████▍                              | 138/540 [01:08<03:21,  1.99it/s]

for state (8, 7, 0) value changed from 0.0 to -1.0


 26%|██████████▌                              | 139/540 [01:09<03:21,  1.99it/s]

for state (20, 9, 0) value changed from 6.0 to 7.0


 26%|██████████▋                              | 140/540 [01:09<03:20,  1.99it/s]

for state (18, 6, 0) value changed from 5.0 to 4.0


 26%|██████████▋                              | 141/540 [01:10<03:20,  1.99it/s]

for state (19, 9, 0) value changed from 0.0 to 1.0


 26%|██████████▊                              | 142/540 [01:10<03:19,  1.99it/s]

for state (20, 6, 0) value changed from 4.0 to 5.0


 26%|██████████▊                              | 143/540 [01:11<03:19,  1.99it/s]

for state (9, 1, 0) value changed from 0.0 to -1.0


 27%|███████████                              | 145/540 [01:12<03:18,  1.99it/s]

for state (16, 5, 1) value changed from 0.0 to -1.0


 28%|███████████▎                             | 149/540 [01:14<03:16,  1.99it/s]

for state (14, 2, 0) value changed from 6.0 to 5.0


 28%|███████████▌                             | 153/540 [01:16<03:14,  1.99it/s]

for state (9, 9, 0) value changed from 0.0 to 1.0


 29%|███████████▊                             | 155/540 [01:17<03:12,  2.00it/s]

for state (17, 4, 0) value changed from -1.0 to 1.0


 29%|███████████▊                             | 156/540 [01:18<03:12,  2.00it/s]

for state (20, 2, 0) value changed from 0.0 to 1.0


 29%|███████████▉                             | 158/540 [01:19<03:11,  1.99it/s]

for state (20, 1, 0) value changed from 4.0 to 3.0


 29%|████████████                             | 159/540 [01:19<03:10,  2.00it/s]

for state (21, 6, 1) value changed from 8.0 to 9.0


 30%|████████████▏                            | 160/540 [01:20<03:10,  1.99it/s]

for state (14, 10, 0) value changed from 0.0 to -1.0


 30%|████████████▏                            | 161/540 [01:20<03:10,  1.99it/s]

for state (14, 1, 0) value changed from 0.0 to -1.0


 30%|████████████▎                            | 162/540 [01:21<03:09,  2.00it/s]

for state (18, 10, 0) value changed from 5.0 to 6.0


 30%|████████████▍                            | 164/540 [01:22<03:08,  2.00it/s]

for state (12, 6, 1) value changed from 0.0 to 1.0


 31%|████████████▌                            | 165/540 [01:22<03:07,  2.00it/s]

for state (7, 8, 0) value changed from -1.0 to 0.0


 31%|████████████▌                            | 166/540 [01:23<03:07,  2.00it/s]

for state (18, 8, 0) value changed from 0.0 to 1.0


 31%|████████████▋                            | 167/540 [01:23<03:06,  2.00it/s]

for state (14, 10, 1) value changed from 0.0 to -1.0


 31%|████████████▉                            | 170/540 [01:25<03:05,  1.99it/s]

for state (18, 10, 0) value changed from 6.0 to 7.0


 32%|█████████████▏                           | 173/540 [01:26<03:04,  1.99it/s]

for state (20, 5, 0) value changed from 4.0 to 5.0


 33%|█████████████▎                           | 176/540 [01:28<03:02,  1.99it/s]

for state (18, 10, 1) value changed from 0.0 to -1.0


 33%|█████████████▍                           | 177/540 [01:28<03:02,  1.99it/s]

for state (11, 3, 0) value changed from 0.0 to 1.0


 33%|█████████████▌                           | 178/540 [01:29<03:01,  1.99it/s]

for state (10, 9, 0) value changed from 0.0 to 1.0


 33%|█████████████▌                           | 179/540 [01:29<03:01,  1.99it/s]

for state (13, 8, 1) value changed from 0.0 to -1.0


 34%|█████████████▋                           | 181/540 [01:30<03:00,  1.99it/s]

for state (17, 4, 0) value changed from 1.0 to -1.0


 34%|█████████████▊                           | 182/540 [01:31<02:59,  2.00it/s]

for state (18, 10, 0) value changed from 7.0 to 6.0


 34%|█████████████▉                           | 183/540 [01:31<02:59,  1.99it/s]

for state (16, 10, 1) value changed from 0.0 to -1.0


 34%|██████████████                           | 185/540 [01:32<02:58,  1.99it/s]

for state (13, 5, 0) value changed from 0.0 to 1.0


 35%|██████████████▏                          | 187/540 [01:33<02:56,  2.00it/s]

for state (20, 1, 1) value changed from 0.0 to 3.0


 35%|██████████████▎                          | 188/540 [01:34<02:56,  1.99it/s]

for state (13, 8, 0) value changed from 0.0 to -1.0


 35%|██████████████▎                          | 189/540 [01:34<02:55,  1.99it/s]

for state (20, 6, 0) value changed from 5.0 to 6.0


 35%|██████████████▍                          | 190/540 [01:35<02:55,  2.00it/s]

for state (12, 6, 0) value changed from 0.0 to 4.0


 36%|██████████████▌                          | 192/540 [01:36<02:54,  1.99it/s]

for state (12, 2, 0) value changed from 0.0 to 5.0


 36%|██████████████▋                          | 194/540 [01:37<02:53,  1.99it/s]

for state (14, 9, 0) value changed from 6.0 to -1.0


 36%|██████████████▊                          | 195/540 [01:37<02:53,  1.99it/s]

for state (14, 10, 0) value changed from -1.0 to -2.0


 36%|██████████████▉                          | 197/540 [01:38<02:52,  1.99it/s]

for state (11, 4, 0) value changed from 0.0 to 1.0


 37%|███████████████                          | 199/540 [01:39<02:51,  1.99it/s]

for state (20, 6, 0) value changed from 6.0 to 7.0


 37%|███████████████▏                         | 200/540 [01:40<02:50,  1.99it/s]

for state (18, 6, 0) value changed from 4.0 to 5.0


 37%|███████████████▎                         | 201/540 [01:40<02:50,  1.99it/s]

for state (13, 1, 0) value changed from 0.0 to -1.0


 38%|███████████████▍                         | 203/540 [01:41<02:49,  1.99it/s]

for state (12, 3, 0) value changed from 1.0 to 0.0


 38%|███████████████▍                         | 204/540 [01:42<02:48,  1.99it/s]

for state (15, 5, 0) value changed from -1.0 to 0.0


 38%|███████████████▋                         | 206/540 [01:43<02:47,  1.99it/s]

for state (10, 8, 0) value changed from 0.0 to 6.0


 38%|███████████████▋                         | 207/540 [01:43<02:47,  1.99it/s]

for state (16, 7, 0) value changed from 0.0 to -1.0


 39%|███████████████▉                         | 210/540 [01:45<02:45,  1.99it/s]

for state (16, 10, 0) value changed from 0.0 to 1.0


 39%|████████████████                         | 211/540 [01:45<02:45,  1.99it/s]

for state (18, 4, 0) value changed from 0.0 to -1.0


 39%|████████████████                         | 212/540 [01:46<02:44,  1.99it/s]

for state (16, 3, 1) value changed from 0.0 to -1.0


 39%|████████████████▏                        | 213/540 [01:46<02:44,  1.99it/s]

for state (14, 1, 0) value changed from -1.0 to 0.0


 40%|████████████████▏                        | 214/540 [01:47<02:43,  1.99it/s]

for state (16, 3, 0) value changed from 0.0 to -1.0


 40%|████████████████▎                        | 215/540 [01:47<02:42,  1.99it/s]

for state (11, 1, 0) value changed from 0.0 to -1.0


 40%|████████████████▍                        | 216/540 [01:48<02:42,  1.99it/s]

for state (14, 10, 1) value changed from -1.0 to -2.0


 41%|████████████████▋                        | 220/540 [01:50<02:40,  1.99it/s]

for state (13, 2, 0) value changed from 0.0 to -1.0


 41%|████████████████▉                        | 223/540 [01:51<02:39,  1.99it/s]

for state (15, 6, 0) value changed from 0.0 to -1.0


 41%|█████████████████                        | 224/540 [01:52<02:38,  1.99it/s]

for state (13, 5, 0) value changed from 1.0 to 2.0


 42%|█████████████████                        | 225/540 [01:52<02:38,  1.99it/s]

for state (11, 10, 0) value changed from 0.0 to -1.0


 42%|█████████████████▏                       | 226/540 [01:53<02:37,  1.99it/s]

for state (14, 10, 0) value changed from -2.0 to -3.0


 42%|█████████████████▎                       | 228/540 [01:54<02:36,  1.99it/s]

for state (11, 6, 0) value changed from 0.0 to 1.0


 42%|█████████████████▍                       | 229/540 [01:54<02:35,  1.99it/s]

for state (14, 9, 0) value changed from -1.0 to 0.0


 43%|█████████████████▍                       | 230/540 [01:55<02:35,  1.99it/s]

for state (15, 7, 0) value changed from 0.0 to -1.0


 43%|█████████████████▌                       | 232/540 [01:56<02:34,  1.99it/s]

for state (12, 3, 0) value changed from 0.0 to 1.0


 43%|█████████████████▋                       | 233/540 [01:56<02:34,  1.99it/s]

for state (19, 9, 0) value changed from 1.0 to 2.0


 43%|█████████████████▊                       | 234/540 [01:57<02:33,  1.99it/s]

for state (19, 2, 1) value changed from 0.0 to 4.0


 44%|█████████████████▊                       | 235/540 [01:57<02:33,  1.99it/s]

for state (10, 6, 0) value changed from 0.0 to 7.0


 44%|█████████████████▉                       | 236/540 [01:58<02:32,  1.99it/s]

for state (21, 9, 1) value changed from 7.0 to 8.0


 44%|█████████████████▉                       | 237/540 [01:58<02:31,  1.99it/s]

for state (15, 7, 0) value changed from -1.0 to -2.0


 44%|██████████████████                       | 238/540 [01:59<02:31,  1.99it/s]

for state (5, 10, 0) value changed from -2.0 to -1.0


 44%|██████████████████▏                      | 239/540 [01:59<02:31,  1.99it/s]

for state (14, 9, 0) value changed from 0.0 to -1.0


 44%|██████████████████▏                      | 240/540 [02:00<02:30,  1.99it/s]

for state (8, 10, 0) value changed from 0.0 to -1.0


 45%|██████████████████▎                      | 241/540 [02:00<02:29,  1.99it/s]

for state (17, 10, 1) value changed from 0.0 to -3.0


 45%|██████████████████▎                      | 242/540 [02:01<02:29,  1.99it/s]

for state (7, 6, 0) value changed from 0.0 to -1.0


 45%|██████████████████▍                      | 243/540 [02:01<02:28,  1.99it/s]

for state (16, 10, 0) value changed from 1.0 to 2.0


 45%|██████████████████▌                      | 245/540 [02:02<02:28,  1.99it/s]

for state (14, 10, 0) value changed from -3.0 to -4.0


 46%|██████████████████▊                      | 248/540 [02:04<02:26,  1.99it/s]

for state (14, 5, 0) value changed from 0.0 to -1.0


 46%|██████████████████▉                      | 249/540 [02:04<02:26,  1.99it/s]

for state (20, 7, 0) value changed from 14.0 to 15.0


 46%|███████████████████                      | 251/540 [02:05<02:24,  1.99it/s]

for state (19, 6, 0) value changed from -1.0 to -2.0


 47%|███████████████████▏                     | 252/540 [02:06<02:24,  1.99it/s]

for state (20, 2, 0) value changed from 1.0 to 2.0


 47%|███████████████████▏                     | 253/540 [02:06<02:24,  1.99it/s]

for state (17, 10, 0) value changed from -1.0 to 6.0


 47%|███████████████████▍                     | 256/540 [02:08<02:22,  1.99it/s]

for state (17, 3, 0) value changed from 0.0 to 1.0


 48%|███████████████████▌                     | 257/540 [02:08<02:22,  1.99it/s]

for state (19, 6, 0) value changed from -2.0 to -1.0


 48%|███████████████████▌                     | 258/540 [02:09<02:21,  1.99it/s]

for state (16, 10, 0) value changed from 2.0 to 1.0


 48%|███████████████████▋                     | 259/540 [02:09<02:20,  1.99it/s]

for state (20, 7, 0) value changed from 15.0 to 16.0


 48%|███████████████████▋                     | 260/540 [02:10<02:20,  1.99it/s]

for state (10, 3, 0) value changed from 0.0 to -1.0


 49%|███████████████████▉                     | 263/540 [02:11<02:18,  1.99it/s]

for state (17, 10, 0) value changed from 6.0 to -1.0


 49%|████████████████████                     | 264/540 [02:12<02:18,  1.99it/s]

for state (4, 10, 0) value changed from 0.0 to 1.0


 49%|████████████████████                     | 265/540 [02:12<02:17,  1.99it/s]

for state (15, 2, 0) value changed from 12.0 to 13.0


 49%|████████████████████▎                    | 267/540 [02:13<02:17,  1.99it/s]

for state (17, 6, 1) value changed from 0.0 to -1.0


 50%|████████████████████▍                    | 269/540 [02:14<02:15,  1.99it/s]

for state (18, 9, 1) value changed from 0.0 to -1.0


 50%|████████████████████▌                    | 270/540 [02:15<02:15,  1.99it/s]

for state (14, 10, 0) value changed from -4.0 to -5.0


 50%|████████████████████▌                    | 271/540 [02:15<02:14,  1.99it/s]

for state (20, 8, 0) value changed from 6.0 to 5.0


 51%|████████████████████▋                    | 273/540 [02:16<02:13,  2.00it/s]

for state (7, 2, 0) value changed from 12.0 to -1.0


 51%|████████████████████▉                    | 275/540 [02:17<02:12,  2.00it/s]

for state (5, 7, 0) value changed from 0.0 to 1.0


 51%|████████████████████▉                    | 276/540 [02:18<02:12,  2.00it/s]

for state (20, 3, 0) value changed from 1.0 to 2.0


 51%|█████████████████████                    | 277/540 [02:18<02:11,  2.00it/s]

for state (17, 9, 1) value changed from 0.0 to 8.0


 51%|█████████████████████                    | 278/540 [02:19<02:11,  2.00it/s]

for state (14, 2, 0) value changed from 5.0 to 6.0


 52%|█████████████████████▏                   | 279/540 [02:19<02:10,  2.00it/s]

for state (8, 7, 0) value changed from -1.0 to 1.0


 52%|█████████████████████▎                   | 280/540 [02:20<02:10,  2.00it/s]

for state (17, 8, 0) value changed from -1.0 to 0.0


 52%|█████████████████████▎                   | 281/540 [02:20<02:09,  2.00it/s]

for state (17, 6, 0) value changed from 0.0 to 1.0


 52%|█████████████████████▍                   | 282/540 [02:21<02:09,  1.99it/s]

for state (20, 7, 0) value changed from 16.0 to 17.0


 52%|█████████████████████▍                   | 283/540 [02:21<02:08,  1.99it/s]

for state (10, 10, 0) value changed from 0.0 to -1.0


 53%|█████████████████████▌                   | 284/540 [02:22<02:08,  1.99it/s]

for state (15, 2, 0) value changed from 13.0 to 12.0


 53%|█████████████████████▋                   | 285/540 [02:22<02:07,  1.99it/s]

for state (17, 4, 1) value changed from 0.0 to -1.0


 53%|█████████████████████▊                   | 287/540 [02:23<02:06,  1.99it/s]

for state (14, 4, 0) value changed from 3.0 to 4.0


 54%|█████████████████████▉                   | 289/540 [02:24<02:06,  1.99it/s]

for state (11, 3, 0) value changed from 1.0 to 2.0


 54%|██████████████████████                   | 291/540 [02:25<02:04,  1.99it/s]

for state (9, 4, 0) value changed from 0.0 to 4.0


 54%|██████████████████████▏                  | 292/540 [02:26<02:04,  1.99it/s]

for state (9, 1, 0) value changed from -1.0 to -2.0


 54%|██████████████████████▏                  | 293/540 [02:26<02:03,  2.00it/s]

for state (21, 10, 1) value changed from 0.0 to -1.0


 54%|██████████████████████▎                  | 294/540 [02:27<02:03,  2.00it/s]

for state (17, 2, 0) value changed from 5.0 to 4.0


 55%|██████████████████████▍                  | 295/540 [02:27<02:02,  2.00it/s]

for state (18, 2, 1) value changed from 0.0 to -1.0


 55%|██████████████████████▍                  | 296/540 [02:28<02:02,  2.00it/s]

for state (17, 3, 1) value changed from 0.0 to 1.0


 55%|██████████████████████▌                  | 297/540 [02:28<02:01,  2.00it/s]

for state (10, 2, 0) value changed from 0.0 to 6.0


 55%|██████████████████████▋                  | 298/540 [02:29<02:01,  2.00it/s]

for state (14, 3, 0) value changed from 0.0 to 6.0


 55%|██████████████████████▋                  | 299/540 [02:29<02:00,  2.00it/s]

for state (17, 7, 0) value changed from 2.0 to -1.0


 56%|███████████████████████                  | 304/540 [02:32<01:58,  1.99it/s]

for state (16, 6, 0) value changed from -1.0 to -2.0


 57%|███████████████████████▏                 | 306/540 [02:33<01:57,  1.99it/s]

for state (19, 5, 0) value changed from 0.0 to -1.0


 57%|███████████████████████▎                 | 307/540 [02:33<01:56,  1.99it/s]

for state (6, 10, 0) value changed from 0.0 to 1.0


 57%|███████████████████████▍                 | 309/540 [02:34<01:55,  1.99it/s]

for state (17, 9, 0) value changed from 0.0 to -1.0


 57%|███████████████████████▌                 | 310/540 [02:35<01:55,  1.99it/s]

for state (12, 8, 1) value changed from 0.0 to 1.0


 58%|███████████████████████▌                 | 311/540 [02:35<01:54,  1.99it/s]

for state (21, 2, 1) value changed from 4.0 to 5.0


 58%|███████████████████████▋                 | 312/540 [02:36<01:54,  1.99it/s]

for state (12, 5, 0) value changed from -1.0 to 0.0


 58%|███████████████████████▊                 | 313/540 [02:36<01:53,  1.99it/s]

for state (15, 1, 0) value changed from 0.0 to -1.0


 58%|███████████████████████▊                 | 314/540 [02:37<01:53,  1.99it/s]

for state (10, 5, 0) value changed from 0.0 to 5.0


 58%|███████████████████████▉                 | 315/540 [02:37<01:52,  1.99it/s]

for state (19, 7, 0) value changed from 0.0 to -1.0


 59%|███████████████████████▉                 | 316/540 [02:38<01:52,  1.99it/s]

for state (12, 9, 0) value changed from 6.0 to -1.0


 59%|████████████████████████                 | 317/540 [02:38<01:51,  1.99it/s]

for state (18, 1, 1) value changed from 0.0 to -1.0


 59%|████████████████████████▏                | 318/540 [02:39<01:51,  1.99it/s]

for state (13, 3, 0) value changed from 0.0 to -1.0


 59%|████████████████████████▏                | 319/540 [02:39<01:50,  1.99it/s]

for state (20, 2, 0) value changed from 2.0 to 3.0


 59%|████████████████████████▎                | 321/540 [02:40<01:49,  1.99it/s]

for state (14, 4, 0) value changed from 4.0 to 3.0


 60%|████████████████████████▍                | 322/540 [02:41<01:49,  2.00it/s]

for state (12, 5, 0) value changed from 0.0 to -1.0


 60%|████████████████████████▌                | 323/540 [02:41<01:48,  1.99it/s]

for state (11, 2, 0) value changed from 0.0 to 12.0


 60%|████████████████████████▋                | 325/540 [02:42<01:47,  1.99it/s]

for state (12, 7, 0) value changed from 1.0 to 0.0


 60%|████████████████████████▊                | 326/540 [02:43<01:47,  2.00it/s]

for state (13, 1, 0) value changed from -1.0 to -2.0


 61%|████████████████████████▊                | 327/540 [02:43<01:46,  1.99it/s]

for state (17, 3, 0) value changed from 1.0 to 0.0


 61%|████████████████████████▉                | 328/540 [02:44<01:46,  1.99it/s]

for state (10, 6, 0) value changed from 7.0 to 5.0


 61%|████████████████████████▉                | 329/540 [02:44<01:45,  1.99it/s]

for state (13, 4, 0) value changed from 0.0 to -1.0


 61%|█████████████████████████▏               | 331/540 [02:45<01:44,  2.00it/s]

for state (12, 7, 0) value changed from 0.0 to -1.0


 61%|█████████████████████████▏               | 332/540 [02:46<01:44,  2.00it/s]

for state (15, 2, 0) value changed from 12.0 to 11.0


 62%|█████████████████████████▎               | 333/540 [02:46<01:43,  2.00it/s]

for state (16, 6, 0) value changed from -2.0 to -1.0


 62%|█████████████████████████▎               | 334/540 [02:47<01:43,  2.00it/s]

for state (16, 4, 0) value changed from 0.0 to -1.0


 62%|█████████████████████████▍               | 335/540 [02:47<01:42,  1.99it/s]

for state (17, 9, 1) value changed from 8.0 to -1.0


 62%|█████████████████████████▌               | 336/540 [02:48<01:42,  1.99it/s]

for state (9, 2, 0) value changed from 0.0 to -1.0


 62%|█████████████████████████▌               | 337/540 [02:48<01:41,  1.99it/s]

for state (15, 5, 0) value changed from 0.0 to -1.0


 63%|█████████████████████████▋               | 338/540 [02:49<01:41,  1.99it/s]

for state (8, 5, 0) value changed from 0.0 to 1.0


 63%|█████████████████████████▋               | 339/540 [02:49<01:40,  1.99it/s]

for state (18, 7, 0) value changed from 3.0 to 4.0


 63%|█████████████████████████▊               | 340/540 [02:50<01:40,  1.99it/s]

for state (19, 1, 0) value changed from -1.0 to 0.0


 63%|█████████████████████████▉               | 341/540 [02:50<01:39,  2.00it/s]

for state (10, 1, 0) value changed from 0.0 to 3.0


 63%|█████████████████████████▉               | 342/540 [02:51<01:39,  1.99it/s]

for state (9, 10, 0) value changed from 0.0 to -1.0


 64%|██████████████████████████               | 343/540 [02:51<01:38,  2.00it/s]

for state (17, 10, 0) value changed from -1.0 to 6.0


 64%|██████████████████████████               | 344/540 [02:52<01:38,  2.00it/s]

for state (10, 1, 0) value changed from 3.0 to 0.0


 64%|██████████████████████████▎              | 346/540 [02:53<01:37,  1.99it/s]

for state (8, 4, 0) value changed from 0.0 to -1.0


 64%|██████████████████████████▎              | 347/540 [02:53<01:36,  1.99it/s]

for state (18, 8, 0) value changed from 1.0 to 2.0


 65%|██████████████████████████▋              | 352/540 [02:56<01:34,  2.00it/s]

for state (10, 8, 0) value changed from 6.0 to 0.0


 65%|██████████████████████████▊              | 353/540 [02:56<01:33,  1.99it/s]

for state (19, 4, 1) value changed from 0.0 to 1.0


 66%|██████████████████████████▉              | 354/540 [02:57<01:33,  1.99it/s]

for state (17, 3, 1) value changed from 1.0 to 0.0


 66%|██████████████████████████▉              | 355/540 [02:57<01:32,  1.99it/s]

for state (14, 10, 1) value changed from -2.0 to -5.0


 66%|███████████████████████████              | 356/540 [02:58<01:32,  1.99it/s]

for state (11, 7, 0) value changed from -1.0 to -2.0


 66%|███████████████████████████▏             | 358/540 [02:59<01:31,  1.99it/s]

for state (7, 9, 0) value changed from 0.0 to 1.0


 67%|███████████████████████████▎             | 360/540 [03:00<01:30,  1.99it/s]

for state (11, 7, 0) value changed from -2.0 to -3.0


 67%|███████████████████████████▍             | 362/540 [03:01<01:29,  1.99it/s]

for state (17, 10, 0) value changed from 6.0 to -1.0


 67%|███████████████████████████▋             | 364/540 [03:02<01:28,  1.99it/s]

for state (13, 2, 0) value changed from -1.0 to -2.0


 68%|███████████████████████████▋             | 365/540 [03:02<01:27,  1.99it/s]

for state (11, 8, 0) value changed from 0.0 to -1.0


 68%|███████████████████████████▊             | 367/540 [03:03<01:26,  2.00it/s]

for state (12, 1, 0) value changed from -1.0 to -2.0


 68%|███████████████████████████▉             | 368/540 [03:04<01:26,  2.00it/s]

for state (14, 10, 0) value changed from -5.0 to -4.0


 68%|████████████████████████████             | 369/540 [03:04<01:25,  2.00it/s]

for state (16, 10, 0) value changed from 1.0 to 2.0


 69%|████████████████████████████▏            | 371/540 [03:05<01:24,  2.00it/s]

for state (19, 5, 0) value changed from -1.0 to 0.0


 69%|████████████████████████████▏            | 372/540 [03:06<01:24,  1.99it/s]

for state (16, 10, 0) value changed from 2.0 to 3.0


 69%|████████████████████████████▎            | 373/540 [03:06<01:23,  1.99it/s]

for state (17, 8, 0) value changed from 0.0 to 5.0


 69%|████████████████████████████▍            | 374/540 [03:07<01:23,  1.99it/s]

for state (17, 8, 0) value changed from 5.0 to -1.0


 70%|████████████████████████████▌            | 376/540 [03:08<01:22,  1.99it/s]

for state (20, 4, 0) value changed from 1.0 to 0.0


 70%|████████████████████████████▌            | 377/540 [03:08<01:21,  1.99it/s]

for state (19, 5, 0) value changed from 0.0 to 1.0


 70%|████████████████████████████▋            | 378/540 [03:09<01:21,  1.99it/s]

for state (16, 10, 1) value changed from -1.0 to 0.0


 70%|████████████████████████████▊            | 380/540 [03:10<01:20,  1.99it/s]

for state (15, 9, 0) value changed from 6.0 to -1.0


 71%|████████████████████████████▉            | 381/540 [03:10<01:19,  1.99it/s]

for state (21, 8, 1) value changed from 4.0 to 5.0


 71%|█████████████████████████████            | 383/540 [03:11<01:18,  1.99it/s]

for state (17, 1, 0) value changed from 0.0 to -1.0


 71%|█████████████████████████████▏           | 384/540 [03:12<01:18,  1.99it/s]

for state (20, 4, 0) value changed from 0.0 to 1.0


 71%|█████████████████████████████▏           | 385/540 [03:12<01:17,  1.99it/s]

for state (14, 3, 0) value changed from 6.0 to -1.0


 71%|█████████████████████████████▎           | 386/540 [03:13<01:17,  1.99it/s]

for state (18, 10, 0) value changed from 6.0 to 7.0


 72%|█████████████████████████████▌           | 389/540 [03:14<01:15,  1.99it/s]

for state (13, 7, 0) value changed from -1.0 to -2.0


 72%|█████████████████████████████▌           | 390/540 [03:15<01:15,  1.99it/s]

for state (16, 3, 0) value changed from -1.0 to -2.0


 73%|█████████████████████████████▊           | 392/540 [03:16<01:14,  1.99it/s]

for state (12, 3, 0) value changed from 1.0 to 2.0


 73%|█████████████████████████████▉           | 394/540 [03:17<01:13,  1.99it/s]

for state (9, 9, 0) value changed from 1.0 to 2.0


 73%|█████████████████████████████▉           | 395/540 [03:17<01:12,  1.99it/s]

for state (20, 4, 1) value changed from 0.0 to -1.0


 73%|██████████████████████████████           | 396/540 [03:18<01:12,  1.99it/s]

for state (19, 6, 0) value changed from -1.0 to 0.0


 74%|██████████████████████████████▏          | 397/540 [03:18<01:11,  1.99it/s]

for state (9, 9, 0) value changed from 2.0 to 3.0


 74%|██████████████████████████████▍          | 401/540 [03:20<01:09,  1.99it/s]

for state (13, 10, 0) value changed from -1.0 to -4.0


 74%|██████████████████████████████▌          | 402/540 [03:21<01:09,  1.99it/s]

for state (9, 6, 0) value changed from 1.0 to 0.0


 75%|██████████████████████████████▋          | 404/540 [03:22<01:08,  1.99it/s]

for state (21, 9, 1) value changed from 8.0 to 9.0


 75%|██████████████████████████████▊          | 405/540 [03:22<01:07,  1.99it/s]

for state (14, 10, 0) value changed from -4.0 to -3.0


 75%|██████████████████████████████▊          | 406/540 [03:23<01:07,  1.99it/s]

for state (6, 5, 0) value changed from 0.0 to -1.0


 76%|███████████████████████████████          | 409/540 [03:24<01:05,  1.99it/s]

for state (14, 5, 1) value changed from 0.0 to 4.0


 76%|███████████████████████████████▏         | 410/540 [03:25<01:05,  1.99it/s]

for state (13, 2, 0) value changed from -2.0 to -3.0


 76%|███████████████████████████████▏         | 411/540 [03:25<01:04,  1.99it/s]

for state (20, 5, 0) value changed from 5.0 to 4.0


 76%|███████████████████████████████▎         | 413/540 [03:26<01:03,  1.99it/s]

for state (14, 10, 0) value changed from -3.0 to -4.0


 77%|███████████████████████████████▍         | 414/540 [03:27<01:03,  1.99it/s]

for state (14, 4, 0) value changed from 3.0 to 4.0


 77%|███████████████████████████████▌         | 415/540 [03:27<01:02,  1.99it/s]

for state (14, 10, 0) value changed from -4.0 to -5.0


 77%|███████████████████████████████▌         | 416/540 [03:28<01:02,  1.99it/s]

for state (18, 5, 0) value changed from 0.0 to -1.0


 77%|███████████████████████████████▋         | 417/540 [03:28<01:01,  1.99it/s]

for state (17, 9, 0) value changed from -1.0 to -2.0


 78%|███████████████████████████████▊         | 419/540 [03:29<01:00,  1.99it/s]

for state (15, 7, 0) value changed from -2.0 to -3.0


 78%|████████████████████████████████         | 422/540 [03:31<00:59,  1.99it/s]

for state (7, 2, 0) value changed from -1.0 to 12.0


 78%|████████████████████████████████         | 423/540 [03:31<00:58,  1.99it/s]

for state (11, 4, 0) value changed from 1.0 to 0.0


 79%|████████████████████████████████▏        | 424/540 [03:32<00:58,  1.99it/s]

for state (15, 7, 0) value changed from -3.0 to -2.0


 79%|████████████████████████████████▎        | 426/540 [03:33<00:57,  1.99it/s]

for state (14, 3, 1) value changed from 0.0 to -1.0


 79%|████████████████████████████████▍        | 427/540 [03:33<00:56,  1.99it/s]

for state (13, 2, 0) value changed from -3.0 to -2.0


 79%|████████████████████████████████▍        | 428/540 [03:34<00:56,  1.99it/s]

for state (16, 10, 0) value changed from 3.0 to 2.0


 79%|████████████████████████████████▌        | 429/540 [03:34<00:55,  1.99it/s]

for state (12, 4, 0) value changed from 0.0 to 1.0


 80%|████████████████████████████████▋        | 430/540 [03:35<00:55,  1.99it/s]

for state (21, 2, 1) value changed from 5.0 to 6.0


 80%|████████████████████████████████▋        | 431/540 [03:35<00:54,  1.99it/s]

for state (13, 2, 0) value changed from -2.0 to -1.0


 80%|████████████████████████████████▉        | 434/540 [03:37<00:53,  1.99it/s]

for state (11, 4, 0) value changed from 0.0 to 1.0


 81%|█████████████████████████████████        | 436/540 [03:38<00:52,  1.99it/s]

for state (16, 9, 0) value changed from 0.0 to 2.0


 81%|█████████████████████████████████▏       | 437/540 [03:38<00:51,  1.99it/s]

for state (13, 10, 0) value changed from -4.0 to -1.0


 81%|█████████████████████████████████▎       | 438/540 [03:39<00:51,  1.99it/s]

for state (13, 8, 0) value changed from -1.0 to -2.0


 81%|█████████████████████████████████▍       | 440/540 [03:40<00:50,  1.99it/s]

for state (18, 6, 0) value changed from 5.0 to 6.0


 82%|█████████████████████████████████▍       | 441/540 [03:40<00:49,  1.99it/s]

for state (21, 2, 1) value changed from 6.0 to 7.0


 82%|█████████████████████████████████▌       | 442/540 [03:41<00:49,  1.99it/s]

for state (10, 8, 0) value changed from 0.0 to 2.0


 82%|█████████████████████████████████▋       | 444/540 [03:42<00:48,  1.99it/s]

for state (16, 3, 0) value changed from -2.0 to -1.0


 82%|█████████████████████████████████▊       | 445/540 [03:42<00:47,  2.00it/s]

for state (10, 9, 0) value changed from 1.0 to 0.0


 83%|██████████████████████████████████       | 448/540 [03:44<00:46,  1.99it/s]

for state (12, 7, 0) value changed from -1.0 to 0.0


 83%|██████████████████████████████████       | 449/540 [03:44<00:45,  1.99it/s]

for state (13, 10, 1) value changed from 0.0 to -1.0


 84%|██████████████████████████████████▏      | 451/540 [03:45<00:44,  1.99it/s]

for state (21, 1, 1) value changed from 0.0 to 1.0


 84%|██████████████████████████████████▍      | 453/540 [03:46<00:43,  1.99it/s]

for state (15, 9, 1) value changed from 0.0 to -1.0


 84%|██████████████████████████████████▍      | 454/540 [03:47<00:43,  1.99it/s]

for state (14, 10, 0) value changed from -5.0 to -4.0


 84%|██████████████████████████████████▌      | 456/540 [03:48<00:42,  1.99it/s]

for state (16, 10, 0) value changed from 2.0 to 3.0


 85%|██████████████████████████████████▋      | 457/540 [03:48<00:41,  1.99it/s]

for state (13, 5, 0) value changed from 2.0 to 3.0


 85%|██████████████████████████████████▊      | 458/540 [03:49<00:41,  2.00it/s]

for state (14, 4, 0) value changed from 4.0 to 3.0


 85%|██████████████████████████████████▊      | 459/540 [03:49<00:40,  2.00it/s]

for state (15, 4, 0) value changed from 0.0 to 1.0


 85%|██████████████████████████████████▉      | 460/540 [03:50<00:40,  2.00it/s]

for state (9, 5, 0) value changed from 0.0 to -1.0


 85%|███████████████████████████████████      | 461/540 [03:50<00:39,  2.00it/s]

for state (20, 3, 0) value changed from 2.0 to 3.0


 86%|███████████████████████████████████      | 462/540 [03:51<00:39,  2.00it/s]

for state (16, 10, 0) value changed from 3.0 to 2.0


 86%|███████████████████████████████████▏     | 464/540 [03:52<00:38,  2.00it/s]

for state (9, 10, 0) value changed from -1.0 to -2.0


 86%|███████████████████████████████████▎     | 465/540 [03:52<00:37,  2.00it/s]

for state (12, 2, 0) value changed from 5.0 to -1.0


 86%|███████████████████████████████████▍     | 466/540 [03:53<00:37,  1.99it/s]

for state (6, 4, 0) value changed from 0.0 to -1.0


 86%|███████████████████████████████████▍     | 467/540 [03:53<00:36,  1.99it/s]

for state (13, 7, 0) value changed from -2.0 to -3.0


 87%|███████████████████████████████████▌     | 468/540 [03:54<00:36,  1.99it/s]

for state (19, 3, 0) value changed from 0.0 to -1.0


 87%|███████████████████████████████████▋     | 470/540 [03:55<00:35,  1.99it/s]

for state (14, 1, 0) value changed from 0.0 to -1.0


 87%|███████████████████████████████████▊     | 471/540 [03:55<00:34,  1.99it/s]

for state (9, 10, 0) value changed from -2.0 to -1.0


 87%|███████████████████████████████████▊     | 472/540 [03:56<00:34,  1.99it/s]

for state (21, 4, 1) value changed from 0.0 to 1.0


 88%|███████████████████████████████████▉     | 473/540 [03:56<00:33,  1.99it/s]

for state (12, 4, 0) value changed from 1.0 to 2.0


 88%|███████████████████████████████████▉     | 474/540 [03:57<00:33,  1.99it/s]

for state (11, 5, 0) value changed from 0.0 to -1.0


 88%|████████████████████████████████████     | 475/540 [03:57<00:32,  1.99it/s]

for state (10, 4, 0) value changed from 0.0 to 1.0


 88%|████████████████████████████████████▏    | 476/540 [03:58<00:32,  1.99it/s]

for state (13, 6, 0) value changed from 0.0 to -1.0


 89%|████████████████████████████████████▎    | 478/540 [03:59<00:31,  1.99it/s]

for state (11, 9, 0) value changed from 0.0 to 2.0


 89%|████████████████████████████████████▎    | 479/540 [03:59<00:30,  1.99it/s]

for state (14, 3, 0) value changed from -1.0 to 6.0


 89%|████████████████████████████████████▍    | 480/540 [04:00<00:30,  1.99it/s]

for state (18, 10, 0) value changed from 7.0 to 8.0


 89%|████████████████████████████████████▌    | 482/540 [04:01<00:29,  1.99it/s]

for state (15, 8, 0) value changed from -1.0 to 0.0


 89%|████████████████████████████████████▋    | 483/540 [04:02<00:28,  1.99it/s]

for state (14, 1, 1) value changed from 0.0 to -1.0


 90%|█████████████████████████████████████    | 488/540 [04:04<00:26,  2.00it/s]

for state (9, 8, 0) value changed from 0.0 to 1.0


 91%|█████████████████████████████████████▏   | 490/540 [04:05<00:25,  1.99it/s]

for state (18, 10, 0) value changed from 8.0 to 7.0


 91%|█████████████████████████████████████▎   | 491/540 [04:06<00:24,  1.99it/s]

for state (11, 7, 0) value changed from -3.0 to -4.0


 91%|█████████████████████████████████████▎   | 492/540 [04:06<00:24,  1.99it/s]

for state (16, 5, 0) value changed from -1.0 to 0.0


 92%|█████████████████████████████████████▋   | 496/540 [04:08<00:22,  2.00it/s]

for state (20, 1, 0) value changed from 3.0 to 4.0


 92%|█████████████████████████████████████▊   | 498/540 [04:09<00:21,  2.00it/s]

for state (12, 3, 0) value changed from 2.0 to 1.0


 92%|█████████████████████████████████████▉   | 499/540 [04:10<00:20,  2.00it/s]

for state (17, 7, 0) value changed from -1.0 to 4.0


 93%|█████████████████████████████████████▉   | 500/540 [04:10<00:20,  2.00it/s]

for state (11, 10, 0) value changed from -1.0 to 0.0


 93%|██████████████████████████████████████   | 501/540 [04:11<00:19,  1.99it/s]

for state (16, 9, 0) value changed from 2.0 to -1.0


 93%|██████████████████████████████████████   | 502/540 [04:11<00:19,  1.99it/s]

for state (15, 10, 0) value changed from -1.0 to 2.0


 93%|██████████████████████████████████████▏  | 503/540 [04:12<00:18,  2.00it/s]

for state (10, 3, 0) value changed from -1.0 to 0.0


 93%|██████████████████████████████████████▎  | 504/540 [04:12<00:18,  1.99it/s]

for state (14, 4, 0) value changed from 3.0 to 2.0


 94%|██████████████████████████████████████▍  | 506/540 [04:13<00:17,  1.99it/s]

for state (14, 10, 0) value changed from -4.0 to -5.0


 94%|██████████████████████████████████████▍  | 507/540 [04:14<00:16,  1.99it/s]

for state (13, 8, 0) value changed from -2.0 to -3.0


 94%|██████████████████████████████████████▌  | 508/540 [04:14<00:16,  1.99it/s]

for state (15, 10, 1) value changed from 0.0 to 1.0


 94%|██████████████████████████████████████▋  | 509/540 [04:15<00:15,  1.99it/s]

for state (7, 7, 0) value changed from 0.0 to -1.0


 95%|██████████████████████████████████████▊  | 511/540 [04:16<00:14,  1.99it/s]

for state (12, 4, 0) value changed from 2.0 to 1.0


 95%|██████████████████████████████████████▊  | 512/540 [04:16<00:14,  1.99it/s]

for state (16, 10, 0) value changed from 2.0 to 1.0


 95%|██████████████████████████████████████▉  | 513/540 [04:17<00:13,  1.99it/s]

for state (20, 5, 0) value changed from 4.0 to 5.0


 95%|███████████████████████████████████████  | 514/540 [04:17<00:13,  2.00it/s]

for state (9, 10, 0) value changed from -1.0 to -2.0


 95%|███████████████████████████████████████  | 515/540 [04:18<00:12,  2.00it/s]

for state (15, 10, 0) value changed from 2.0 to -1.0


 96%|███████████████████████████████████████▎ | 517/540 [04:19<00:11,  2.00it/s]

for state (19, 3, 0) value changed from -1.0 to 0.0


 96%|███████████████████████████████████████▎ | 518/540 [04:19<00:11,  2.00it/s]

for state (16, 8, 0) value changed from -1.0 to 5.0


 96%|███████████████████████████████████████▍ | 519/540 [04:20<00:10,  2.00it/s]

for state (19, 4, 0) value changed from 0.0 to 1.0


 96%|███████████████████████████████████████▍ | 520/540 [04:20<00:10,  2.00it/s]

for state (15, 9, 1) value changed from -1.0 to -2.0


 96%|███████████████████████████████████████▌ | 521/540 [04:21<00:09,  2.00it/s]

for state (15, 4, 0) value changed from 1.0 to 0.0


 97%|███████████████████████████████████████▋ | 522/540 [04:21<00:09,  2.00it/s]

for state (14, 2, 0) value changed from 6.0 to 5.0


 97%|███████████████████████████████████████▋ | 523/540 [04:22<00:08,  2.00it/s]

for state (14, 6, 0) value changed from 7.0 to 6.0


 97%|███████████████████████████████████████▊ | 525/540 [04:23<00:07,  2.00it/s]

for state (10, 2, 0) value changed from 6.0 to 11.0


 98%|████████████████████████████████████████ | 527/540 [04:24<00:06,  1.99it/s]

for state (15, 8, 1) value changed from 0.0 to -1.0


 98%|████████████████████████████████████████▏| 529/540 [04:25<00:05,  1.99it/s]

for state (9, 1, 0) value changed from -2.0 to -3.0


 98%|████████████████████████████████████████▏| 530/540 [04:25<00:05,  1.99it/s]

for state (12, 3, 0) value changed from 1.0 to 0.0


 98%|████████████████████████████████████████▎| 531/540 [04:26<00:04,  1.99it/s]

for state (12, 4, 0) value changed from 1.0 to 2.0


 99%|████████████████████████████████████████▍| 532/540 [04:26<00:04,  1.99it/s]

for state (14, 7, 0) value changed from -1.0 to 0.0


 99%|████████████████████████████████████████▌| 534/540 [04:27<00:03,  1.99it/s]

for state (19, 8, 0) value changed from 0.0 to -1.0


 99%|████████████████████████████████████████▌| 535/540 [04:28<00:02,  1.99it/s]

for state (21, 10, 1) value changed from -1.0 to 7.0


 99%|████████████████████████████████████████▋| 536/540 [04:28<00:02,  1.99it/s]

for state (7, 4, 0) value changed from 0.0 to -1.0


 99%|████████████████████████████████████████▊| 537/540 [04:29<00:01,  1.99it/s]

for state (9, 10, 0) value changed from -2.0 to -1.0


100%|████████████████████████████████████████▊| 538/540 [04:29<00:01,  1.99it/s]

for state (20, 4, 0) value changed from 1.0 to 2.0


100%|████████████████████████████████████████▉| 539/540 [04:30<00:00,  1.99it/s]

for state (13, 2, 0) value changed from -1.0 to -2.0


100%|█████████████████████████████████████████| 540/540 [04:30<00:00,  2.00it/s]


for state (15, 4, 0) value changed from 0.0 to -1.0


  1%|▏                                          | 3/540 [00:03<11:11,  1.25s/it]

for state (6, 10, 0) best action changed to 0, statePrime (6, 10, 0)=0.0, reward -1.0


  1%|▎                                          | 4/540 [00:04<11:10,  1.25s/it]

for state (7, 10, 0) best action changed to 0, statePrime (7, 10, 0)=0.0, reward 1.0


  1%|▌                                          | 7/540 [00:08<11:07,  1.25s/it]

for state (12, 6, 0) best action changed to 0, statePrime (12, 6, 0)=3.0, reward -1.0


  2%|▋                                          | 9/540 [00:11<11:05,  1.25s/it]

for state (14, 10, 0) best action changed to 1, statePrime (23, 10, 0)=-1.0, reward -1.0


  3%|█▎                                        | 17/540 [00:21<10:55,  1.25s/it]

for state (16, 8, 0) best action changed to 0, statePrime (16, 8, 0)=4.0, reward -1.0


  4%|█▋                                        | 21/540 [00:26<10:50,  1.25s/it]

for state (10, 10, 0) best action changed to 0, statePrime (10, 10, 0)=-2.0, reward -1.0


  7%|███                                       | 39/540 [00:48<10:28,  1.25s/it]

for state (9, 4, 0) best action changed to 0, statePrime (9, 4, 0)=3.0, reward -1.0


 11%|████▋                                     | 60/540 [01:15<10:02,  1.26s/it]

for state (20, 10, 0) best action changed to 0, statePrime (20, 10, 0)=0.0, reward 1.0


 14%|█████▉                                    | 76/540 [01:35<09:41,  1.25s/it]

for state (5, 9, 0) best action changed to 0, statePrime (5, 9, 0)=5.0, reward -1.0


 17%|███████                                   | 91/540 [01:54<09:23,  1.25s/it]

for state (11, 9, 0) best action changed to 0, statePrime (11, 9, 0)=1.0, reward -1.0


 17%|███████▏                                  | 92/540 [01:55<09:21,  1.25s/it]

for state (15, 10, 0) best action changed to 0, statePrime (15, 10, 0)=0.0, reward 1.0


 18%|███████▌                                  | 97/540 [02:01<09:15,  1.25s/it]

for state (20, 10, 0) best action changed to 1, statePrime (30, 10, 0)=-1.0, reward -1.0


 19%|███████▋                                 | 101/540 [02:06<09:10,  1.25s/it]

for state (10, 10, 0) best action changed to 1, statePrime (16, 10, 0)=1.0, reward 0.0


 19%|███████▋                                 | 102/540 [02:07<09:08,  1.25s/it]

for state (15, 5, 0) best action changed to 0, statePrime (15, 5, 0)=0.0, reward 1.0


 19%|███████▊                                 | 103/540 [02:09<09:07,  1.25s/it]

for state (16, 6, 0) best action changed to 1, statePrime (26, 6, 0)=-1.0, reward -1.0


 20%|████████                                 | 107/540 [02:14<09:02,  1.25s/it]

for state (17, 8, 1) best action changed to 1, statePrime (19, 8, 1)=0.0, reward 0.0


 20%|████████▎                                | 109/540 [02:16<09:00,  1.25s/it]

for state (19, 1, 0) best action changed to 1, statePrime (21, 1, 0)=0.0, reward 0.0


 20%|████████▎                                | 110/540 [02:17<08:59,  1.25s/it]

for state (15, 8, 0) best action changed to 0, statePrime (15, 8, 0)=1.0, reward 1.0


 21%|████████▌                                | 112/540 [02:20<08:56,  1.25s/it]

for state (13, 7, 0) best action changed to 1, statePrime (20, 7, 0)=17.0, reward 0.0


 21%|████████▋                                | 114/540 [02:22<08:53,  1.25s/it]

for state (12, 4, 0) best action changed to 1, statePrime (14, 4, 0)=2.0, reward 0.0


 21%|████████▋                                | 115/540 [02:24<08:53,  1.25s/it]

for state (12, 3, 0) best action changed to 1, statePrime (18, 3, 0)=6.0, reward 0.0


 22%|█████████                                | 119/540 [02:29<08:47,  1.25s/it]

for state (19, 6, 0) best action changed to 1, statePrime (23, 6, 0)=-1.0, reward -1.0


 24%|█████████▋                               | 127/540 [02:39<08:37,  1.25s/it]

for state (12, 4, 0) best action changed to 0, statePrime (12, 4, 0)=3.0, reward 1.0


 24%|█████████▋                               | 128/540 [02:40<08:36,  1.25s/it]

for state (11, 7, 0) best action changed to 1, statePrime (15, 7, 0)=-2.0, reward 0.0


 24%|█████████▊                               | 130/540 [02:42<08:34,  1.25s/it]

for state (7, 8, 0) best action changed to 0, statePrime (7, 8, 0)=1.0, reward 1.0


 25%|██████████▎                              | 136/540 [02:50<08:26,  1.25s/it]

for state (14, 7, 0) best action changed to 0, statePrime (14, 7, 0)=1.0, reward 1.0


 25%|██████████▍                              | 137/540 [02:51<08:25,  1.25s/it]

for state (20, 10, 1) best action changed to 0, statePrime (20, 10, 1)=0.0, reward 1.0


 26%|██████████▋                              | 140/540 [02:55<08:21,  1.25s/it]

for state (18, 6, 0) best action changed to 1, statePrime (20, 6, 0)=7.0, reward 0.0


 26%|██████████▊                              | 143/540 [02:59<08:17,  1.25s/it]

for state (9, 1, 0) best action changed to 1, statePrime (18, 1, 0)=-1.0, reward 0.0


 27%|██████████▉                              | 144/540 [03:00<08:16,  1.25s/it]

for state (19, 10, 0) best action changed to 0, statePrime (19, 10, 0)=0.0, reward 1.0


 27%|███████████                              | 145/540 [03:01<08:15,  1.25s/it]

for state (16, 5, 1) best action changed to 1, statePrime (20, 5, 1)=0.0, reward 0.0


 27%|███████████                              | 146/540 [03:03<08:14,  1.25s/it]

for state (15, 7, 1) best action changed to 0, statePrime (15, 7, 1)=1.0, reward 1.0


 28%|███████████▌                             | 152/540 [03:10<08:06,  1.25s/it]

for state (15, 10, 0) best action changed to 1, statePrime (20, 10, 0)=-1.0, reward 0.0


 29%|███████████▉                             | 157/540 [03:16<08:01,  1.26s/it]

for state (12, 10, 1) best action changed to 0, statePrime (12, 10, 1)=1.0, reward 1.0


 30%|████████████▍                            | 163/540 [03:24<07:52,  1.25s/it]

for state (14, 7, 0) best action changed to 1, statePrime (23, 7, 0)=-1.0, reward -1.0


 31%|████████████▌                            | 165/540 [03:26<07:49,  1.25s/it]

for state (7, 8, 0) best action changed to 1, statePrime (12, 8, 0)=0.0, reward 0.0


 32%|████████████▉                            | 171/540 [03:34<07:42,  1.25s/it]

for state (19, 10, 0) best action changed to 1, statePrime (24, 10, 0)=-1.0, reward -1.0


 32%|█████████████                            | 172/540 [03:35<07:41,  1.25s/it]

for state (20, 10, 0) best action changed to 0, statePrime (20, 10, 0)=0.0, reward 1.0


 32%|█████████████▎                           | 175/540 [03:39<07:37,  1.25s/it]

for state (21, 5, 1) best action changed to 0, statePrime (21, 5, 1)=1.0, reward 1.0


 33%|█████████████▌                           | 179/540 [03:44<07:32,  1.25s/it]

for state (13, 8, 1) best action changed to 1, statePrime (14, 8, 1)=1.0, reward 0.0


 34%|█████████████▉                           | 184/540 [03:50<07:26,  1.25s/it]

for state (20, 10, 0) best action changed to 1, statePrime (30, 10, 0)=-1.0, reward -1.0


 35%|██████████████▎                          | 188/540 [03:55<07:21,  1.25s/it]

for state (13, 8, 0) best action changed to 1, statePrime (20, 8, 0)=5.0, reward 0.0


 35%|██████████████▍                          | 190/540 [03:58<07:18,  1.25s/it]

for state (12, 6, 0) best action changed to 1, statePrime (18, 6, 0)=6.0, reward 0.0


 36%|██████████████▋                          | 194/540 [04:03<07:13,  1.25s/it]

for state (14, 9, 0) best action changed to 0, statePrime (14, 9, 0)=0.0, reward 1.0


 37%|███████████████▏                         | 200/540 [04:10<07:06,  1.25s/it]

for state (18, 6, 0) best action changed to 0, statePrime (18, 6, 0)=7.0, reward 1.0


 37%|███████████████▎                         | 201/540 [04:11<07:04,  1.25s/it]

for state (13, 1, 0) best action changed to 1, statePrime (14, 1, 0)=-1.0, reward 0.0


 38%|███████████████▍                         | 204/540 [04:15<07:01,  1.25s/it]

for state (15, 5, 0) best action changed to 1, statePrime (17, 5, 0)=0.0, reward 0.0


 38%|███████████████▌                         | 205/540 [04:16<07:00,  1.25s/it]

for state (20, 10, 0) best action changed to 0, statePrime (20, 10, 0)=0.0, reward 1.0


 39%|███████████████▊                         | 208/540 [04:20<06:56,  1.25s/it]

for state (11, 2, 0) best action changed to 0, statePrime (11, 2, 0)=13.0, reward 1.0


 39%|████████████████                         | 212/540 [04:25<06:51,  1.25s/it]

for state (16, 3, 1) best action changed to 1, statePrime (19, 3, 1)=0.0, reward 0.0


 40%|████████████████▏                        | 214/540 [04:28<06:48,  1.25s/it]

for state (16, 3, 0) best action changed to 1, statePrime (22, 3, 0)=-1.0, reward -1.0


 40%|████████████████▎                        | 215/540 [04:29<06:47,  1.25s/it]

for state (11, 1, 0) best action changed to 1, statePrime (12, 1, 0)=-2.0, reward 0.0


 40%|████████████████▌                        | 218/540 [04:33<06:43,  1.25s/it]

for state (20, 10, 0) best action changed to 1, statePrime (28, 10, 0)=-1.0, reward -1.0


 41%|████████████████▋                        | 220/540 [04:35<06:41,  1.25s/it]

for state (13, 2, 0) best action changed to 1, statePrime (23, 2, 0)=-1.0, reward -1.0


 41%|████████████████▉                        | 223/540 [04:39<06:37,  1.25s/it]

for state (15, 6, 0) best action changed to 1, statePrime (25, 6, 0)=-1.0, reward -1.0


 42%|█████████████████▏                       | 227/540 [04:44<06:32,  1.25s/it]

for state (20, 10, 0) best action changed to 0, statePrime (20, 10, 0)=0.0, reward 1.0


 42%|█████████████████▎                       | 228/540 [04:45<06:31,  1.25s/it]

for state (11, 6, 0) best action changed to 1, statePrime (14, 6, 0)=6.0, reward 0.0


 42%|█████████████████▍                       | 229/540 [04:47<06:30,  1.25s/it]

for state (14, 9, 0) best action changed to 1, statePrime (21, 9, 0)=0.0, reward 0.0


 43%|█████████████████▍                       | 230/540 [04:48<06:28,  1.25s/it]

for state (15, 7, 0) best action changed to 1, statePrime (16, 7, 0)=-1.0, reward 0.0


 43%|█████████████████▌                       | 232/540 [04:50<06:26,  1.25s/it]

for state (12, 3, 0) best action changed to 0, statePrime (12, 3, 0)=1.0, reward 1.0


 44%|██████████████████▏                      | 239/540 [04:59<06:17,  1.25s/it]

for state (14, 9, 0) best action changed to 0, statePrime (14, 9, 0)=0.0, reward 1.0


 44%|██████████████████▏                      | 240/540 [05:00<06:16,  1.25s/it]

for state (8, 10, 0) best action changed to 1, statePrime (16, 10, 0)=1.0, reward 0.0


 45%|██████████████████▎                      | 241/540 [05:02<06:14,  1.25s/it]

for state (17, 10, 1) best action changed to 0, statePrime (17, 10, 1)=-4.0, reward -1.0


 45%|██████████████████▎                      | 242/540 [05:03<06:13,  1.25s/it]

for state (7, 6, 0) best action changed to 1, statePrime (17, 6, 0)=1.0, reward 0.0


 45%|██████████████████▌                      | 244/540 [05:05<06:11,  1.25s/it]

for state (19, 10, 0) best action changed to 0, statePrime (19, 10, 0)=0.0, reward 1.0


 46%|██████████████████▋                      | 246/540 [05:08<06:08,  1.25s/it]

for state (13, 10, 0) best action changed to 0, statePrime (13, 10, 0)=0.0, reward 1.0


 46%|██████████████████▊                      | 247/540 [05:09<06:07,  1.25s/it]

for state (13, 10, 0) best action changed to 1, statePrime (19, 10, 0)=-1.0, reward 0.0


 46%|██████████████████▊                      | 248/540 [05:10<06:06,  1.26s/it]

for state (14, 5, 0) best action changed to 0, statePrime (14, 5, 0)=0.0, reward 1.0


 46%|██████████████████▉                      | 250/540 [05:13<06:03,  1.25s/it]

for state (20, 10, 0) best action changed to 1, statePrime (29, 10, 0)=-1.0, reward -1.0


 48%|███████████████████▌                     | 257/540 [05:22<05:54,  1.25s/it]

for state (19, 6, 0) best action changed to 0, statePrime (19, 6, 0)=1.0, reward 1.0


 48%|███████████████████▌                     | 258/540 [05:23<05:53,  1.25s/it]

for state (16, 10, 0) best action changed to 1, statePrime (18, 10, 0)=7.0, reward 0.0


 48%|███████████████████▋                     | 260/540 [05:25<05:50,  1.25s/it]

for state (10, 3, 0) best action changed to 1, statePrime (19, 3, 0)=0.0, reward 0.0


 49%|███████████████████▉                     | 262/540 [05:28<05:48,  1.25s/it]

for state (16, 7, 0) best action changed to 0, statePrime (16, 7, 0)=0.0, reward 1.0


 49%|████████████████████▎                    | 267/540 [05:34<05:42,  1.25s/it]

for state (17, 6, 1) best action changed to 1, statePrime (17, 6, 0)=1.0, reward 0.0


 50%|████████████████████▍                    | 269/540 [05:37<05:39,  1.25s/it]

for state (18, 9, 1) best action changed to 1, statePrime (18, 9, 0)=0.0, reward 0.0


 50%|████████████████████▋                    | 272/540 [05:41<05:35,  1.25s/it]

for state (19, 10, 0) best action changed to 1, statePrime (27, 10, 0)=-1.0, reward -1.0


 51%|████████████████████▋                    | 273/540 [05:42<05:34,  1.25s/it]

for state (7, 2, 0) best action changed to 0, statePrime (7, 2, 0)=11.0, reward -1.0


 51%|████████████████████▊                    | 274/540 [05:43<05:33,  1.25s/it]

for state (16, 2, 0) best action changed to 0, statePrime (16, 2, 0)=0.0, reward 1.0


 53%|█████████████████████▋                   | 285/540 [05:57<05:19,  1.25s/it]

for state (17, 4, 1) best action changed to 1, statePrime (15, 4, 0)=-1.0, reward 0.0


 54%|██████████████████████▏                  | 293/540 [06:07<05:09,  1.25s/it]

for state (21, 10, 1) best action changed to 0, statePrime (21, 10, 1)=8.0, reward 1.0


 55%|██████████████████████▍                  | 295/540 [06:09<05:06,  1.25s/it]

for state (18, 2, 1) best action changed to 1, statePrime (17, 2, 0)=4.0, reward 0.0


 55%|██████████████████████▍                  | 296/540 [06:11<05:05,  1.25s/it]

for state (17, 3, 1) best action changed to 0, statePrime (17, 3, 1)=1.0, reward 1.0


 55%|██████████████████████▌                  | 297/540 [06:12<05:04,  1.25s/it]

for state (10, 2, 0) best action changed to 0, statePrime (10, 2, 0)=12.0, reward 1.0


 55%|██████████████████████▋                  | 299/540 [06:14<05:01,  1.25s/it]

for state (17, 7, 0) best action changed to 0, statePrime (17, 7, 0)=3.0, reward -1.0


 56%|██████████████████████▊                  | 300/540 [06:16<05:00,  1.25s/it]

for state (7, 10, 0) best action changed to 1, statePrime (11, 10, 0)=0.0, reward 0.0


 56%|███████████████████████▏                 | 305/540 [06:22<04:54,  1.25s/it]

for state (20, 10, 0) best action changed to 0, statePrime (20, 10, 0)=0.0, reward 1.0


 57%|███████████████████████▎                 | 307/540 [06:24<04:52,  1.25s/it]

for state (6, 10, 0) best action changed to 1, statePrime (16, 10, 0)=1.0, reward 0.0


 57%|███████████████████████▍                 | 309/540 [06:27<04:49,  1.25s/it]

for state (17, 9, 0) best action changed to 1, statePrime (27, 9, 0)=-1.0, reward -1.0


 58%|███████████████████████▊                 | 314/540 [06:33<04:43,  1.25s/it]

for state (10, 5, 0) best action changed to 0, statePrime (10, 5, 0)=6.0, reward 1.0


 58%|███████████████████████▉                 | 315/540 [06:34<04:42,  1.26s/it]

for state (19, 7, 0) best action changed to 0, statePrime (19, 7, 0)=0.0, reward 1.0


 59%|████████████████████████▏                | 318/540 [06:38<04:38,  1.25s/it]

for state (13, 3, 0) best action changed to 0, statePrime (13, 3, 0)=0.0, reward 1.0


 60%|████████████████████████▌                | 323/540 [06:44<04:31,  1.25s/it]

for state (11, 2, 0) best action changed to 1, statePrime (15, 2, 0)=11.0, reward 0.0


 60%|████████████████████████▌                | 324/540 [06:46<04:30,  1.25s/it]

for state (19, 10, 0) best action changed to 0, statePrime (19, 10, 0)=0.0, reward 1.0


 60%|████████████████████████▋                | 325/540 [06:47<04:29,  1.25s/it]

for state (12, 7, 0) best action changed to 1, statePrime (20, 7, 0)=17.0, reward 0.0


 61%|████████████████████████▊                | 327/540 [06:49<04:27,  1.25s/it]

for state (17, 3, 0) best action changed to 1, statePrime (27, 3, 0)=-1.0, reward -1.0


 61%|████████████████████████▉                | 329/540 [06:52<04:24,  1.25s/it]

for state (13, 4, 0) best action changed to 0, statePrime (13, 4, 0)=0.0, reward 1.0


 61%|█████████████████████████                | 330/540 [06:53<04:23,  1.25s/it]

for state (14, 9, 0) best action changed to 1, statePrime (17, 9, 0)=-2.0, reward 0.0


 62%|█████████████████████████▎               | 334/540 [06:58<04:18,  1.25s/it]

for state (16, 4, 0) best action changed to 0, statePrime (16, 4, 0)=0.0, reward 1.0


 62%|█████████████████████████▌               | 336/540 [07:01<04:15,  1.25s/it]

for state (9, 2, 0) best action changed to 1, statePrime (13, 2, 0)=-2.0, reward 0.0


 62%|█████████████████████████▌               | 337/540 [07:02<04:14,  1.25s/it]

for state (15, 5, 0) best action changed to 0, statePrime (15, 5, 0)=0.0, reward 1.0


 63%|█████████████████████████▊               | 340/540 [07:06<04:10,  1.25s/it]

for state (19, 1, 0) best action changed to 0, statePrime (19, 1, 0)=1.0, reward 1.0


 63%|█████████████████████████▉               | 342/540 [07:08<04:08,  1.25s/it]

for state (9, 10, 0) best action changed to 1, statePrime (12, 10, 0)=-1.0, reward 0.0


 64%|██████████████████████████▏              | 345/540 [07:12<04:04,  1.25s/it]

for state (9, 3, 0) best action changed to 0, statePrime (9, 3, 0)=1.0, reward 1.0


 64%|██████████████████████████▎              | 346/540 [07:13<04:03,  1.25s/it]

for state (8, 4, 0) best action changed to 0, statePrime (8, 4, 0)=0.0, reward 1.0


 64%|██████████████████████████▎              | 347/540 [07:15<04:01,  1.25s/it]

for state (18, 8, 0) best action changed to 1, statePrime (20, 8, 0)=5.0, reward 0.0


 64%|██████████████████████████▍              | 348/540 [07:16<04:00,  1.25s/it]

for state (13, 9, 0) best action changed to 0, statePrime (13, 9, 0)=0.0, reward 1.0


 65%|██████████████████████████▍              | 349/540 [07:17<03:59,  1.25s/it]

for state (16, 4, 0) best action changed to 1, statePrime (22, 4, 0)=-1.0, reward -1.0


 65%|██████████████████████████▌              | 350/540 [07:18<03:58,  1.25s/it]

for state (12, 10, 0) best action changed to 0, statePrime (12, 10, 0)=0.0, reward 1.0


 65%|██████████████████████████▋              | 351/540 [07:20<03:56,  1.25s/it]

for state (16, 5, 0) best action changed to 0, statePrime (16, 5, 0)=1.0, reward 1.0


 65%|██████████████████████████▋              | 352/540 [07:21<03:55,  1.25s/it]

for state (10, 8, 0) best action changed to 0, statePrime (10, 8, 0)=1.0, reward -1.0


 67%|███████████████████████████▍             | 361/540 [07:32<03:44,  1.25s/it]

for state (20, 10, 0) best action changed to 1, statePrime (25, 10, 0)=-1.0, reward -1.0


 67%|███████████████████████████▍             | 362/540 [07:33<03:42,  1.25s/it]

for state (17, 10, 0) best action changed to 0, statePrime (17, 10, 0)=0.0, reward 1.0


 68%|███████████████████████████▋             | 365/540 [07:37<03:39,  1.25s/it]

for state (11, 8, 0) best action changed to 1, statePrime (12, 8, 0)=0.0, reward 0.0


 68%|████████████████████████████             | 369/540 [07:42<03:34,  1.25s/it]

for state (16, 10, 0) best action changed to 0, statePrime (16, 10, 0)=2.0, reward 1.0


 70%|████████████████████████████▋            | 378/540 [07:53<03:23,  1.25s/it]

for state (16, 10, 1) best action changed to 1, statePrime (16, 10, 0)=1.0, reward 0.0


 71%|█████████████████████████████            | 382/540 [07:58<03:18,  1.25s/it]

for state (19, 10, 0) best action changed to 1, statePrime (25, 10, 0)=-1.0, reward -1.0


 71%|█████████████████████████████▏           | 385/540 [08:02<03:14,  1.25s/it]

for state (14, 3, 0) best action changed to 0, statePrime (14, 3, 0)=5.0, reward -1.0


 72%|█████████████████████████████▍           | 387/540 [08:05<03:11,  1.25s/it]

for state (15, 5, 0) best action changed to 1, statePrime (25, 5, 0)=-1.0, reward -1.0


 73%|█████████████████████████████▊           | 392/540 [08:11<03:05,  1.25s/it]

for state (12, 3, 0) best action changed to 1, statePrime (20, 3, 0)=3.0, reward 0.0


 74%|██████████████████████████████▏          | 398/540 [08:18<02:58,  1.25s/it]

for state (20, 10, 0) best action changed to 0, statePrime (20, 10, 0)=0.0, reward 1.0


 74%|██████████████████████████████▍          | 401/540 [08:22<02:54,  1.25s/it]

for state (13, 10, 0) best action changed to 0, statePrime (13, 10, 0)=-2.0, reward -1.0


 74%|██████████████████████████████▌          | 402/540 [08:23<02:52,  1.25s/it]

for state (9, 6, 0) best action changed to 1, statePrime (20, 6, 1)=0.0, reward 0.0


 75%|██████████████████████████████▉          | 407/540 [08:30<02:46,  1.25s/it]

for state (12, 10, 0) best action changed to 1, statePrime (17, 10, 0)=-1.0, reward 0.0


 76%|███████████████████████████████          | 409/540 [08:32<02:44,  1.25s/it]

for state (14, 5, 1) best action changed to 0, statePrime (14, 5, 1)=5.0, reward 1.0


 77%|███████████████████████████████▌         | 416/540 [08:41<02:35,  1.25s/it]

for state (18, 5, 0) best action changed to 0, statePrime (18, 5, 0)=0.0, reward 1.0


 78%|███████████████████████████████▉         | 420/540 [08:46<02:30,  1.25s/it]

for state (20, 10, 0) best action changed to 1, statePrime (24, 10, 0)=-1.0, reward -1.0


 79%|████████████████████████████████▎        | 426/540 [08:54<02:22,  1.25s/it]

for state (14, 3, 1) best action changed to 1, statePrime (16, 3, 1)=-1.0, reward 0.0


 81%|█████████████████████████████████▎       | 439/540 [09:10<02:06,  1.26s/it]

for state (17, 10, 0) best action changed to 1, statePrime (24, 10, 0)=-1.0, reward -1.0


 82%|█████████████████████████████████▌       | 442/540 [09:14<02:03,  1.26s/it]

for state (10, 8, 0) best action changed to 1, statePrime (18, 8, 0)=2.0, reward 0.0


 82%|█████████████████████████████████▋       | 444/540 [09:16<02:00,  1.26s/it]

for state (16, 3, 0) best action changed to 0, statePrime (16, 3, 0)=0.0, reward 1.0


 82%|█████████████████████████████████▊       | 445/540 [09:17<01:59,  1.26s/it]

for state (10, 9, 0) best action changed to 1, statePrime (20, 9, 0)=7.0, reward 0.0


 83%|██████████████████████████████████       | 449/540 [09:22<01:54,  1.25s/it]

for state (13, 10, 1) best action changed to 1, statePrime (20, 10, 1)=-1.0, reward 0.0


 84%|██████████████████████████████████▎      | 452/540 [09:26<01:50,  1.25s/it]

for state (7, 10, 0) best action changed to 0, statePrime (7, 10, 0)=0.0, reward 1.0


 84%|██████████████████████████████████▍      | 453/540 [09:27<01:49,  1.26s/it]

for state (15, 9, 1) best action changed to 1, statePrime (15, 9, 0)=-1.0, reward 0.0


 85%|██████████████████████████████████▉      | 460/540 [09:36<01:40,  1.25s/it]

for state (9, 5, 0) best action changed to 1, statePrime (14, 5, 0)=-1.0, reward 0.0


 86%|███████████████████████████████████▍     | 466/540 [09:44<01:32,  1.25s/it]

for state (6, 4, 0) best action changed to 0, statePrime (6, 4, 0)=0.0, reward 1.0


 87%|███████████████████████████████████▌     | 468/540 [09:46<01:30,  1.25s/it]

for state (19, 3, 0) best action changed to 1, statePrime (29, 3, 0)=-1.0, reward -1.0


 88%|████████████████████████████████████▏    | 476/540 [09:56<01:20,  1.25s/it]

for state (13, 6, 0) best action changed to 1, statePrime (15, 6, 0)=-1.0, reward 0.0


 89%|████████████████████████████████████▎    | 478/540 [09:59<01:17,  1.25s/it]

for state (11, 9, 0) best action changed to 1, statePrime (19, 9, 0)=2.0, reward 0.0


 89%|████████████████████████████████████▌    | 481/540 [10:03<01:13,  1.25s/it]

for state (20, 10, 1) best action changed to 1, statePrime (20, 10, 0)=-1.0, reward 0.0


 89%|████████████████████████████████████▌    | 482/540 [10:04<01:12,  1.25s/it]

for state (15, 8, 0) best action changed to 1, statePrime (19, 8, 0)=-1.0, reward 0.0


 90%|████████████████████████████████████▋    | 484/540 [10:06<01:10,  1.25s/it]

for state (8, 3, 0) best action changed to 0, statePrime (8, 3, 0)=1.0, reward 1.0


 90%|████████████████████████████████████▉    | 486/540 [10:09<01:07,  1.25s/it]

for state (7, 10, 0) best action changed to 1, statePrime (17, 10, 0)=-1.0, reward 0.0


 91%|█████████████████████████████████████▎   | 492/540 [10:16<01:00,  1.25s/it]

for state (16, 5, 0) best action changed to 1, statePrime (21, 5, 0)=0.0, reward 0.0


 91%|█████████████████████████████████████▍   | 493/540 [10:18<00:58,  1.25s/it]

for state (20, 10, 0) best action changed to 0, statePrime (20, 10, 0)=0.0, reward 1.0


 91%|█████████████████████████████████████▌   | 494/540 [10:19<00:57,  1.25s/it]

for state (20, 10, 0) best action changed to 1, statePrime (22, 10, 0)=-1.0, reward -1.0


 92%|█████████████████████████████████████▌   | 495/540 [10:20<00:56,  1.25s/it]

for state (18, 8, 0) best action changed to 0, statePrime (18, 8, 0)=2.0, reward 0.0


 92%|█████████████████████████████████████▋   | 497/540 [10:23<00:53,  1.25s/it]

for state (14, 3, 0) best action changed to 1, statePrime (18, 3, 0)=6.0, reward 0.0


 92%|█████████████████████████████████████▉   | 499/540 [10:25<00:51,  1.25s/it]

for state (17, 7, 0) best action changed to 1, statePrime (18, 7, 0)=4.0, reward 0.0


 94%|██████████████████████████████████████▎  | 505/540 [10:33<00:43,  1.25s/it]

for state (12, 10, 1) best action changed to 1, statePrime (15, 10, 1)=1.0, reward 0.0


 94%|██████████████████████████████████████▋  | 509/540 [10:38<00:38,  1.25s/it]

for state (7, 7, 0) best action changed to 1, statePrime (17, 7, 0)=4.0, reward 0.0


 94%|██████████████████████████████████████▋  | 510/540 [10:39<00:37,  1.25s/it]

for state (17, 10, 0) best action changed to 0, statePrime (17, 10, 0)=0.0, reward 1.0


 96%|███████████████████████████████████████▎ | 518/540 [10:49<00:27,  1.25s/it]

for state (16, 8, 0) best action changed to 1, statePrime (20, 8, 0)=5.0, reward 0.0


 96%|███████████████████████████████████████▌ | 521/540 [10:53<00:23,  1.25s/it]

for state (15, 4, 0) best action changed to 1, statePrime (17, 4, 0)=-1.0, reward 0.0


 97%|███████████████████████████████████████▊ | 525/540 [10:58<00:18,  1.26s/it]

for state (10, 2, 0) best action changed to 1, statePrime (15, 2, 0)=11.0, reward 0.0


 98%|████████████████████████████████████████ | 527/540 [11:00<00:16,  1.25s/it]

for state (15, 8, 1) best action changed to 1, statePrime (17, 8, 1)=-1.0, reward 0.0


 98%|████████████████████████████████████████ | 528/540 [11:01<00:15,  1.25s/it]

for state (13, 10, 0) best action changed to 1, statePrime (23, 10, 0)=-1.0, reward -1.0


 99%|████████████████████████████████████████▌| 534/540 [11:09<00:07,  1.25s/it]

for state (19, 8, 0) best action changed to 0, statePrime (19, 8, 0)=0.0, reward 1.0


 99%|████████████████████████████████████████▋| 536/540 [11:12<00:05,  1.25s/it]

for state (7, 4, 0) best action changed to 1, statePrime (11, 4, 0)=1.0, reward 0.0


 99%|████████████████████████████████████████▊| 537/540 [11:13<00:03,  1.25s/it]

for state (9, 10, 0) best action changed to 0, statePrime (9, 10, 0)=0.0, reward 1.0


100%|█████████████████████████████████████████| 540/540 [11:17<00:00,  1.25s/it]


### evaluate the policy using random states

In [381]:
state, _ = env.reset()
state

(19, 7, 0)

In [10]:
policy[state]

NameError: name 'policy' is not defined