In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import gridworlds
import value_iteration

import numpy as np

In [168]:
# Value-iteration settings, named so they are easy to find and tune instead of
# being bare positional literals in the call below.
NUM_VALUE_ITERATIONS = 100  # third positional argument; the "Value iteration" progress bar counts to this
DISCOUNT = 0.9  # fourth positional argument; presumably the discount factor (gamma) -- confirm against value_iteration

# Build the 2x2 stealing gridworld and enumerate its dynamics as a sparse
# transition matrix plus a reward vector.
env = gridworlds.StealingGridworld(grid_size=2)
transition_matrix, reward_vector = env.get_sparse_transition_matrix_and_reward_vector()

# Solve the enumerated MDP, then derive a policy from the resulting Q-values.
optimal_qs, optimal_values = value_iteration.run_value_iteration(
    transition_matrix, reward_vector, NUM_VALUE_ITERATIONS, DISCOUNT
)
optimal_policy = value_iteration.get_optimal_policy_from_qs(optimal_qs)

100%|██████████| 4/4 [00:00<00:00, 938.32it/s]
100%|██████████| 172/172 [00:00<00:00, 8561.94it/s]
Value iteration: 100%|██████████| 100/100 [00:00<00:00, 10497.57it/s]


In [17]:
# Start the environment's interactive REPL, handing it the policy so that the
# "p" command can ask the policy for an action.
# NOTE(review): this reads commands interactively, so it presumably blocks an
# unattended "Restart & Run All" -- confirm before relying on a full re-run.
env.repl(optimal_policy)

Welcome to the StealingGridworld REPL.
Use the following commands:
  u: Move up
  d: Move down
  l: Move left
  r: Move right
  i: Interact with environment
  p: Ask policy for action, if policy vector is given
  q: Quit

+---+---+
| 0 | . |
+---+---+
| . | x |
+---+---+
Policy action: DOWN
Reward: 0

+---+---+
| H | . |
+---+---+
| 0 | x |
+---+---+
Policy action: INTERACT
Reward: 0

+---+---+
| H | . |
+---+---+
| 1 | x |
+---+---+
Policy action: UP
Reward: 0

+---+---+
| 1 | . |
+---+---+
|   | x |
+---+---+
Policy action: INTERACT
Reward: 1

+---+---+
| 0 | . |
+---+---+
|   | x |
+---+---+
Policy action: UP
Reward: 0

+---+---+
| 0 | . |
+---+---+
|   | x |
+---+---+
Policy action: UP
Total reward: 1


In [71]:
# Reset the environment, then draw the grid for the fresh episode.
# NOTE(review): the layout rendered here differs from the one in the REPL run,
# so reset() presumably randomizes placement; no seed is set in this notebook
# -- confirm if reproducible layouts are needed.
env.reset()
env.render()

+---+---+
|0H | . |
+---+---+
| x | . |
+---+---+


In [167]:
# Inspect the current state: draw the grid, list the Q-value of every action,
# report the action the policy picks, then advance the environment by it.
env.render()

# Look the observation and its state index up once; they are the same for
# every action listed below and for the policy lookup.
current_observation = env._get_observation()
current_state_index = env.get_state_index(current_observation)

print(f"state: {current_observation}")
for action in range(env.action_space.n):
    print(f"  {env._action_to_string(action):>8} | q_value: {optimal_qs[current_state_index, action]}")
policy_action = optimal_policy[current_state_index]
print(f"policy: {env._action_to_string(policy_action)}")

# Stepping here means each re-run of this cell moves one step further along
# the policy; the step's return value is deliberately discarded.
_ = env.step(policy_action)

+---+---+
| H |1. |
+---+---+
| x |   |
+---+---+
state: {'agent_position': array([0, 1]), 'free_pellet_locations': array([[0, 1]]), 'owned_pellet_locations': array([[1, 0]]), 'num_carried_pellets': 1}
        UP | q_value: 0.8999999761581421
      DOWN | q_value: 0.809999942779541
      LEFT | q_value: 0.809999942779541
     RIGHT | q_value: 0.7289999127388
  INTERACT | q_value: -0.38000011444091797
policy: UP


In [142]:
# Compare, for the current state, the successor the environment computes for
# the INTERACT action with the successor stored in the sparse transition matrix.
INTERACT_ACTION = 4  # action index 4 is printed as INTERACT by env._action_to_string

state = env._get_observation()
state_index = env.get_state_index(state)

# Successor according to the environment itself.
# successor() returns a sequence; only its first element (the next state) is used here.
succ_state = env.successor(state, INTERACT_ACTION)[0]
succ_state_index = env.get_state_index(succ_state)

# Successor according to the transition matrix: its rows are indexed by the
# flattened (state, action) pair, and the column holding a 1 is the successor.
s_a_idx = np.ravel_multi_index((state_index, INTERACT_ACTION), (len(env.states), env.action_space.n))
transition_succ_idx = np.where(transition_matrix[s_a_idx].toarray() == 1)[1][0]

print(f"state:\n{state}\n")
print(f"transition matrix successor: {transition_succ_idx}\n{env.states[transition_succ_idx]}\n")
print(f"Environment successor: {succ_state_index}\n{succ_state}")

state:
{'agent_position': array([0, 1]), 'free_pellet_locations': array([[0, 1]]), 'owned_pellet_locations': array([[1, 0]]), 'num_carried_pellets': 1}

transition matrix successor: 122
{'agent_position': array([1, 0]), 'free_pellet_locations': array([[0, 1]]), 'owned_pellet_locations': array([], shape=(0, 2), dtype=int64), 'num_carried_pellets': 2}

Environment successor: 123
{'agent_position': array([0, 1]), 'free_pellet_locations': array([], shape=(0, 2), dtype=int64), 'owned_pellet_locations': array([[1, 0]]), 'num_carried_pellets': 2}


In [164]:
# Build a second, 3x3 environment with exactly one free and one owned pellet,
# and enumerate its sparse transition matrix and reward vector for the
# consistency checks in the following cells.
tiny_env = gridworlds.StealingGridworld(grid_size=3, num_free_pellets=1, num_owned_pellets=1)
tiny_transition, tiny_reward = tiny_env.get_sparse_transition_matrix_and_reward_vector()

100%|██████████| 9/9 [00:00<00:00, 634.65it/s]
100%|██████████| 819/819 [00:00<00:00, 9547.93it/s]


In [165]:
# For every enumerated state, check that the successor computed by
# tiny_env.successor() for ACTION matches the successor stored in the sparse
# transition matrix; print and render every disagreement. No output means the
# two agree for this action.
ACTION = gridworlds.StealingGridworld.LEFT

# Shape used to flatten a (state, action) pair into a transition-matrix row
# index; it is the same for every state, so build it once outside the loop.
state_action_shape = (len(tiny_env.states), len(tiny_env.actions))

for state in tiny_env.states:
    state_idx = tiny_env.get_state_index(state)

    # Successor according to the environment (first element of the result).
    succ_state = tiny_env.successor(state, ACTION)[0]
    succ_state_index = tiny_env.get_state_index(succ_state)

    # Successor according to the transition matrix: the column holding a 1 in
    # the row for this (state, action) pair.
    s_a_idx = np.ravel_multi_index((state_idx, ACTION), state_action_shape)
    transition_succ_idx = np.where(tiny_transition[s_a_idx].toarray() == 1)[1][0]

    if succ_state_index != transition_succ_idx:
        # NOTE(review): _register_state presumably loads the given state into
        # tiny_env so that render() draws it; this leaves tiny_env in whatever
        # state was registered last -- confirm that is acceptable.
        print("=====================================")
        print(f"state: {state_idx}\n{state}")
        tiny_env._register_state(state)
        tiny_env.render()
        print(f"\ntransition matrix successor: {transition_succ_idx}\n{tiny_env.states[transition_succ_idx]}")
        tiny_env._register_state(tiny_env.states[transition_succ_idx])
        tiny_env.render()
        print(f"\nEnvironment successor: {succ_state_index}\n{succ_state}")
        tiny_env._register_state(succ_state)
        tiny_env.render()

In [141]:
def get_successor_from_transition_matrix(env, transition_matrix, state, action):
    """Return the successor of ``(state, action)`` as stored in a transition matrix.

    The matrix row for the pair is found by flattening ``(state_index, action)``
    over the shape ``(len(env.states), len(env.actions))``. The successor is
    the first column in that row whose entry equals 1.

    Args:
        env: Object providing ``get_state_index(state)``, ``states`` and
            ``actions``.
        transition_matrix: Sparse matrix whose rows are indexed by the
            flattened (state, action) pair and whose columns are indexed by
            successor state; indexing a row must yield an object with
            ``.toarray()``.
        state: A state accepted by ``env.get_state_index``.
        action: Integer action index in ``range(len(env.actions))``.

    Returns:
        The element of ``env.states`` at the successor's index.

    Raises:
        IndexError: If no entry in the row equals 1 (for example, the
            transition is stochastic or the row is empty).
    """
    state_index = env.get_state_index(state)
    s_a_idx = np.ravel_multi_index((state_index, action), (len(env.states), len(env.actions)))

    # Flatten the densified row so the lookup works whether the sparse
    # container hands back the row as 1-D or as a 1xN 2-D array.
    row = np.asarray(transition_matrix[s_a_idx].toarray()).ravel()
    successor_indices = np.flatnonzero(row == 1)
    if successor_indices.size == 0:
        # Same exception type as the bare `[0]` lookup would raise, but with a
        # message that says which row had no deterministic successor.
        raise IndexError(
            f"transition matrix row {s_a_idx} (state index {state_index}, action {action}) "
            "has no entry equal to 1"
        )
    return env.states[successor_indices[0]]

# Show the state being queried, then display the successor the tiny
# environment's transition matrix stores for it.
# NOTE(review): `state` is not defined in this cell; it is whatever an earlier
# cell left in the kernel, so this depends on execution order -- confirm.
# The literal 4 is the action index; index 4 is printed as INTERACT for `env`
# elsewhere in this notebook, presumably the same for tiny_env -- confirm.
print(state)
get_successor_from_transition_matrix(tiny_env, tiny_transition, state, 4)

{'agent_position': array([0, 1]), 'free_pellet_locations': array([[0, 1]]), 'owned_pellet_locations': array([[1, 0]]), 'num_carried_pellets': 1}


{'agent_position': array([1, 0]),
 'free_pellet_locations': array([[0, 1]]),
 'owned_pellet_locations': array([], shape=(0, 2), dtype=int64),
 'num_carried_pellets': 2}