In [8]:
from Gridworld import GridWorld
import numpy as np
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
# Build the environment with its default settings. An alternative configuration
# with explicit reward and bomb/gold positions is kept below for reference:
# env = GridWorld(negative_reward=-10, bomb_positions=[(0, 2), (1, 3)], gold_positions=[(0, 3)])
env = GridWorld()

In [48]:
# Print the current grid. Legend inferred from the printed output and the
# questions below (p = agent, g = gold, x = bomb) -- TODO confirm in Gridworld.py.
env.print_world()

    0    1    2    3    4
-------------------------
0 |           p    g         
1 |                x         
2 |                          
3 |                          
4 |                          


In [49]:
def value_iteration(env, discount_factor=0.9, epsilon=1e-5):
    """Compute state values and a greedy policy with in-place value iteration.

    Every non-terminal cell is swept repeatedly. Each sweep applies the
    backup ``V(s) = max_a [r + discount_factor * V(s')]`` using the single
    ``(s', r)`` pair returned by ``env.make_step(s, a)``. Updates are written
    straight into ``V``, so cells visited later in a sweep already see the
    values refreshed earlier in that same sweep.

    Args:
        env: Environment object providing ``num_rows``, ``num_cols``,
            ``actions`` (an indexable sequence), ``terminal_states`` (an
            iterable of ``(row, col)`` pairs), ``is_terminal(state)`` and
            ``make_step(state, action) -> (new_state, reward)``.
        discount_factor: Weight applied to the successor state's value.
        epsilon: Sweeping stops once the largest value change observed in a
            full sweep drops below this threshold.

    Returns:
        Tuple ``(V, policy)``. ``V`` is a ``(num_rows, num_cols)`` float array
        in which terminal cells keep the value 0. ``policy`` is a nested list
        holding the greedy action for each cell, with ``'x'`` written into
        every cell listed in ``env.terminal_states``.
    """
    # Size the per-state buffer from the environment rather than assuming four
    # actions: with a fixed size of 4, unused slots stay at 0 and can win the
    # max/argmax (fewer actions), or the assignment overflows (more actions).
    num_actions = len(env.actions)
    grid_shape = (env.num_rows, env.num_cols)

    # The policy stores indices into env.actions, so keep it integer-typed.
    policy = np.zeros(grid_shape, dtype=int)
    V = np.zeros(grid_shape)

    while True:
        delta = 0
        for i in range(env.num_rows):
            for j in range(env.num_cols):
                state = (i, j)
                if env.is_terminal(state):
                    # Terminal cells are never backed up; their value stays 0.
                    continue

                action_values = np.zeros(num_actions)
                for k, action in enumerate(env.actions):
                    new_state, r = env.make_step(state, action)
                    action_values[k] = r + discount_factor * V[new_state]

                # np.argmax returns the first maximum, so ties resolve to the
                # action that appears earliest in env.actions.
                best_action = np.argmax(action_values)
                best_action_value = action_values[best_action]

                delta = max(delta, np.abs(best_action_value - V[state]))

                V[state] = best_action_value
                policy[state] = best_action

        if delta < epsilon:
            break

    # Translate action indices into the environment's action labels.
    policy = [[env.actions[int(idx)] for idx in row] for row in policy]
    for term in env.terminal_states:
        policy[term[0]][term[1]] = 'x'

    return V, policy

In [50]:
# Solve the environment; 0.9 matches the function's default discount and is
# passed explicitly so the setting is visible at the call site.
V, policy = value_iteration(env, discount_factor=0.9)

In [51]:
# Display the value table together with the greedy action chosen for each cell.
V, policy

(array([[ 8.1    ,  9.     , 10.     ,  0.     , 10.     ],
        [ 7.29   ,  8.1    ,  9.     ,  0.     ,  9.     ],
        [ 6.561  ,  7.29   ,  8.1    ,  7.29   ,  8.1    ],
        [ 5.9049 ,  6.561  ,  7.29   ,  6.561  ,  7.29   ],
        [ 5.31441,  5.9049 ,  6.561  ,  5.9049 ,  6.561  ]]),
 [['e', 'e', 'e', 'x', 'w'],
  ['n', 'n', 'n', 'x', 'n'],
  ['n', 'n', 'n', 'e', 'n'],
  ['n', 'n', 'n', 'n', 'n'],
  ['n', 'n', 'n', 'n', 'n']])

In [53]:
# Restart with the agent at row 4, column 0 (bottom-left of the printed grid)
# and replay the computed policy; each move is printed below.
env.reset(start_pos=(4, 0))
env.follow_policy(policy)

game start!
    0    1    2    3    4
-------------------------
0 |                g         
1 |                x         
2 |                          
3 |                          
4 | p                        

move 1: n
    0    1    2    3    4
-------------------------
0 |                g         
1 |                x         
2 |                          
3 | p                        
4 |                          

move 2: n
    0    1    2    3    4
-------------------------
0 |                g         
1 |                x         
2 | p                        
3 |                          
4 |                          

move 3: n
    0    1    2    3    4
-------------------------
0 |                g         
1 | p              x         
2 |                          
3 |                          
4 |                          

move 4: n
    0    1    2    3    4
-------------------------
0 | p              g         
1 |                x         
2 |                     

## Question 1
Suppose stepping on a bomb no longer ends the game: the agent still receives the negative reward, but the episode continues. In that case, could the optimal policy ever guide the agent onto a bomb cell?

## Question 2

What would happen if the gold position were not terminal, i.e. if stepping on the gold did not end the game? How would the values change? How would the policy change? Would the game ever end?