# Reinforcement Learning
Prof. Milica Gašić

### Value iteration

Value iteration is an algorithm to find an optimal policy for an MDP. Written in pseudocode it looks like this:  

1. Initialize value function $v_0(s)$ for all states $s \in \mathcal{S}$ arbitrarily.
2. Repeat until the value function converges:
      $$\begin{aligned}
      & v_{k+1}(s) = \max_a \left( \mathcal{R}(s,a) + \gamma \sum_{s'} \mathcal{P}(s'|s,a) v_k(s') \right)\\
      & \text{for all } s \in \mathcal{S}
      \end{aligned}$$
      The value function has converged if $$|v_{k+1}(s) - v_k(s)| < \epsilon$$ for all $s \in \mathcal{S}$.
3. Derive the optimal policy from the last value function $v_K$:
      $$\begin{aligned}
      & \pi_*(s) = \arg\max_a \left( \mathcal{R}(s,a) + \gamma \sum_{s'} \mathcal{P}(s'|s,a) v_K(s') \right) \\
      & \text{for all } s \in \mathcal{S}
      \end{aligned}$$

Both update rules above contain the action value function on the right-hand side:
$$q_k(s,a) = \mathcal{R}(s,a) + \gamma \sum_{s'} \mathcal{P}(s'|s,a) v_k(s')$$
Using $q_k$, we can simplify the update rules:
$$\begin{aligned}
v_{k+1}(s) & = \max_a q_k(s,a) \\
\pi_*(s) & = \arg\max_a q_K(s,a)
\end{aligned}$$

#### Implementation

Make sure that the files `rl_agent.py`, `rl_env.py`, `rl_gui.py` and `rl_tests.py` are in the same folder as the notebook.

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np

import rl_agent
import rl_env
import rl_gui
import rl_tests

Your task is to implement value iteration in the class `ValueIterationAgent` below.  
Follow the instructions in the method `update_v()`.

In [None]:
class ValueIterationAgent(rl_agent.TabularAgent):
    """Tabular agent that plans with value iteration on a known model.

    The agent stores a state-value array ``self.v`` and a stochastic policy
    table ``self.pi``. The policy is derived lazily: ``update_v()`` only marks
    it as outdated and ``update_pi()`` recomputes it when it is needed.

    NOTE(review): ``self.env``, ``self.num_states`` and ``self.num_actions``
    are presumably provided by ``rl_agent.TabularAgent`` -- confirm there.
    """

    def __init__(self, env, gamma, tolerance=1e-4):
        """Create the agent and put it into its initial state.

        Args:
            env: Environment passed on to the base class. ``compute_q()``
                reads its ``P`` and ``R`` attributes.
            gamma: Discount factor used in the Bellman backup.
            tolerance: Largest absolute per-state change of the value
                function that still counts as converged.
        """
        super().__init__(env)
        self.gamma = gamma
        self.tolerance = tolerance
        self.reset()

    def reset(self):
        """Reset the value function, the policy and the bookkeeping flags."""
        # Resets the agent
        self.v = np.zeros(self.num_states, dtype=float)
        # The value function did not converge yet
        self.converged = False

        # Initialize policy with uniform probabilities (not necessary)
        self.pi = np.full((self.num_states, self.num_actions), 1 / self.num_actions)
        # In this implementation we set pi_outdated to True when the value
        # function was changed. This allows us to save computations by only
        # computing pi when it is necessary.
        self.pi_outdated = False

    def policy(self, state):
        """Sample an action for ``state`` from the current policy table."""
        # Derive the policy from the value function, if necessary
        self.update_pi()
        # Sample an action from the stochastic policy
        action = np.random.choice(self.num_actions, p=self.pi[state])
        return action

    def value(self, state):
        """Return the current value estimate of ``state``."""
        # Lookup in the value function array
        return self.v[state]

    def compute_q(self):
        """Return the action values implied by the current ``self.v``.

        Computes ``q(s,a) = R(s,a) + gamma * sum_s' P(s'|s,a) v(s')``.
        """
        # Derive Q from V using the environment's dynamics,
        # i.e., apply the mutually recursive Bellman expectation equation:
        # q(s,a) = R(s,a) + gamma * sum_s' P(s'|s,a) v(s')
        gamma = self.gamma
        v = self.v
        # Get the transition probabilities and reward function
        # from the environment
        P = self.env.P
        R = self.env.R
        # assumes P is indexed as (state, action, next_state) so that v
        # broadcasts along the last axis -- TODO confirm against rl_env
        q = R + gamma * np.sum(P * v, axis=2)
        return q

    def update_v(self):
        """Apply one step of value iteration and refresh ``self.converged``.

        Also marks the policy as outdated so that it is recomputed on the
        next call to ``update_pi()``.
        """
        # Store the old values to check for convergence
        old_v = np.copy(self.v)
        # Compute the action value function
        q = self.compute_q()
        #######################################################################
        # TODO: This method applies one step of value iteration and checks    #
        # if the value function converged. We already implemented the         #
        # convergence check below. Remember to store the result in self.v     #
        # You can implement this using for-loops. Alternatively, a vectorized #
        # implementation can be written in a single line of code.             #
        #######################################################################

        #######################################################################
        # End of your code.                                                   #
        #######################################################################

        # Check if the value function converged.
        # rtol=0 keeps the check purely absolute, i.e.
        # |v_new(s) - v_old(s)| <= tolerance for all s. With numpy's default
        # rtol the threshold would additionally grow with the magnitude of
        # the values.
        self.converged = np.allclose(self.v, old_v, rtol=0.0, atol=self.tolerance)
        # Mark the policy as outdated
        self.pi_outdated = True

    def update_pi(self):
        """Recompute the greedy policy if the value function has changed."""
        # Derive the policy from the value function, if necessary
        if self.pi_outdated:
            # Compute the action value function
            q = self.compute_q()
            # Take the argmax over the action values
            indices = np.argmax(q, axis=1)
            # Convert to stochastic policy with one-hot probabilities
            self.pi = np.eye(self.num_actions)[indices]
            # Now the policy corresponds to the value function
            self.pi_outdated = False

    def value_iteration(self):
        """Run ``update_v()`` until convergence, then derive the policy."""
        # Run update_v() until the value function converges
        while not self.converged:
            self.update_v()
        self.update_pi()

    # This method is used for the GUI
    # You don't have to understand the code
    def interactive_optimization(self):
        """Generator that lets the GUI drive value iteration step by step.

        Yields ``RLCmd`` objects and receives the selected option back from
        the caller through ``send()``.
        """
        from rl_gui import RLCmd, RLParamsResult

        def update_params(params):
            # Callback for parameter changes: take over a new discount factor
            gamma = params['gamma']
            if gamma != self.gamma:
                self.gamma = gamma
                if self.converged:
                    self.converged = False
                    # Get out of the 'Policy is optimal' state
                    return RLParamsResult.RESET_GENERATOR

        yield RLCmd.Init(options={'step': 'Value iteration',
                                  'complete': 'Finish optimization',
                                  'reset': 'Reset agent'},
                         params={'gamma': ('Discount factor', 'float', self.gamma, 0.0, 1.0 - 1e-4)},
                         params_callback=update_params)

        option = None
        while not self.converged:
            # Only ask for input again after a single step; 'complete' keeps
            # iterating without waiting until the values have converged
            if option is None or option == 'step':
                option = yield RLCmd.WaitForOption(active=['step', 'complete', 'reset'],
                                                   step='step', interval=200)

            if option == 'reset':
                self.reset()
                option = None
            else:
                self.update_v()

        option = yield RLCmd.WaitForOption(active=['reset'], message='Policy is optimal')
        assert option == 'reset', option
        self.reset()

You can use the following code cell to test your implementation.  
**Important**: After changing your code, execute the above code cell before running the tests.

In [None]:
def test_vi_agent():
    """Generator-based test of ``ValueIterationAgent.update_v()``.

    Yields the test name first. Then, for each of three randomly initialized
    agents, it forwards the checks of ``rl_tests.check_numpy_array``, yields a
    ``(passed, message)`` tuple comparing the sum of the updated values with
    the expected sum, and finally yields ``None``.
    """
    maze = rl_env.default_5x5_maze(model_based=True)
    generator = None

    def randomized_agent(gamma):
        # Build an agent whose policy and values are drawn from the seeded
        # generator. The draw order (policy first, values second) determines
        # the expected sums below.
        candidate = ValueIterationAgent(maze, gamma)
        table_shape = (candidate.num_states, candidate.num_actions)
        weights = generator.uniform(0, 1, table_shape)
        # Normalize each row so that it is a probability distribution
        candidate.pi = weights / np.sum(weights, axis=1, keepdims=True)
        candidate.v = generator.standard_normal(candidate.num_states)
        return candidate

    yield 'update_v()'
    # Fixed seed so that the random agents are reproducible
    generator = np.random.Generator(np.random.PCG64(seed=42))
    for expected_sum in (9.016962, 16.691778, 20.439101):
        agent = randomized_agent(gamma=0.8)
        agent.update_v()
        array_ok = yield from rl_tests.check_numpy_array(
            agent.v, name='self.v', shape=(agent.num_states,), dtype=np.floating)
        if array_ok:
            v_sum = np.sum(agent.v)
            is_correct = np.isclose(v_sum, expected_sum, atol=1e-5)
            yield is_correct, f'The updated values are incorrect (error = {abs(expected_sum - v_sum):.5f})'
        yield None

# Hand the test generator to the test runner of the course's rl_tests module
rl_tests.run_tests(test_vi_agent())

If all of your tests passed, you can see your agent in action in the following code cell.

Sometimes there is a strange bug and the environment is rendered multiple times. In that case you may have to restart the notebook and reopen the browser tab or restart the editor (e.g. in VS Code).

In [None]:
# Create the 5x5 maze environment that is passed to the GUI
env = rl_env.default_5x5_maze(model_based=True)
# You can try the bigger maze by uncommenting the next line
#env = rl_env.default_8x8_maze(model_based=True)

gamma = 0.9  # discount factor
# Agents handed to the GUI, keyed by name
agents = {'Random': rl_agent.RandomAgent(env),
          'Value Iteration': ValueIterationAgent(env, gamma)}

# Start the GUI for the environment and the agents
rl_gui.RLGui(env, agents=agents, max_steps=1000, render_mode='rgb_array')