# Reinforcement Learning
Prof. Milica Gašić

### Monte Carlo control

1. Policy evaluation by every-visit Monte Carlo prediction of $q_\pi$:
  - Sample an episode $s_0, a_0, r_1, \ldots, s_T$
  - Set final return $g_T = 0$
  - For $t = T - 1$ to $0$:
    - Compute return: $g_t = r_{t+1} + \gamma g_{t+1}$
    - Increment total return: $G(s_t, a_t) \mathrel{+}= g_t$
    - Increment counter: $N(s_t, a_t) \mathrel{+}= 1$
    - Update value: $Q(s_t, a_t) = G(s_t, a_t) / N(s_t, a_t)$
2. $\epsilon$-greedy policy improvement:

    $\pi'(a|s) = \begin{cases}
      \frac{\epsilon}{m} + \frac{1 - \epsilon}{|M_\pi(s)|} & \text{if } a \in M_\pi(s) \\
      \frac{\epsilon}{m} & \text{otherwise}
    \end{cases}$
    
    where $M_\pi(s) = \{ a \,|\, q_\pi(s,a) = \max_{a'} q_\pi(s,a') \}$ is the set of actions with maximal action values for state $s$

#### Implementation

Make sure that the files `rl_agent.py`, `rl_env.py`, `rl_gui.py`, `rl_tests.py` and `rl_util.py` are in the same folder as the notebook.

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np

import rl_agent
import rl_env
import rl_gui
import rl_tests
import rl_util

Your task is to implement Monte Carlo control in the class `MonteCarloAgent` below.  
Follow the instructions in the methods `update_q()` and `policy_improvement()`.

In [None]:
class MonteCarloAgent(rl_agent.TabularAgent):
    """Tabular Monte Carlo control agent with an epsilon-greedy policy.

    Action values are estimated by every-visit Monte Carlo prediction
    (``update_q``) and the policy is made epsilon-greedy with respect to
    those estimates (``policy_improvement``).

    NOTE(review): ``self.env``, ``self.num_states`` and ``self.num_actions``
    are not assigned in this class; they are presumably provided by
    ``rl_agent.TabularAgent`` -- confirm against that class.
    """

    def __init__(self, env, gamma, epsilon, rng=None):
        """Create the agent and initialize all tables via ``reset()``.

        Args:
            env: Environment, forwarded to the base class constructor.
            gamma: Discount factor used when accumulating returns.
            epsilon: Exploration parameter of the epsilon-greedy policy.
            rng: Optional NumPy random generator used to sample actions;
                a fresh ``np.random.default_rng()`` is used when omitted.
        """
        super().__init__(env)
        self.gamma = gamma
        self.epsilon = epsilon
        self.rng = rng if rng is not None else np.random.default_rng()
        self.reset()

    def reset(self):
        """Reset all tables, the policy and the logging statistics."""
        num_s = self.num_states
        num_a = self.num_actions

        # Create arrays for total returns, visit counters and action values
        self.G = np.zeros((num_s, num_a), dtype=float)
        self.N = np.zeros((num_s, num_a), dtype=int)
        self.q = np.zeros((num_s, num_a), dtype=float)

        # Create array for policy distribution (initialized uniformly)
        self.pi = np.full((num_s, num_a), 1 / num_a, dtype=float)

        # State values are only computed when needed
        # (and are not required for this algorithm)
        self.v = None

        # Store some statistics for logging
        self.num_steps = 0
        self.num_episodes = 0
        self.num_truncated = 0

    def policy(self, state):
        """Sample an action for ``state`` from the current policy ``pi``."""
        action_probs = self.pi[state]
        action = self.rng.choice(self.num_actions, p=action_probs)
        return action

    def value(self, state):
        """Return the state value of ``state`` under ``pi`` and ``q``.

        The full state-value table is computed lazily as
        ``v(s) = sum_a pi(a|s) * q(s, a)`` and cached in ``self.v`` until
        the next update of ``q`` or ``pi`` sets the cache back to ``None``.
        """
        if self.v is None:
            self.v = np.sum(self.pi * self.q, axis=1)
        return self.v[state]

    def update_q(self, episode):
        """Update the action values with every-visit Monte Carlo prediction.

        Args:
            episode: Mapping with the keys ``'truncated'``, ``'states'``,
                ``'actions'`` and ``'rewards'``. Assumes ``rewards[t]`` is
                the reward received after taking ``actions[t]`` in
                ``states[t]`` (r_{t+1} in the pseudocode) -- TODO confirm
                against ``rl_util.rollout``.

        Truncated episodes are only counted in ``num_truncated`` and are
        otherwise ignored.
        """
        if episode['truncated']:
            # If the episode was truncated, we cannot use it,
            # since we cannot calculate the true return
            self.num_truncated += 1
            return

        states = episode['states']
        actions = episode['actions']
        rewards = episode['rewards']
        T = len(rewards)

        gamma = self.gamma

        # Walk the episode backwards so that each return can be built from
        # the return of the following step: g_t = r_{t+1} + gamma * g_{t+1},
        # starting from g_T = 0.
        g = 0.0
        for t in reversed(range(T)):
            s = states[t]
            a = actions[t]
            g = rewards[t] + gamma * g

            # Every-visit: each occurrence of (s, a) contributes its return
            self.G[s, a] += g
            self.N[s, a] += 1
            self.q[s, a] = self.G[s, a] / self.N[s, a]

        # Reset the state values, since they need to be recomputed
        self.v = None

        # Update statistics
        self.num_steps += len(rewards)
        self.num_episodes += 1

    def policy_evaluation(self, num_episodes, max_steps=None):
        """Collect ``num_episodes`` episodes and update ``q`` with each one.

        Args:
            num_episodes: Number of episodes to sample.
            max_steps: Forwarded to ``rl_util.rollout`` as ``max_steps``.
        """
        for _ in range(num_episodes):
            episode = rl_util.rollout(self.env, self, max_steps=max_steps)
            self.update_q(episode)

    def policy_improvement(self):
        """Make ``pi`` epsilon-greedy with respect to the current ``q``.

        Every action receives probability ``epsilon / m`` (``m`` being the
        number of actions); the remaining ``1 - epsilon`` is split evenly
        among all actions whose value equals the maximum for that state.
        """
        num_s = self.num_states
        num_a = self.num_actions
        epsilon = self.epsilon

        q = self.q

        # Boolean mask of the maximizing actions per state. Exact equality
        # follows the definition of the maximizing set in the pseudocode.
        is_max = q == np.max(q, axis=1, keepdims=True)
        # At least one action attains the maximum, so this is never zero
        num_max = np.sum(is_max, axis=1, keepdims=True)

        # Exploration mass for every action ...
        pi = np.full((num_s, num_a), epsilon / num_a, dtype=float)
        # ... plus the greedy mass shared by all maximizing actions
        pi += (1 - epsilon) * is_max / num_max
        self.pi = pi

        # Reset the state values, since they need to be recomputed
        self.v = None

    # This method is used for the GUI
    # You don't have to understand the code
    def interactive_optimization(self):
        """Generator that drives the agent from the GUI.

        Yields ``RLCmd`` commands and receives the GUI's answers through the
        values sent back into the generator (the chosen option, or a
        sampled episode). Runs until the generator is closed.
        """
        from rl_gui import RLCmd, RLParamsResult

        def update_params(params):
            # Callback handed to RLCmd.Init as params_callback
            self.gamma = params['gamma']

            if params['epsilon'] != self.epsilon:
                self.epsilon = params['epsilon']
                # NOTE(review): self.v is only non-None after value() has
                # been called since the last update, so this gate depends on
                # whether state values were queried -- confirm this is the
                # intended condition.
                if self.v is not None:
                    self.policy_improvement()
                return RLParamsResult.RENDER
            # Implicitly returns None when epsilon did not change

        yield RLCmd.Init(options={'eval': 'Evaluate episode',
                                  'improve': 'Improve policy',
                                  'reset': 'Reset agent'},
                         params={'gamma': ('Discount factor', 'float', self.gamma, 0.0, 1.0 - 1e-4),
                                 'epsilon': ('Epsilon', 'float', self.epsilon, 0.0, 1.0)},
                         params_callback=update_params)

        while True:
            message = f'Processed {self.num_steps} steps in {self.num_episodes} episodes'
            if self.num_truncated > 0:
                message += f' ({self.num_truncated} truncated)'
            option = yield RLCmd.WaitForOption(active=['eval', 'improve', 'reset'],
                                               step='eval', interval=50, message=message)
            if option == 'eval':
                episode = yield RLCmd.WaitForEpisode(max_steps=20000, interval=-1)
                self.update_q(episode)
                #self.policy_improvement()  # TODO MC-PI vs MC-Control
            elif option == 'improve':
                self.policy_improvement()
            elif option == 'reset':
                self.reset()

You can use the following code cell to test your implementation.  
**Important**: After changing your code, execute the above code cell before running the tests.

In [None]:
def test_mc_agent():
    """Generator with the checks for ``MonteCarloAgent``.

    Yields, per section, the section name followed by the check results for
    each case and a ``None`` after every case. The yielded values are
    consumed by ``rl_tests.run_tests``.
    """
    maze = rl_env.default_5x5_maze(model_based=True)
    shared_rng = None

    def reseed():
        # Fresh generator and environment seed so results are reproducible
        nonlocal shared_rng
        shared_rng = np.random.Generator(np.random.PCG64(seed=42))
        maze.reset(seed=42)

    def make_agent(gamma, epsilon):
        # All agents of one section draw from the same generator
        return MonteCarloAgent(maze, gamma, epsilon, rng=shared_rng)

    # --- Section 1: action values after policy evaluation ---
    yield 'update_q()'
    reseed()
    for target_sum in (4.470511, 9.752815, 6.292247):
        agent = make_agent(gamma=0.8, epsilon=0.01)
        agent.policy_evaluation(num_episodes=3)
        array_ok = yield from rl_tests.check_numpy_array(
            agent.q,
            name='self.q',
            shape=(agent.num_states, agent.num_actions),
            dtype=np.floating,
        )
        if array_ok:
            total = np.sum(agent.q)
            matches = np.isclose(total, target_sum, atol=1e-5)
            yield matches, f'The updated action values are incorrect (error = {abs(target_sum - total):.5f})'
        yield None

    # --- Section 2: policy entropy after policy improvement ---
    yield 'policy_improvement()'
    reseed()
    cases = ((0.01, 2.486858), (0.02, 3.367890), (0.05, 5.609687))
    for eps, target_entropy in cases:
        agent = make_agent(gamma=0.8, epsilon=eps)
        agent.policy_evaluation(num_episodes=5)
        agent.policy_improvement()
        array_ok = yield from rl_tests.check_numpy_array(
            agent.pi,
            name='self.pi',
            shape=(agent.num_states, agent.num_actions),
            dtype=np.floating,
        )
        if array_ok:
            pi_entropy = np.sum(-agent.pi * np.log(agent.pi))
            matches = np.isclose(pi_entropy, target_entropy, atol=1e-5)
            yield matches, f'The updated policy is incorrect (error = {abs(target_entropy - pi_entropy):.5f})'
        yield None

# Create the test generator defined above and hand it to rl_tests.run_tests
rl_tests.run_tests(test_mc_agent())

If all of your tests passed, you can see your agent in action in the following code cell.

Occasionally a rendering bug causes the environment to be drawn multiple times. If that happens, you may have to restart the notebook and reopen the browser tab, or restart the editor (e.g. VS Code).

In [None]:
# Create the maze environment that is passed to the GUI
env = rl_env.default_5x5_maze(model_based=True)
# To try the bigger maze instead, uncomment the next line
#env = rl_env.default_8x8_maze(model_based=True)

# The TicTacToe environment is available as well;
# in this case you need to set the render_mode below to 'ansi' (!)
#env = rl_env.TicTacToeEnv()

gamma = 0.9  # discount factor
epsilon = 0.01  # epsilon of the epsilon-greedy policy

# Agents passed to the GUI, keyed by name
agents = {}
agents['Random'] = rl_agent.RandomAgent(env)
agents['Monte Carlo control'] = MonteCarloAgent(env, gamma, epsilon)

rl_gui.RLGui(env, agents=agents, max_steps=1000, render_mode='rgb_array')