<a href="https://colab.research.google.com/github/eemlcommunity/PracticalSessions2023/blob/omardd%2Frl/part1_value_iteration_and_q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# [EEML 2023] Reinforcement Learning Tutorial - Part 1

## Value Iteration & Q-Learning


# Colab Setup

In [None]:
# Colab setup: install dependencies (only when running on Google Colab),
# then import rlberry and its video helper.
from IPython import get_ipython

# The installs below run only if the current IPython shell is a Colab one.
# NOTE: every command redirects stdout and stderr to /dev/null, so a failed
# install is silent here and only surfaces at the imports further down.
if 'google.colab' in str(get_ipython()):
  # install rlberry library (https://github.com/rlberry-py/rlberry)
  !pip install rlberry==0.5.0 > /dev/null 2>&1

  # install ffmpeg-python for saving videos
  !pip install ffmpeg-python > /dev/null 2>&1

  # packages required to show video
  !pip install pyvirtualdisplay > /dev/null 2>&1
  !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

# Check rlberry version (the Colab branch above pins 0.5.0)
import rlberry
print(rlberry.__version__)

# Create directory for saving videos
# (any error message, e.g. if the directory already exists, is discarded)
!mkdir videos > /dev/null 2>&1

# Initialize display and import function to show videos
import rlberry.colab_utils.display_setup
from rlberry.colab_utils.display_setup import show_video

In [None]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from rlberry.envs import GridWorld

In [None]:
def get_env(easy=False):
  """Build one of the two grid-world MDPs used in this notebook.

  Args:
    easy: bool
        if True, return a 3x3 grid without walls whose rewarding terminal
        state is (2, 2). Otherwise return a 5x7 grid with walls at rows
        0 to 3 of column 4, whose rewarding terminal state is (0, 6).

  Returns:
    GridWorld instance. Both variants use success_probability=0.9 and a
    reward of 1.0 at their terminal state.
  """
  if easy:
    layout = dict(
        nrows=3,
        ncols=3,
        walls=(),
        reward_at={(2, 2): 1.0},
        success_probability=0.9,
        terminal_states=((2, 2),),
    )
  else:
    layout = dict(
        nrows=5,
        ncols=7,
        reward_at={(0, 6): 1.0},
        walls=((0, 4), (1, 4), (2, 4), (3, 4)),
        success_probability=0.9,
        terminal_states=((0, 6),),
    )
  return GridWorld(**layout)

def render_policy(env, policy=None, horizon=50):
  """Run a policy in an environment and display a video of the rollout.

  The rollout lasts at most `horizon` steps and stops earlier if the
  environment reports termination or truncation.

  Args:
    env: GridWorld
        environment in which the policy is executed
    policy: np.array
        array of size (Ns) giving the action to take in each state.
        When None, actions are sampled from env.action_space.
    horizon: int
        upper bound on the number of environment steps.
  """
  video_path = './videos/gw.mp4'
  act_randomly = policy is None

  env.enable_rendering()
  state, info = env.reset()  # initial state
  for _ in range(horizon):
    action = env.action_space.sample() if act_randomly else policy[state]
    state, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
      break  # episode is over

  # write the recorded frames to disk, then reset the rendering state
  env.save_video(video_path, framerate=5)
  env.clear_render_buffer()
  env.disable_rendering()

  # display the saved video
  show_video(video_path)

In [None]:
# Create an environment (default, i.e. easy=False) and visualize it
env = get_env()
render_policy(env)  # no policy given: visualize a random policy

# The reward function and transition probabilities can be accessed through
# the R and P attributes, indexed as R[s, a] and P[s, a, s']:
print(f"Shape of the reward array = (S, A) = {env.R.shape}")
print(f"Shape of the transition array = (S, A, S) = {env.P.shape}")
print(f"Reward at (s, a) = (1, 0): {env.R[1, 0]}")
print(f"Prob[s\'=2 | s=1, a=0]: {env.P[1, 0, 2]}")
print(f"Number of states and actions: {env.Ns}, {env.Na}")

# The states in the gridworld correspond to (row, col) coordinates.
# The environment provides a mapping between (row, col) and the index of
# each state:
print(f"Index of state (1, 0): {env.coord2index[(1, 0)]}")
print(f"Coordinates of state 5: {env.index2coord[5]}")

# Value Iteration


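Complete the function `value_iteration` below: inside the `while` loop, compute `TQ`, the result of applying the Bellman optimality operator to the current `Q`.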

### What to implement

The application of the Bellman optimality operator $T^*$ to the current Q-function:

$$
T^* Q(s, a) =  R(s, a) + \gamma \sum_{s'} P(s'|s,a) \max_{a'} Q(s', a')
$$

In [None]:
def value_iteration(P, R, gamma=0.9, tol=1e-3):
    """
    Run value iteration

    Repeatedly applies the Bellman optimality operator to a Q-function
    initialized at zero, until two successive Q-functions differ by at
    most `tol` in max-norm.

    Parameters
    ----------
    P: np.array
        transition matrix (Ns, Na, Ns), with P[s, a, s'] = Prob[s' | s, a]
    R: np.array
        reward matrix (Ns, Na)
    gamma: float
        discount factor
    tol: float
        precision of the solution: the loop stops as soon as
        max |TQ - Q| <= tol

    Returns
    -------
    Q, greedy_policy, Qfs
      Q: final Q-function (at iteration n)
      greedy_policy: greedy policy wrt Qn
      Qfs: all Q-functions generated by the algorithm (for visualization),
        starting with the all-zero initial one. Each entry is an
        independent array.
    """
    Ns, Na = R.shape
    Q = np.zeros((Ns, Na))
    TQ = np.zeros((Ns, Na))
    # Store copies so that the history never shares memory with Q or TQ.
    Qfs = [Q.copy()]
    err = np.inf
    while err > tol:
      # ====================================================
      # YOUR IMPLEMENTATION HERE
      # compute TQ ...
      # (either fill TQ in place or bind TQ to a new array)
      # ====================================================
      err = np.abs(TQ - Q).max()
      # Copy instead of aliasing: with `Q = TQ`, an in-place update of TQ in
      # the next sweep would also modify Q, so err would be 0 and the loop
      # would stop too early.
      Q = TQ.copy()
      Qfs.append(Q.copy())

    greedy_policy = np.argmax(Q, axis=1)
    return Q, greedy_policy, Qfs

In [None]:
#
# Running Value Iteration
#

# Parameters
tol = 1e-5
gamma = 0.99

# Environment
env = get_env()

# run value iteration to obtain Q-values
VI_Q, VI_greedypol, all_qfunctions = value_iteration(
    env.P, env.R, gamma=gamma, tol=tol)

# render the policy
print("Greedy policy obtained from value iteration")
render_policy(env, VI_greedypol)

# show the error between the computed V-functions and the final V-function
# (which should be approximately the optimal one, if correctly implemented)
# as a function of the number of iterations.
# V(s) is obtained from each Q-function as max_a Q(s, a).
final_V = all_qfunctions[-1].max(axis=1)
norms = [ np.abs(q.max(axis=1) - final_V).max() for q in all_qfunctions]
plt.plot(norms)
plt.xlabel('Iteration')
plt.ylabel('Error')
plt.title("Value Iteration Convergence")
plt.show()

# Q-Learning

### What to implement
Finish the implementation of the function ``q_learning``, which takes an environment as input, runs Q-learning for $T$ time steps, and returns $Q_T$.

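**Implement the temporal-difference error used to update the Q function**:

$$\delta_t = r_t + \gamma \max_a Q_t(s_{t+1}, a) - Q_t(s_t, a_t)$$

The update itself, $Q_{t+1}(s_t, a_t) = Q_t(s_t, a_t) + \alpha_t(s_t, a_t)\, \delta_t$, is already applied in the code right after the part you complete.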

### What you can play with
You can test different learning rates:
  * $\alpha_t(s, a) =$ constant in $]0, 1[$
  * $\alpha_t(s, a) = \frac{1}{\text{number of visits to} (s, a)}$
  * others?

You can also test different initializations of the Q function and try different values of $\varepsilon$ in the $\varepsilon$-greedy exploration.




In [None]:
def q_learning(env, n_iterations, gamma, store_q_interval):
    """
    Implementation of Q-Learning

    Interacts with `env` for `n_iterations` steps using epsilon-greedy
    exploration and applies one Q-update per step. The environment is
    reset whenever a step reports termination or truncation.

    Parameters
    ----------
    env: gym.Env
        environment; must expose the reward array `R` of shape (Ns, Na)
    n_iterations: int
        number of Q-learning iterations
    gamma: float
      discount factor
    store_q_interval: int
      interval (in number of iterations) to store q functions


    Returns
    -------
    Q, greedy_policy, Qfs, N_visits
      Q: final Q-function (at iteration n)
      greedy_policy: greedy policy wrt Qn
      Qfs: Q-functions stored along the run (for visualization). Qfs[0] is
        the initial Q-function and Qfs[k] is the Q-function after
        k * store_q_interval iterations.
      N_visits: number of visits to each state-action pair
    """
    Ns, Na = env.R.shape
    Q = np.zeros((Ns, Na))  # can we improve this initialization?
    N = np.zeros((Ns, Na))  # number of visits to each (s, a)

    Qfs = [Q.copy()]  # snapshot 0: the initial Q-function

    # epsilon for exploration (you can change it, make it depend on time, etc.)
    epsilon = 0.5

    state, info = env.reset()
    for tt in tqdm(range(n_iterations), desc="running q_learning"):
      # epsilon-greedy exploration
      if np.random.uniform() < epsilon:  # happens with prob epsilon
        action = env.action_space.sample()
      else:
        action = Q[state, :].argmax()

      # take action, observe next state and reward
      next_state, reward, terminated, truncated, info = env.step(action)

      # ====================================================
      # YOUR IMPLEMENTATION HERE
      #
      # compute delta_t from (state, action, reward, next_state)
      delta = 0 # ...
      # ====================================================

      # update Q
      alpha = 0.1
      Q[state, action] += alpha*delta

      # update number of visits
      N[state, action] += 1

      # update state
      if terminated or truncated:
        state, _ = env.reset()
      else:
        state = next_state

      # store Q function after every store_q_interval completed iterations
      # (tt is 0-based, hence tt + 1; this avoids storing a near-duplicate
      # of the initial Q-function right after the first iteration)
      if ((tt + 1) % store_q_interval) == 0:
        Qfs.append(Q.copy())
    greedy_policy = np.argmax(Q, axis=1)
    return Q, greedy_policy, Qfs, N

In [None]:
#
# Running Q-Learning
#

# Parameters
store_q_interval = 1000   #@param {type:"integer"}
n_iterations = 2500000  #@param {type:"integer"}
gamma = 0.99  #@param {type:"number"}
# precision of the value-iteration ground truth; defined here so this cell
# does not depend on a variable left over from an earlier cell
tol = 1e-5  #@param {type:"number"}
easy_enviroment = False  #@param {type:"boolean"}

# Environment
# Start with easy=True, then try easy=False
env = get_env(easy=easy_enviroment)

# Get ground truth from Value Iteration
VI_Q, _, _ = value_iteration(env.P, env.R, gamma=gamma, tol=tol)
VI_V = VI_Q.max(axis=1)

# run Q-learning to obtain Q-values
QL_Q, QL_greedypol, all_qfunctions_ql, N_visits =  q_learning(
    env, n_iterations, gamma, store_q_interval)

# render the policy
print("Greedy policy obtained with Q Learning")
render_policy(env, QL_greedypol)

# show the error between the V functions obtained in Q Learning, and the
# final V function obtained with Value Iteration
norms = [ np.abs(q.max(axis=1) - VI_V).max() for q in all_qfunctions_ql]
# The first stored Q-function is the initial one (iteration 0) and the
# following ones are roughly store_q_interval iterations apart, so the axis
# starts at 0. A separate name keeps the integer `n_iterations` parameter
# from being overwritten by an array.
iteration_axis = np.arange(len(norms)) * store_q_interval
plt.plot(iteration_axis, norms)
plt.xlabel('Iteration')
plt.ylabel('Error')
plt.title("Q-Learning Convergence")
plt.show()