<a href="https://colab.research.google.com/github/decomiteA/ReachRLToolbox/blob/main/TestRL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook models reaching movements using reinforcement learning algorithms.

In [None]:
#@title Imports

import numpy as np 
import matplotlib.pyplot as plt 
from scipy.signal import convolve as conv

In [None]:
#@title Plot handling

def plot_state_action_values(env, value, ax=None):
  """
  Draw one curve per action showing that action's value in every state.

  Args:
    env: environment object; ``env.n_states`` and ``env.n_actions`` are read.
    value: 2-D array indexed as ``value[state, action]``.
    ax: axes to draw on; a new figure/axes pair is created when omitted.
  """
  if ax is None:
    _, ax = plt.subplots()

  state_axis = range(env.n_states)
  curve_style = dict(marker='o', linestyle='--')
  for action_idx in range(env.n_actions):
    ax.plot(state_axis, value[:, action_idx], **curve_style)

  ax.set(xlabel='States', ylabel='Values')
  # Legend entries follow the action order 0..3: Right, Up, Left, Down.
  ax.legend(['R','U','L','D'], loc='lower right')

def plot_quiver_max_action(env,value,ax=None):
  """
  Generate plot showing action of maximum value or maximum probability for each state.

  Each state is drawn as one cell of a ``dim_y`` x ``dim_x`` grid, with state 0
  in the bottom-left cell and state indices increasing left to right, then
  bottom to top. An arrow in each cell points in the direction of the arg-max
  action (0 = right, 1 = up, 2 = left, 3 = down).

  Args:
    env: environment object; ``env.dim_x`` and ``env.dim_y`` are read.
    value: 2-D array indexed as ``value[state, action]``.
    ax: axes to draw on; a new figure/axes pair is created when omitted.
  """
  if ax is None:
    fig, ax = plt.subplots()

  # Arrow anchors are the cell centres. Cells span [i, i+1] on both axes (the
  # minor grid lines below are at integers), so both coordinates need the
  # +0.5 shift; without it on Y the arrows sit on the horizontal grid lines.
  X = np.tile(np.arange(env.dim_x), [env.dim_y, 1]) + 0.5
  Y = np.tile(np.arange(env.dim_y)[::-1][:, np.newaxis], [1, env.dim_x]) + 0.5

  # Best action per state, laid out as a grid and flipped vertically so that
  # array row 0 corresponds to the top row of the plot (matching Y above).
  which_max = np.reshape(value.argmax(axis=1), (env.dim_y, env.dim_x))
  which_max = which_max[::-1, :]

  # Translate action indices into unit arrow components.
  U = np.zeros(X.shape)
  V = np.zeros(Y.shape)
  U[which_max == 0] = 1   # right
  V[which_max == 1] = 1   # up
  U[which_max == 2] = -1  # left
  V[which_max == 3] = -1  # down

  ax.quiver(X, Y, U, V)
  ax.set(
      title='Maximum value/probability actions',
      xlim=[-0.5, env.dim_x+0.5],
      ylim=[-0.5, env.dim_y+0.5],
  )
  # Major ticks label the cell centres; minor ticks carry the cell borders.
  ax.set_xticks(np.linspace(0.5, env.dim_x-0.5, num=env.dim_x))
  ax.set_xticklabels(["%d" % x for x in np.arange(env.dim_x)])
  ax.set_xticks(np.arange(env.dim_x+1), minor=True)
  ax.set_yticks(np.linspace(0.5, env.dim_y-0.5, num=env.dim_y))
  # Row labels are the index of the first (left-most) state in each row.
  ax.set_yticklabels(["%d" % y for y in np.arange(0, env.dim_y*env.dim_x,
                                                  env.dim_x)])
  ax.set_yticks(np.arange(env.dim_y+1), minor=True)
  ax.grid(which='minor',linestyle='-')

  
def plot_heatmap_max_val(env, value, ax=None):
  """
  Draw a heatmap of the best value available in every state.

  Args:
    env: environment object; ``env.dim_x``, ``env.dim_y`` and ``env.name``
      are read.
    value: either a 1-D array with one value per state, or a 2-D array
      indexed as ``value[state, action]`` (reduced with a max over actions).
    ax: axes to draw on; a new figure/axes pair is created when omitted.

  Returns:
    The image object returned by ``ax.imshow`` (usable for a colorbar).
  """
  if ax is None:
    _, ax = plt.subplots()

  grid_shape = (env.dim_y, env.dim_x)
  per_state = value if value.ndim == 1 else value.max(axis=1)
  # Flip vertically so the first row of states ends up at the bottom.
  heat = np.reshape(per_state, grid_shape)[::-1, :]

  image = ax.imshow(heat, aspect='auto', interpolation='none', cmap='afmhot')
  ax.set(title='Maximum value per state')

  ax.set_xticks(np.linspace(0, env.dim_x-1, num=env.dim_x))
  ax.set_xticklabels(["%d" % col for col in np.arange(env.dim_x)])
  ax.set_yticks(np.linspace(0, env.dim_y-1, num=env.dim_y))
  if env.name != 'windy_cliff_grid':
    # Label each row with the index of its first state, top row first.
    row_starts = np.arange(0, env.dim_y*env.dim_x, env.dim_x)
    row_labels = ["%d" % start for start in row_starts]
    ax.set_yticklabels(row_labels[::-1])
  return image


def plot_rewards(n_episodes, rewards, average_range=10, ax=None):
  """
  Plot the smoothed total reward collected per episode.

  The rewards are averaged with a box window of ``average_range`` samples
  and one point is drawn every ``average_range`` episodes.

  Args:
    n_episodes: number of episodes to cover on the x axis.
    rewards: sequence of per-episode reward totals.
    average_range: width of the box window and spacing of the plotted points.
    ax: axes to draw on; a new figure/axes pair is created when omitted.
  """
  if ax is None:
    _, ax = plt.subplots()

  # Box-filter moving average; mode='same' keeps the output as long as the input.
  box = np.ones(average_range)
  running_mean = conv(rewards, box, mode='same') / average_range

  episode_ticks = range(0, n_episodes, average_range)
  sampled_mean = running_mean[:n_episodes:average_range]
  ax.plot(episode_ticks, sampled_mean, marker='o', linestyle='--')
  ax.set(xlabel='Episodes', ylabel='Total reward')


def plot_performance(env, value, reward_sums):
  """
  Draw a 2x2 summary figure for a learned value table.

  Panels: per-action values for every state (top-left), arrow map of the
  best action per state (top-right), smoothed reward per episode
  (bottom-left) and heatmap of the best value per state (bottom-right).

  Args:
    env: environment object, forwarded to the individual plot helpers.
    value: value table, forwarded to the individual plot helpers.
    reward_sums: total reward per episode; assumed to hold exactly one entry
      per episode, so its length is used as the episode count.
  """
  fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
  plot_state_action_values(env, value, ax=axes[0,0])
  plot_quiver_max_action(env, value, ax=axes[0,1])
  # Derive the episode count from the data instead of relying on a global
  # `n_episodes` that is not defined by this function.
  plot_rewards(len(reward_sums), reward_sums, ax=axes[1,0])
  im = plot_heatmap_max_val(env, value, ax=axes[1,1])
  fig.colorbar(im)

The section of code below defines the different environments as objects with their own methods. Each environment is explained in the comments.

In [None]:
#@title Environments definition

class ToyExample:
  """
  Toy example: a 4x4 grid world with a single goal (terminal) state, state 14.

  States are numbered row by row, starting from the bottom-left corner:
  12 13 14 15
  8  9  10 11
  4  5  6  7
  0  1  2  3

  Actions: 0 = right, 1 = up, 2 = left, 3 = down. Every move taken from a
  non-goal state yields a reward of -1; moving into a wall leaves the state
  unchanged.
  """
  def __init__(self):
    self.name="ToyExample1"
    self.n_states=16
    self.n_actions=4
    # Grid width and height; dim_x*dim_y must equal n_states (4x4 grid).
    self.dim_x=4
    self.dim_y=4
    # Start from a uniformly drawn state.
    self.init_state=np.random.choice(range(self.n_states))

  def get_outcome(self,state,action):
    """
    Return the outcome of taking an action in a given state.

    Args:
      state: current state index (0..15).
      action: 0 = right, 1 = up, 2 = left, 3 = down.

    Returns:
      ``(next_state, reward)``. From the goal state (14) the result is
      ``(None, 0)`` whatever the action. For an unknown action a message is
      printed and ``(None, None)`` is returned.
    """
    if state==14:  # goal state: the episode ends, no further penalty
      return None, 0

    reward = -1 #we penalise any other action
    if action==0:   #move right
      # states with state%4==3 lie on the right border
      next_state = state if state%4==3 else state+1
    elif action==1: #move up
      # states 12..15 form the top row
      next_state = state if state>11 else state+4
    elif action==2: #move left
      # states with state%4==0 lie on the left border
      next_state = state if state%4==0 else state-1
    elif action==3: #move down
      # states 0..3 form the bottom row
      next_state = state if state<4 else state-4
    else:
      print("Incorrect action, the selected action should be between 0 and 3")
      return None, None
    return int(next_state), reward

  def get_all_outcomes(self):
    """
    Tabulate the outcome of every (state, action) pair.

    Returns:
      dict mapping ``(state, action)`` to a one-element list
      ``[(1, next_state, reward)]``; the leading 1 is presumably the
      transition probability (the environment is deterministic).
    """
    outcomes={}
    for state in range(self.n_states):
      for action in range(self.n_actions):
        next_state,reward = self.get_outcome(state,action)
        outcomes[state,action]=[(1,next_state,reward)]
    return outcomes

  
