# Introduction

In this demo we will explore a very simple environment. Cheese World is a one-dimensional grid with 4 states and only 2 possible actions: left and right. Arriving at the goal state gives you a reward of 1. Moving left from the start state (state 1 in the figure) stays in the same place, and moving anywhere from the goal state (state 4 in the figure) ends the episode.

<img src="https://github.com/goptavares/intro-to-rl/blob/master/fig/CheeseWorld.png?raw=true" width=680 height=200>

This demo uses a simple random policy to traverse the environment and get to the goal. The value for each state-action pair is calculated with a simple additive strategy, i.e., by adding the rewards obtained for that pair throughout the episode.

Run all cells and familiarize yourself with the code, as it will help you write your own code to solve the exercises.

# Notebook setup

In [0]:
% matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from pylab import *

!git clone https://github.com/goptavares/intro-to-rl.git
%run intro-to-rl/RL_worlds.py
%run intro-to-rl/plot_util.py

# Helper functions

In [0]:
def default_params(environment):
  """
  Build the dictionary of default parameters.
  Args:
    environment: an object corresponding to the environment.
  Returns:
    a dictionary mapping parameter names (strings) to their default
        values; the environment itself is stored under 'environment'.
  """
  return {
      'environment': environment,
      'alpha': 0.1,  # learning rate
      'beta': 10,  # inverse temperature
      'epsilon': 0.05,  # epsilon-greedy policy
      'epsilon_decay': 0.9,
      'gamma': 1.0,  # no discounting
  }

In [0]:
def init_state():
  """
  Return the state in which every episode starts.
  Returns:
    an integer; in Cheese World the initial state is 0.
  """
  initial_state = 0
  return initial_state

In [0]:
def update_state(state, action, params):
  """
  Ask the environment for the outcome of taking an action in a state.
  Args:
    state: integer corresponding to the current state.
    action: integer corresponding to the action taken.
    params: a dictionary containing the default parameters; the
        environment is read from params['environment'].
  Returns:
    an integer corresponding to the next state;
    an integer corresponding to the reward received.
  """
  environment = params['environment']
  # get_outcome is expected to yield exactly two items: (next state, reward).
  outcome = environment.get_outcome(state, action)
  next_state, reward = outcome
  return next_state, reward

# Random policy and simple additive value update

In [0]:
def call_policy(params):
  """
  Choose an action with a random policy.
  Args:
    params: a dictionary containing the default parameters; only the
        n_actions attribute of params['environment'] is used.
  Returns:
    the action drawn by randint(n_actions).
  """
  # NOTE(review): randint is presumably numpy.random.randint, brought in by
  # the notebook's `from pylab import *` -- confirm against the setup cell.
  n_actions = params['environment'].n_actions
  action = randint(n_actions)
  return action

In [0]:
def update_value(prev_state, action, reward, value):
  """
  Apply the additive update to the value function.
  Args:
    prev_state: an integer corresponding to the previous state.
    action: an integer corresponding to the action taken.
    reward: a float corresponding to the reward received.
    value: a matrix indexed by state and action.
  Returns:
    the same value object that was passed in, after its
        (prev_state, action) entry has been increased by reward.
  """
  # Additive value update: accumulate the reward on the visited pair.
  visited = (prev_state, action)
  value[visited] += reward
  return value

# Explore Cheese World

In [0]:
def run_learning(value, params, n_episodes, max_steps):
  """
  Run a number of episodes, updating the value function after each step.

  Every episode starts from init_state() and repeatedly draws an action
  with call_policy, applies it with update_state and updates the value
  function with update_value. An episode stops when the next state is None
  or after max_steps steps, whichever comes first.
  Args:
    value: a matrix indexed by state and action.
    params: a dictionary containing the default parameters.
    n_episodes: integer, number of episodes to run.
    max_steps: integer, maximum number of steps to take in each episode.
  Returns:
    a numpy array of length n_episodes whose entries are the total
        rewards obtained in each episode;
    the updated value function (matrix indexed by state and action).
  """
  reward_sums = np.zeros(n_episodes)

  # Loop over episodes. range is used instead of xrange because xrange only
  # exists on Python 2, while range works on both major versions.
  for episode in range(n_episodes):
    # Initialize state.
    state = init_state()
    step = 0
    reward_sum = 0

    # Make sure to break after max number of steps.
    while step < max_steps:
      # Get action from policy.
      action = call_policy(params)
      # Update state based on action.
      next_state, reward = update_state(state, action, params)
      # Update value function for the state the action was taken from.
      value = update_value(state, action, reward, value)
      state = next_state
      # Sum the rewards obtained.
      reward_sum += reward
      step += 1
      # A next state of None marks the end of the episode; identity
      # comparison is the correct test for None.
      if next_state is None:
        break
    reward_sums[episode] = reward_sum
  return reward_sums, value

In [0]:
# Build the environment and the default parameter set that wraps it.
env = cheese_world()
params = default_params(environment=env)

# Decision-maker: choose parameter values.
params.update({
    'epsilon': 0.01,
    'alpha': 0.5,
    'beta': 10,
    'gamma': 0.8,
})

# Define number of episodes and maximum steps per episode.
n_episodes, max_steps = 5, 10

# Initialization.
# Start with uniform value function: one entry per (state, action) pair,
# all equal to 1.
value = np.ones((env.n_states, env.n_actions))

# Run learning.
reward_sums, value = run_learning(value, params, n_episodes, max_steps)

# Plot the resulting values and the reward obtained in each episode.
fig = plot_state_action_values(env, value)
fig = plot_heatmap_max_val(env, value)
fig = plot_rewards(n_episodes, reward_sums, average_range=1)