# Introduction

In this tutorial, we will extend some of the TD algorithms we implemented in the previous tutorial.

# Notebook setup

## Instructions

- Import numpy, scipy and matplotlib
- Configure inline plots
- Import helper modules

In [2]:
% matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from pylab import *

# Import definitions of the environments.
import RL_worlds as worlds

# Import helper functions for plotting.
from plot_util import *

In [3]:
def init_state(params):
    """
    Pick the state in which an episode starts.
    Args:
        params: a dictionary containing the default parameters; only
            params['environment'].name is read here.
    Returns:
        the integer start state of the named environment, or None when
        the environment name is not one of the worlds listed below.
    """
    # Start state of every known world, keyed by environment name.
    start_states = {
        'windy_cliff_grid': 0,
        'n_armed_bandit': 0,
        'cheese_world': 0,
        'cliff_world': 0,
        'quentins_world': 54,
    }
    return start_states.get(params['environment'].name)

def update_state(state, action, params):
    """
    Ask the environment for the outcome of taking an action in a state.
    Args:
        state: integer corresponding to the current state.
        action: integer corresponding to the action taken.
        params: a dictionary containing the default parameters; the
            object stored under 'environment' must provide get_outcome.
    Returns:
        a pair (next state, reward) as produced by the environment's
        get_outcome(state, action).
    """
    outcome = params['environment'].get_outcome(state, action)
    # Unpacking makes anything other than a two-element outcome fail here.
    new_state, payoff = outcome
    return new_state, payoff
    
def call_policy(state, value, params):
    """
    Choose an action with the policy named in params['policy'].
    Args:
        state: integer corresponding to the current state.
        value: a matrix indexed by state and action.
        params: a dictionary containing the default parameters.
    Returns:
        an integer corresponding to the action chosen by the policy.
    """
    policy_name = params['policy']
    if policy_name == 'epsilon_greedy':
        chosen = epsilon_greedy(state, value, params)
    elif policy_name == 'softmax':
        chosen = softmax(state, value, params)
    else:
        # Any other policy name falls back to a uniformly random action
        # index in [0, n_actions).
        chosen = randint(params['environment'].n_actions)
    return chosen

def update_value(prev_state, action, reward, state, value, params):
    """
    Update the value function with the configured learning rule.
    Args:
        prev_state: an integer corresponding to the previous state.
        action: an integer corresponding to the action taken.
        reward: a float corresponding to the reward received.
        state: an integer corresponding to the current state;
          should be None if the episode ended.
        value: a matrix indexed by state and action.
        params: a dictionary containing the default parameters;
          params['learning_rule'] selects 'q_learning' or 'sarsa'.
    Returns:
        the updated value function (matrix indexed by state and action).
        If the learning rule is not recognized, a message is printed and
        the value function is returned unchanged.
    """
    rule = params['learning_rule']
    if rule == 'q_learning':
        # off policy learning
        return q_learning(prev_state, action, reward, state, value, params)
    if rule == 'sarsa':
        # on policy learning
        return sarsa(prev_state, action, reward, state, value, params)

    # Unknown rule: report it, but hand the value function back untouched.
    # Falling off the end here would return None, and a caller that assigns
    # the result would lose its value table.
    print('Learning rule not recognized')
    return value

def default_params(environment):
    """
    Build the dictionary of default parameters for an environment.
    Args:
        environment: an object corresponding to the environment; its
            name attribute selects the temporal discount factor.
    Returns:
        a dictionary containing the default parameters, where the keys
            are strings (parameter names). The 'gamma' entry is only
            present when the environment name is one of those listed
            below.
    """
    # Temporal discount factor of each known world (1.0 means no discounting).
    gamma_by_world = {
        'windy_cliff_grid': 0.6,
        'n_armed_bandit': 0.9,
        'cliff_world': 1.0,
        'cheese_world': 0.5,
        'quentins_world': 0.9,
    }

    params = {
        'environment': environment,
        'alpha': 0.1,  # learning rate
        'beta': 10,  # inverse temperature
        'policy': 'epsilon_greedy',
        'epsilon': 0.05,  # epsilon-greedy policy
        'learning_rule': 'q_learning',
        'epsilon_decay': 0.9,
    }

    world = environment.name
    if world in gamma_by_world:
        params['gamma'] = gamma_by_world[world]

    return params

# Exercise 1: $\epsilon$ decay

1. Modify the code you wrote for the $\epsilon$-greedy policy so that $\epsilon$ (the proportion of random choices) decreases over time.
2. Test the new policy with different learning algorithms. How does the decay affect your results?

# Exercise 2: Sarsa($\lambda$)

1. You can improve the performance of TD algorithms by allowing rewards to propagate backward faster. Modify your Sarsa algorithm from the previous tutorial to obtain Sarsa($\lambda$), which uses eligibility traces to propagate the strengthening effect of each reward to the whole sequence of actions that led to it.
2. To explore the effect of eligibility traces on learning, run Sarsa($\lambda$) on Quentin's world, testing different combinations of $\lambda$ and $\alpha$ values. How does $\lambda$ affect the efficiency of Sarsa?