## Multi-step Bootstrapping

The TD methods from the last chapter all performed backups by looking one step ahead. The methods from the Monte Carlo chapter performed backups after an entire episode.

Both approaches have their advantages and disadvantages, but there's no reason why we couldn't combine their ideas, by doing multi-step bootstrapping.

## Windy GridWorld

Just like last chapter, we'll use the windy GridWorld environment.

In [3]:
import numpy as np

class ObservableEnvironment:
    def get_states(self):
        """
        Return the set of possible states
        """
        pass
    
    def get_possible_actions(self, state):
        """
        Returns the actions that can be taken from the given state
        """
        pass
    
    def execute_action(self, state, action):
        """
        Returns the new state and the given reward. This does not have to
        be deterministic
        """
        pass
    
    def is_terminal_state(self, state):
        """
        Returns a boolean indicating whether the state is terminal
        """
        pass
    
    def sample(self, policy, state):
        """
        Follows the policy until a terminal state is reached.
        Returns a list of states, actions, and the rewards they gave
        """
        
        result = []
                
        while not self.is_terminal_state(state):
            actions = self.get_possible_actions(state)
            ps = [policy[(state, action)] for action in actions]
            
            action = np.random.choice(actions, p=ps)
            
            new_state, reward = self.execute_action(state, action)
            
            result.append((state, action, reward))
            state = new_state
        
        return result

In [4]:
from itertools import product

class WindyGridWorld(ObservableEnvironment):
    def __init__(self, rewards, wind):
        self.rewards = rewards
        self.wind = wind
        
        n, m = rewards.shape
        self.states = list(product(range(n), range(m)))
        
        self.max_down = n - 1
        self.max_right = m - 1
        
        self.UP = "UP"
        self.DOWN = "DOWN"
        self.LEFT = "LEFT"
        self.RIGHT = "RIGHT"
        
        self.ACTIONS = [self.UP, self.DOWN, self.LEFT, self.RIGHT]
        
    def get_states(self):
        return self.states
    
    def get_possible_actions(self, state):
        i, j = state
        
        actions = []
        
        if i > 0:
            actions.append(self.UP)
            
        if i < self.max_down:
            actions.append(self.DOWN)
        
        if j > 0:
            actions.append(self.LEFT)
        
        if j < self.max_right:
            actions.append(self.RIGHT)
            
        return actions
    
    def execute_action(self, state, action):
        i, j = state        
        
        if action == self.UP:
            i -= 1
            
        if action == self.DOWN:
            i += 1
            
        if action == self.LEFT:
            j -= 1
            
        if action == self.RIGHT:
            j += 1
            
        i -= self.wind[j]
            
        i = max(0, min(i, self.max_down))
        j = max(0, min(j, self.max_right))
        
        return (i, j), self.rewards[(i, j)] - 1
    
    def is_terminal_state(self, state):
        # In this GridWorld we terminate once the reward is reached
        
        i, j = state
        return self.rewards[i, j] > 0

In [9]:
def draw_policy(agent, n, m):
    lookup = {
        "UP": u"↑",
        "DOWN": u"↓",
        "LEFT": u"←",
        "RIGHT": u"→"
    }
    
    result = np.zeros((n, m))
    
    for i in range(n):
        result = ""
        
        for j in range(m):
            state = (i, j)
            actions = agent.env.get_possible_actions(state)      
            num_actions = len(actions)

            QA = { possible_action: agent.Q[(state, possible_action)] for possible_action in actions }
            best_action = max(QA.items(), key=itemgetter(1))[0]
            
            result += lookup[best_action]
        
        print result

In [5]:
R = np.array([
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 1, 0],
    [0, 0, 0, 0, 0, 0]
])

wind = [0, 0, 1, 1, 1, 0]

env = WindyGridWorld(R, wind)

## $n$-step Sarsa

In [6]:
from random import choice
from operator import itemgetter
from Queue import Queue

class SarsaAgent:
    def __init__(self, env, num_steps=1, discount_factor=1., alpha=.4, epsilon=.05):
        self.env = env
        self.num_steps = 1
        self.discount_factor = discount_factor
        self.alpha = alpha
        self.epsilon = epsilon
        
        self.Q = {}
        self.policy = {}
        
        for state in env.get_states():
            actions = env.get_possible_actions(state)
            num_actions = len(actions)
            
            for action in actions:
                self.Q[(state, action)] = 0.
                self.policy[(state, action)] = 1. / num_actions
    
    def learn(self, num_samples, initial_state):
        for _ in range(num_samples):
            S = initial_state
            A = self._choose_action(S)
            
            t = 0
            rewards = Queue()
            reward_sum = 0
            
            while not self.env.is_terminal_state(S):                
                S_next, R = self.env.execute_action(S, A)
                A_next = self._choose_action(S_next)
                
                rewards.put((R, S, A))
                reward_sum += R
                
                if t >= self.num_steps:
                    old_reward, S_old, A_old = rewards.get()

                    # SA R SA
                    self.Q[(S_old, A_old)] += self.alpha * (reward_sum + self.discount_factor * self.Q[(S_next, A_next)] - self.Q[(S_old, A_old)])

                    reward_sum -= old_reward
                    
                self._update_policy()
                
                t += 1
                S, A = S_next, A_next
                
    def _inner_learn(self, S, A):
        return S_next, A_next
                
    def _choose_action(self, S):
        actions = self.env.get_possible_actions(S)
        
        ps = [self.policy[(S, A)] for A in actions]
        A = np.random.choice(actions, p=ps)
        
        return A
    
    def _update_policy(self):
        for state in self.env.get_states():
            actions = self.env.get_possible_actions(state)      
            num_actions = len(actions)
            
            best_action = self._find_best_action(state)
            
            for action in actions:
                self.policy[(state, action)] = self.epsilon / num_actions
                
            self.policy[(state, best_action)] += 1 - self.epsilon
            
    def _find_best_action(self, state):
        QA = { action: self.Q[(state, action)] for action in self.env.get_possible_actions(state) }
        return max(QA.items(), key=itemgetter(1))[0]

The multi-step Sarsa agent with $n = 1$ is the same as the one from the previous chapter. It also arrives at the same final policy:

In [39]:
sarsa_agent = SarsaAgent(env, num_steps=1)
sarsa_agent.learn(num_samples=2000, initial_state=(0, 0))
draw_policy(sarsa_agent, 3, 6)

→→→→→↓
→→→↓↓↓
→→→→→←


As expected, agents that take a larger number of steps arrive at the same policy or an equivalent optimal policy:

In [42]:
sarsa_agent = SarsaAgent(env, num_steps=2)
sarsa_agent.learn(num_samples=2000, initial_state=(0, 0))
draw_policy(sarsa_agent, 3, 6)

→→→→→↓
→→↑↓↓↓
→→→→→←


In [43]:
sarsa_agent = SarsaAgent(env, num_steps=3)
sarsa_agent.learn(num_samples=2000, initial_state=(0, 0))
draw_policy(sarsa_agent, 3, 6)

→→→→→↓
→→→↓↓↓
→→→→→←
