# Monte Carlo Methods

## Observable environments

In contrast to the DP chapter, we now work with environments where we don't know about all dynamics. Instead, we can interact with the environment and observe its reactions. This allows us to sample from the environment.

Monte Carlo methods perform a lot of these sampling steps, and try to draw conclusions from the results. By the law of large numbers, averaging the returns from many samples converges to the true expected value.

One important constraint is that the methods from this chapter only work in environments with terminal states.

In [149]:
import numpy as np

class ObservableEnvironment:
    def get_states(self):
        """
        Return the set of possible states
        """
        pass
    
    def get_possible_actions(self, state):
        """
        Returns the actions that can be taken from the given state
        """
        pass
    
    def execute_action(self, state, action):
        """
        Returns the new state and the given reward. This does not have to
        be deterministic
        """
        pass
    
    def is_terminal_state(self, state):
        """
        Returns a boolean indicating whether the state is terminal
        """
        pass
    
    def sample(self, policy, state):
        """
        Follows the policy until a terminal state is reached.
        Returns a list of states, actions, and the rewards they gave
        """
        
        result = []
        
        while not self.is_terminal_state(state):
            actions = self.get_possible_actions(state)
            ps = [policy[(state, action)] for action in actions]
            
            action = np.random.choice(actions, p=ps)
            
            new_state, reward = self.execute_action(state, action)
            
            result.append((state, action, reward))
            state = new_state
        
        return result

## BlackJack

In this chapter, we try to find an optimal policy to play a simplified version of BlackJack. The game is described in detail in Example 5.1 of Sutton & Barto's book.

In [145]:
from itertools import product
from random import choice

class BlackJack(ObservableEnvironment):
    def __init__(self):
        self._init_deck()
        
        self.HIT = 0
        self.STICK = 1
        
        self.WINNING_STATE = (21, 0, 0)
        self.LOSING_STATE = (0, 21, 0)
        self.DRAWING_STATE = (21, 21, 0)
        self.terminal_states = [self.WINNING_STATE, self.LOSING_STATE, self.DRAWING_STATE]
        
        self.WON = (self.WINNING_STATE, 1)
        self.LOST = (self.LOSING_STATE, -1)
        self.DRAW = (self.DRAWING_STATE, 0)
        
    def _init_deck(self):
        # Ace: 1
        # Numbers: 2 to 10
        # Jack/Queen/King: 10
        self.deck = range(1, 10 + 1) + [10] * 3
        
    def get_states(self):
        player_state = range(12, 21 + 1)
        dealer_state = range(1, 10 + 1)
        usable_ace = [0, 1]
        
        return list(product(player_state, dealer_state, usable_ace)) + self.terminal_states
    
    def get_possible_actions(self, state):
        player, dealer, usable_ace = state
        return [self.HIT, self.STICK]
    
    def execute_action(self, state, action):
        player, dealer, usable_ace = state
        
        if action == self.HIT:
            new_card = self._sample_card()
            player += new_card
            
            if new_card == 1:
                usable_ace = 1
            
            if player > 21:
                if usable_ace == 1:
                    player -= 10
                    usable_ace = 0
                else:
                    return self.LOST
        elif action == self.STICK:
            while dealer <= 17:
                dealer += self._sample_card()
                
            if player < dealer <= 21:
                return self.LOST
            else:
                return self.WON
        
        state = (player, dealer, usable_ace)
        return state, 0
    
    def _sample_card(self):
        return choice(range(1, 10 + 1))
    
    def is_terminal_state(self, state):
        return state in self.terminal_states

## On-policy

### First-visit

In [146]:
class Agent:
    def print_policy(self):
        policy = self.policy
        
        for state in self.env.get_states():
            if self.env.is_terminal_state(state):
                continue
            
            actions = self.env.get_possible_actions(state)                
            QA = { action: self.Q[(state, action)] for action in actions }
            best_action = max(QA.items(), key=itemgetter(1))[0]
            
            print state, best_action

In [147]:
from operator import itemgetter

class OnPolicyFirstVisitMonteCarloAgent(Agent):
    def __init__(self, env, epsilon=.05):
        self.env = env
        self.epsilon = epsilon
        
        self.Q = {}
        self.returns = {}
        self.policy = {}
        
        for state in self.env.get_states():
            actions = self.env.get_possible_actions(state)
            num_actions = len(actions)
            
            for action in actions:
                self.Q[(state, action)] = 0
                self.returns[(state, action)] = []
                self.policy[(state, action)] = 1. / num_actions
        
    def learn(self, num_samples):
        for _ in range(num_samples):
            episode = self.env.sample(self.policy, self._get_start_state())
            
            if len(episode) == 0:
                continue
            
            returns = {}
            reward_after = 0
            
            for state, action, reward in reversed(episode):
                reward_after += reward
                returns[(state, action)] = reward_after
                
            for (state, action), ret in returns.items():
                self.returns[(state, action)].append(ret)
                self.Q[(state, action)] = mean(self.returns[(state, action)])
            
            states, _, _ = zip(*episode)
            for state in set(states):
                actions = self.env.get_possible_actions(state)
                num_actions = len(actions)
                
                QA = { action: self.Q[(state, action)] for action in actions }
                best_action = max(QA.items(), key=itemgetter(1))[0]
                
                for action in actions:
                    self.policy[(state, action)] = self.epsilon / num_actions
            
                self.policy[(state, best_action)] += 1 - self.epsilon
            
    def _get_start_state(self):
        return choice(self.env.get_states())

By learning from 500,000 samples we can learn the optimal policy. The same policy is shown in the book, but visualized a bit nicer.

In [148]:
env = BlackJack()
agent = OnPolicyFirstVisitMonteCarloAgent(env)

agent.learn(num_samples=500000)

agent.print_policy()

(12, 1, 0) 0
(12, 1, 1) 0
(12, 2, 0) 0
(12, 2, 1) 0
(12, 3, 0) 0
(12, 3, 1) 0
(12, 4, 0) 0
(12, 4, 1) 0
(12, 5, 0) 0
(12, 5, 1) 0
(12, 6, 0) 0
(12, 6, 1) 0
(12, 7, 0) 0
(12, 7, 1) 0
(12, 8, 0) 0
(12, 8, 1) 0
(12, 9, 0) 0
(12, 9, 1) 0
(12, 10, 0) 0
(12, 10, 1) 0
(13, 1, 0) 0
(13, 1, 1) 0
(13, 2, 0) 0
(13, 2, 1) 0
(13, 3, 0) 0
(13, 3, 1) 0
(13, 4, 0) 0
(13, 4, 1) 0
(13, 5, 0) 0
(13, 5, 1) 0
(13, 6, 0) 0
(13, 6, 1) 0
(13, 7, 0) 0
(13, 7, 1) 0
(13, 8, 0) 0
(13, 8, 1) 0
(13, 9, 0) 0
(13, 9, 1) 0
(13, 10, 0) 0
(13, 10, 1) 0
(14, 1, 0) 0
(14, 1, 1) 0
(14, 2, 0) 0
(14, 2, 1) 0
(14, 3, 0) 0
(14, 3, 1) 0
(14, 4, 0) 0
(14, 4, 1) 0
(14, 5, 0) 0
(14, 5, 1) 0
(14, 6, 0) 0
(14, 6, 1) 0
(14, 7, 0) 1
(14, 7, 1) 0
(14, 8, 0) 0
(14, 8, 1) 0
(14, 9, 0) 0
(14, 9, 1) 0
(14, 10, 0) 0
(14, 10, 1) 0
(15, 1, 0) 0
(15, 1, 1) 0
(15, 2, 0) 0
(15, 2, 1) 0
(15, 3, 0) 0
(15, 3, 1) 0
(15, 4, 0) 0
(15, 4, 1) 0
(15, 5, 0) 0
(15, 5, 1) 0
(15, 6, 0) 1
(15, 6, 1) 0
(15, 7, 0) 0
(15, 7, 1) 0
(15, 8, 0) 0
(15, 8, 1) 0
(15, 9

### Every-visit

## Off-policy