Implementation of Monte Carlo ES (Sutton and Barto, section 5.3, page 99)

In [261]:
import numpy as np
import gym
from tqdm import tqdm

For OpenAI's gym, blackjack has an observation space consisting of a 3-tuple of

The player's current sum

The value of dealer's one showing card (1 - 10)

Whether or not the player has an ace (0 or 1) - important because aces can be either 1 or 11

In [None]:
class Agent:

    def __init__(self, gamma=0.85):
        self.env = gym.make("Blackjack-v1", natural=False) # Blackjack environment from the OpenAI gym
        self.sum_size = self.env.observation_space[0].n # Number of possibilities for the player's current sum
        self.dealer_size = self.env.observation_space[1].n # Number of possibilities for the dealer's face up card
        self.gamma = gamma
        self.policy = self.initializeBlackjackPolicy()
        self.Q = self.initializeQ()
        self.returns = self.initializeReturns()

    '''
    Create an empty returns list
    '''
    def initializeReturns(self):
        dealer_possibilities = []
        player_sum_possibilities = []
        actions = []

        for i in range(self.env.action_space.n):
            actions.append([])

        for i in range(self.dealer_size):
            dealer_possibilities.append(actions)
        
        for j in range(self.sum_size):
            player_sum_possibilities.append(dealer_possibilities)
        
        
        
        return player_sum_possibilities
    
    '''
    Initializes each state's policy to a random initial value
    '''
    def initializeBlackjackPolicy(self):
        policy = np.empty((self.sum_size, self.dealer_size), dtype=int)
        for sum in range(self.sum_size):
            for dealer_value in range(self.dealer_size):
                policy[sum, dealer_value] = 0

        return policy
    
    '''
    Creates a Q function of zeros for each state action pair
    '''
    def initializeQ(self):
        Q = np.empty((self.sum_size, self.dealer_size, self.env.action_space.n))
        for sum in range(self.sum_size):
            for dealer_value in range(self.dealer_size):
                Q[sum, dealer_value] = np.zeros(self.env.action_space.n)

        return Q
    
    '''
    Returns a random action to take and gain experience from
    Primarily for testing purposes, inefficient in practice
    '''
    def getRandomAction(self, observation):
        return np.random.randint(0, self.env.action_space.n)

    def getActionFromPolicy(self, observation):
        S_t_dim_0 = observation[0] - 1 # subtract 1 for indexing
        S_t_dim_1 = observation[1] - 1 # subtract 1 for indexing
        return self.policy[S_t_dim_0][S_t_dim_1]
    
    '''
    Generates an action based on the policy to follow
    '''
    def generateEpisode(self, actionType=getActionFromPolicy):
        observation = self.env.reset()
        observation = (observation[0][0], observation[0][1])
        episode = []
        done = False
        while not done:
            action = actionType(self, observation)
            new_observation, reward, terminated, truncated, _ = self.env.step(action)

            new_state = (observation[0], observation[1])
            episode.append((new_state, action, reward))
            observation = new_observation

            done = terminated or truncated
        return episode

    def updateAfterEpisode(self, episode):
        G = 0
        for step in range(len(episode)):
            S_t_dim_0 = episode[step][0][0]
            S_t_dim_1 = episode[step][0][1]
            
            A_t = episode[step][1]

            reward = episode[step][2]
            G = self.gamma * G + reward

            if episode[step][0] in episode[:step]: # Implement first look only
                continue

            self.returns[S_t_dim_0][S_t_dim_1][A_t].append(G)
            self.Q[S_t_dim_0, S_t_dim_1, A_t] = np.average(self.returns[S_t_dim_0][S_t_dim_1][A_t])
            self.policy[S_t_dim_0][S_t_dim_1] = np.argmax(self.Q[S_t_dim_0, S_t_dim_1])

    def monteCarloES(self, gamma=0.85, num_episodes=100000):
        for ep in tqdm(range(num_episodes)):
            observation, info = self.env.reset()
            episode = self.generateEpisode()
            self.updateAfterEpisode(episode)



In [None]:
a = Agent()
a.monteCarloES()

In [482]:
a.policy

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1],
       [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0],
       [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1],
       [0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
a.returns[30][0][0]

In [463]:
np.average(a.returns[30][0][0])

-0.1827600080466707

In [464]:
a.Q[30, 0]

array([0., 0.])