Implementation of Off-policy MC Control for estimating the optimal policy (Sutton and Barto, section 5.7, page 111)

We will be using OpenAI Gym's CliffWalking environment.

Action Space - move up (0), move right (1), move down (2), move left (3)

In [118]:
import gym
import numpy as np
from tqdm import tqdm

In [267]:
class Agent:

    '''
    Sets up the environment, Q function, C function, and policy
    '''
    def __init__(self):
        self.env = gym.make("CliffWalking-v0")
        self.Q = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        self.C = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        self.policy = np.array([np.argmax(self.Q[i]) for i in range(len(self.Q))])
    
    '''
    Generates an episode, following a specific policy
    '''
    def generate_episode(self, policy):
        episode = []

        state = self.env.reset()[0]
        action = policy(state)

        while True:
            new_state, reward, terminated, truncated, _ = self.env.step(action)
            
            episode.append([state, action, reward])

            if terminated:
                episode[-1][2] = 100 # Reward shaping, give reward if episode terminates
                return episode
            
            if truncated:
                return episode

            state = new_state
            action = policy(state)
    
    '''
    Random policy for testing, highly inefficient as expected
    '''
    def randomPolicy(self, state):
        return np.random.randint(self.env.action_space.n)
    
    '''
    Halfway decent manually crafted policy, good for giving the agent an initial push
    '''
    def goRightEpsilonGreedily(self, state):
        if np.random.rand() > 0.75:
            return np.random.randint(self.env.action_space.n)
        
        return 1
    
    '''
    Theoretically a better policy, but has a hard time finding the goal initially
    '''
    def maxQEpsilonGreedily(self, state):
        if np.random.rand() > 0.75:
            return np.random.randint(self.env.action_space.n)
        
        return np.argmax(self.Q[state])

    '''
    Control sequence, limited to a certain number of episodes
    Follows algorithm in Sutton and Barto
    '''
    def control(self, num_episodes):
        gamma = 0.95
        for episode_number in tqdm(range(num_episodes)):
            episode = self.generate_episode(self.goRightEpsilonGreedily)
            G = 0
            W = 1

            for step_idx in range(len(episode) - 1, -1, -1):
                step = episode[step_idx]
                G = gamma * G + step[2]
                self.C[step[0]][step[1]] = self.C[step[0]][step[1]] + W
                self.Q[step[0]][step[1]] = self.Q[step[0]][step[1]] + W / self.C[step[0]][step[1]] * (G - self.Q[step[0]][step[1]])
                self.policy[step[0]] = np.argmax(self.Q[step[0]])
                if step[1] != self.policy[step[0]]:
                    break
                W = W / 0.25

In [274]:
a = Agent()
a.control(10000)

100%|██████████| 10000/10000 [00:23<00:00, 426.23it/s]


In [275]:
a.policy.reshape(4, 12)

array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)