Implementation of Off-policy MC Control for estimating the optimal policy (Sutton and Barto, section 5.7, page 111)

We will be using OpenAI Gym's CliffWalking environment.

Action Space - move up (0), move right (1), move down (2), move left (3)

In [118]:
import gym
import numpy as np
from tqdm import tqdm

In [151]:
class Agent:

    def __init__(self):
        self.env = gym.make("CliffWalking-v0")
        self.Q = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        self.C = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        # self.policy = [np.argmax(observation) for observation in self.Q]
        self.policy = np.ones(self.env.observation_space.n)
    
    def generate_episode(self, policy):
        episode = []

        state = self.env.reset()[0]
        action = policy()

        while True:
            new_state, reward, terminated, truncated, _ = self.env.step(action)
            
            if terminated or truncated:
                return episode
            
            episode.append([state, action, reward])

            state = new_state
            action = policy()
    
    def randomPolicy(self):
        return np.random.randint(self.env.action_space.n)
    
    def goRightEpsilonGreedily(self):
        if np.random.rand() > 0.5:
            return np.random.randint(self.env.action_space.n)
        
        return 1

    def control(self, num_episodes):
        gamma = 0.95
        for _ in tqdm(range(num_episodes)):
            episode = self.generate_episode(self.goRightEpsilonGreedily)
            G = 0
            W = 1
            for step_idx in range(len(episode) - 1, -1, -1):
                step = episode[step_idx]
                G = gamma * G + step[2]
                self.C[step[0]][step[1]] = self.C[step[0]][step[1]] + W
                self.Q[step[0]][step[1]] = self.Q[step[0]][step[1]] + W / self.C[step[0]][step[1]] * (G - self.Q[step[0]][step[1]])
                self.policy[step[0]] = np.argmax(self.Q[step[0]])
                if step[1] != self.policy[step[0]]:
                    break
                W = W / 0.25

In [153]:
a = Agent()
a.control(1000)

100%|██████████| 1000/1000 [00:01<00:00, 572.09it/s]


In [155]:
a.Q

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0., -1.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0., -

In [79]:
env = gym.make("CliffWalking-v0")
env.reset()
env.step(0)

(24, -1, False, False, {'prob': 1.0})