In [1]:
import gymnasium as gym
import numpy as np

In [2]:
env = gym.make('CliffWalking-v0')

## Q1) Exploring starts

In [3]:
def monte_carlo_es(env, n_episodes=500):
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    N = np.zeros((env.observation_space.n, env.action_space.n))
    gamma = 1.0
    total_steps = []

    for i in range(n_episodes):
        state, info = env.reset()
        episode = []
        done = False
        steps = 0

        while not done:
            action = np.random.choice(env.action_space.n)
            next_state, reward, done, info, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            steps += 1
        total_steps.append(steps)

        returns = 0
        for j in range(len(episode)-1, -1, -1):
            state, action, reward = episode[j]
            returns = gamma*returns + reward
            N[state][action] += 1
            Q[state][action] += (returns - Q[state][action])/N[state][action]

    policy = np.argmax(Q, axis=1)

    return policy, Q, total_steps

## Q2) On-policy first-visit MC control

In [4]:
def on_policy_mc_control(env, n_episodes=500, epsilon=0.1):
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    N = np.zeros((env.observation_space.n, env.action_space.n))
    gamma = 1.0
    total_steps = []

    for i in range(n_episodes):
        state, info = env.reset()
        done = False
        steps = 0

        while not done:
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])
            next_state, reward, done, info, _ = env.step(action)
            N[state][action] += 1
            Q[state][action] += (reward + gamma*np.max(Q[next_state]) - Q[state][action])/N[state][action]
            state = next_state
            steps += 1
        total_steps.append(steps)

    policy = np.argmax(Q, axis=1)

    return policy, Q, total_steps

In [5]:
monte_carlo_es_policy, monte_carlo_es_q, total_steps_es = monte_carlo_es(env)
on_policy_mc_control_policy, on_policy_mc_control_q, total_steps_control = on_policy_mc_control(env)

In [6]:
print(str.format('Total Number of Steps taken to reach Optimal Policy using Monte Carlo ES: {}', sum(total_steps_es)))
print(str.format('Total Number of Steps taken to reach Optimal Policy using On-Policy First-Visit MC Control: {}', sum(total_steps_control)))


Total Number of Steps taken to reach Optimal Policy using Monte Carlo ES: 3298153
Total Number of Steps taken to reach Optimal Policy using On-Policy First-Visit MC Control: 17525


In [7]:
print(str.format('Average Number of Steps per Episode taken to reach Optimal Policy using Monte Carlo ES: {}', sum(total_steps_es)/len(total_steps_es)))
print(str.format('Average Number of Steps per Episode taken to reach Optimal Policy using On-Policy First-Visit MC Control: {}', sum(total_steps_control)/len(total_steps_control)))


Average Number of Steps per Episode taken to reach Optimal Policy using Monte Carlo ES: 6596.306
Average Number of Steps per Episode taken to reach Optimal Policy using On-Policy First-Visit MC Control: 35.05


We see that Monte Carlo control performs better - that is - takes less number of steps to reach an optimal policy than the exploring starts technique. This shows that Monte Carlo control is a better method to use.