In [1]:
import time

from environments import GridWorldEnv
from agents.monte_carlo import OnPolicy_Agent, OffPolicy_Agent

In [2]:
# function for playing one episode until a terminal state is reached
def play(env, agent, render = False, delay = 0.3):
    """Run a single episode of `agent` acting in `env` and return its transitions.

    Parameters
    ----------
    env
        Environment object providing `reset()`, `step(action)` and `render()`.
        `step` must return a 4-tuple `(next_state, reward, done, info)`;
        the fourth element is ignored here.
    agent
        Object providing `get_action(state)`.
    render : bool, optional
        When true, `env.render()` is called once before the reset and once
        after every step, with a pause of `delay` seconds after each step.
    delay : float, optional
        Seconds to sleep between rendered frames (default 0.3). Ignored when
        `render` is false.

    Returns
    -------
    list
        One `[state, action, reward, next_state]` list per step, in the
        order the steps were taken.
    """
    if render:
        # NOTE(review): this draws before reset(); presumably it only creates
        # the display widget -- confirm against GridWorldEnv.render.
        env.render()

    episode = []
    done = False
    s = env.reset()

    while not done:
        a = agent.get_action(s)
        t, r, done, _ = env.step(a)

        episode.append([s, a, r, t])
        s = t

        if render:
            env.render()
            time.sleep(delay)

    # Plain return instead of `while ... else`: the loop never uses `break`,
    # so an `else` clause would always run and only obscures the control flow.
    return episode

In [3]:
# define environment: one GridWorldEnv built with default arguments,
# shared by every training and rendering cell below
env = GridWorldEnv()

In [4]:
%%time

# train first-visit on-policy agent
agent_onp = OnPolicy_Agent(state_space = env.state_space, action_space = env.action_space, epsilon = 0.1, first_visit = True)
agent_onp.is_learning = True

for episode in range(10000):
    episode = play(env, agent_onp)
    agent_onp.learn(episode)

CPU times: user 3.83 s, sys: 9.76 ms, total: 3.84 s
Wall time: 3.82 s


In [5]:
# roll out one rendered episode with the trained first-visit on-policy agent
# NOTE(review): is_learning = False presumably makes the agent follow its
# learned (greedy) policy rather than explore -- confirm in agents.monte_carlo
agent_onp.is_learning = False
# the returned episode is bound to `_` so the cell does not echo it
_ = play(env, agent_onp, render = True)

Canvas(height=300, width=300)

In [6]:
%%time

# train off-policy agent
agent_offp = OffPolicy_Agent(state_space = env.state_space, action_space = env.action_space, epsilon = 0, first_visit = False)
agent_offp.is_learning = True

for episode in range(10000):
    episode = play(env, agent_offp)
    agent_offp.learn(episode)

CPU times: user 34.3 s, sys: 109 ms, total: 34.4 s
Wall time: 34.3 s


In [7]:
# roll out one rendered episode with the trained off-policy agent
# NOTE(review): is_learning = False presumably makes the agent follow its
# learned (greedy) target policy -- confirm in agents.monte_carlo
agent_offp.is_learning = False
# the returned episode is bound to `_` so the cell does not echo it
_ = play(env, agent_offp, render = True)

Canvas(height=300, width=300)