In [1]:
import time

from environments import (
    WindyGridWorldEnv,
    KingWindyGridWorldEnv,
    StochasticWindyGridWorldEnv,
    CliffWalkingEnv
)
from agents.temporal_difference import (
    Sarsa_Agent,
    ExpectedSarsa_Agent,
    QLearning_Agent,
    DoubleQLearning_Agent
)

In [2]:
# function for playing until terminal state
def play(env, agent, render = False, alpha = 0.5, gamma = 1, delay = 0.3):
    """Play a single episode with `agent` in `env` until `env.step` reports done.

    Parameters
    ----------
    env
        Object providing `reset()` (returns the initial state),
        `step(action)` (returns a 4-tuple `(t, r, done, info)`) and `render()`.
    agent
        Object providing `get_action(state)` and
        `learn([s, a, r, t], alpha=..., gamma=...)`; the pair returned by
        `learn` is used as the state and action of the next step.
    render : bool
        When True, draw the environment once after the reset and once after
        every step, pausing `delay` seconds after each step frame.
    alpha, gamma
        Forwarded unchanged to `agent.learn`. The defaults (0.5 and 1) are the
        values that were previously hard-coded here.
    delay : float
        Seconds to sleep after each rendered step (only used when `render`).

    Returns
    -------
    None
    """
    done = False
    s = env.reset()

    # Draw the first frame only after the reset: rendering before it would
    # show whatever state the previous episode left behind.
    if render:
        env.render()

    a = agent.get_action(s)

    while not done:
        # t is the state reached by taking action a, r the reward received.
        t, r, done, _ = env.step(a)
        s, a = agent.learn([s, a, r, t], alpha = alpha, gamma = gamma)

        if render:
            env.render()
            time.sleep(delay)

In [3]:
# define environment: exactly one `env = ...` line should be active.
# The commented-out lines are the other environments imported at the top of
# the notebook; uncomment one of them (and comment out the active line) to
# run the cells below on a different environment. The later cells read
# `env.state_space` and `env.action_space` when constructing the agents.
# env = WindyGridWorldEnv()
# env = KingWindyGridWorldEnv()
# env = StochasticWindyGridWorldEnv()
env = CliffWalkingEnv()

In [4]:
%%time

# Build a SARSA agent for the selected environment and train it for
# 1000 episodes with learning switched on.
agent_sarsa = Sarsa_Agent(
    state_space=env.state_space,
    action_space=env.action_space,
    epsilon=0.1,
)
agent_sarsa.is_learning = True

for episode in range(1000):
    # one full episode per iteration; play() returns when the env is done
    play(env, agent_sarsa)

CPU times: user 2.2 s, sys: 5.68 ms, total: 2.21 s
Wall time: 2.2 s


In [5]:
# follow target-policy of sarsa agent:
# clear the agent's `is_learning` flag, then play one rendered episode.
# NOTE(review): what `is_learning = False` changes inside the agent is
# defined in agents.temporal_difference and not visible here -- presumably it
# disables exploration/updates; confirm there.
agent_sarsa.is_learning = False
play(env, agent_sarsa, render = True)

Canvas(height=200, width=600)

In [6]:
%%time

# Build a Q-learning agent for the selected environment and train it for
# 1000 episodes with learning switched on.
agent_q = QLearning_Agent(
    state_space=env.state_space,
    action_space=env.action_space,
    epsilon=0.1,
)
agent_q.is_learning = True

for episode in range(1000):
    # one full episode per iteration; play() returns when the env is done
    play(env, agent_q)

CPU times: user 2.31 s, sys: 18 ms, total: 2.33 s
Wall time: 2.33 s


In [7]:
# follow target-policy of q-learning agent:
# clear the agent's `is_learning` flag, then play one rendered episode.
# play() returns None, so its result is not bound to `_` (that name is
# IPython's last-output variable and should not be overwritten).
agent_q.is_learning = False
play(env, agent_q, render = True)

Canvas(height=200, width=600)

In [8]:
%%time

# Build an Expected SARSA agent for the selected environment and train it
# for 1000 episodes with learning switched on.
agent_expected_sarsa = ExpectedSarsa_Agent(
    state_space=env.state_space,
    action_space=env.action_space,
    epsilon=0.1,
)
agent_expected_sarsa.is_learning = True

for episode in range(1000):
    # one full episode per iteration; play() returns when the env is done
    play(env, agent_expected_sarsa)

CPU times: user 2.2 s, sys: 6.65 ms, total: 2.21 s
Wall time: 2.2 s


In [9]:
# follow target-policy of expected sarsa agent:
# clear the agent's `is_learning` flag, then play one rendered episode.
# play() returns None, so its result is not bound to `_` (that name is
# IPython's last-output variable and should not be overwritten).
agent_expected_sarsa.is_learning = False
play(env, agent_expected_sarsa, render = True)

Canvas(height=200, width=600)

In [10]:
%%time

# Build a Double Q-learning agent for the selected environment and train it
# for 1000 episodes with learning switched on.
agent_double_q = DoubleQLearning_Agent(
    state_space=env.state_space,
    action_space=env.action_space,
    epsilon=0.1,
)
agent_double_q.is_learning = True

for episode in range(1000):
    # one full episode per iteration; play() returns when the env is done
    play(env, agent_double_q)

CPU times: user 3.11 s, sys: 0 ns, total: 3.11 s
Wall time: 3.11 s


In [11]:
# follow target-policy of double q-learning agent:
# clear the agent's `is_learning` flag, then play one rendered episode.
# play() returns None, so its result is not bound to `_` (that name is
# IPython's last-output variable and should not be overwritten).
agent_double_q.is_learning = False
play(env, agent_double_q, render = True)

Canvas(height=200, width=600)