Q-LEARNING IMPLEMENTATION

In [26]:
import gymnasium as gym
import numpy as np
import imageio as io

In [27]:
# Render to RGB arrays so frames can be captured with env.render() later on.
env = gym.make('CliffWalking-v0', render_mode='rgb_array')
# Action-value table: one row per discrete state, one column per discrete
# action. Sized from the environment's spaces (48 x 4 for CliffWalking-v0)
# instead of hard-coding the dimensions.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

In [28]:
def policy(state, explore=0.0):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    Reads the module-level ``q_table``.

    Args:
        state: Row index into ``q_table``.
        explore: Probability (epsilon, in [0, 1]) of returning a uniformly
            random action instead of the greedy one. ``0.0`` is purely greedy.

    Returns:
        The chosen action index.
    """
    # np.random.random() draws from [0, 1), so the strict comparison makes
    # explore=0.0 never explore and explore=1.0 always explore.
    if np.random.random() < explore:
        # Number of actions comes from the table row, not a hard-coded 4.
        return np.random.randint(0, len(q_table[state]))
    # Greedy choice; np.argmax resolves ties to the lowest action index.
    return np.argmax(q_table[state])

In [29]:
# PARAMETERS (hyperparameters read by the training loop)
# Discount factor applied to the next state's maximum Q-value in the update.
GAMMA = 0.9
# Learning rate (step size) that scales each temporal-difference correction.
ALPHA = 0.1
# Number of training episodes.
N_EPISODES = 500
# Exploration probability passed to policy() while training.
EPSILON = 0.1

In [30]:
# Tabular Q-learning: behave epsilon-greedily, but bootstrap from the greedy
# (max) value of the successor state.
for episode in range(N_EPISODES):
    state, _ = env.reset()
    done = False
    episode_len, total_reward = 0, 0
    while not done:
        # Choose the behaviour action from the current table, i.e. after the
        # previous step's update has been applied (matters when the agent
        # stays in the same state, e.g. after walking into a wall).
        action = policy(state, EPSILON)
        new_state, reward, terminated, truncated, _ = env.step(action)

        # Do not bootstrap from a terminal successor. A truncation (time
        # limit) is not a true terminal state, so it still bootstraps.
        if terminated:
            future_value = 0.0
        else:
            future_value = GAMMA * np.max(q_table[new_state])
        td_error = reward + future_value - q_table[state][action]
        q_table[state][action] += ALPHA * td_error

        state = new_state
        # The episode ends on either flag returned by env.step().
        done = terminated or truncated
        episode_len += 1
        total_reward += reward
    print(f"Episode, Total-Episode-Length, Total-Reward: {episode}, {episode_len}, {total_reward}")
# NOTE: env is intentionally not closed here; the rollout cell that follows
# reuses it and closes it when it is done.

Episode, Total-Episode-Length, Total-Reward: 0, 100, -100
Episode, Total-Episode-Length, Total-Reward: 1, 81, -279
Episode, Total-Episode-Length, Total-Reward: 2, 1170, -2754
Episode, Total-Episode-Length, Total-Reward: 3, 325, -721
Episode, Total-Episode-Length, Total-Reward: 4, 250, -250
Episode, Total-Episode-Length, Total-Reward: 5, 150, -150
Episode, Total-Episode-Length, Total-Reward: 6, 75, -75
Episode, Total-Episode-Length, Total-Reward: 7, 143, -242
Episode, Total-Episode-Length, Total-Reward: 8, 168, -168
Episode, Total-Episode-Length, Total-Reward: 9, 117, -117
Episode, Total-Episode-Length, Total-Reward: 10, 128, -227
Episode, Total-Episode-Length, Total-Reward: 11, 150, -150
Episode, Total-Episode-Length, Total-Reward: 12, 73, -73
Episode, Total-Episode-Length, Total-Reward: 13, 165, -264
Episode, Total-Episode-Length, Total-Reward: 14, 88, -88
Episode, Total-Episode-Length, Total-Reward: 15, 117, -117
Episode, Total-Episode-Length, Total-Reward: 16, 144, -144
Episode, Tot

In [31]:
# Roll out the learned policy greedily (no exploration), capturing one frame
# per step, then write the frames to an animated GIF.

# Upper bound on rollout length. A greedy policy read from a table that has
# not converged can cycle between states forever; the cap keeps the loop (and
# the frame list) finite in that case.
MAX_ROLLOUT_STEPS = 200

done = False
state, _ = env.reset()
frames = [env.render()]
try:
    for _step in range(MAX_ROLLOUT_STEPS):
        action = policy(state, explore=0.0)
        state, _, terminated, truncated, _ = env.step(action)
        frames.append(env.render())
        # Stop on either end-of-episode flag returned by env.step().
        done = terminated or truncated
        if done:
            break
    if not done:
        print(f"Rollout stopped after {MAX_ROLLOUT_STEPS} steps without reaching the goal")
finally:
    # Release the environment even if stepping or rendering raised.
    env.close()
io.mimsave("cliff_walk_qlearn.gif", frames, fps=1)