SARSA IMPLEMENTATION

In [89]:
import gymnasium as gym
import numpy as np

In [90]:
# Build the CliffWalking environment; the "rgb_array" render mode lets later
# cells collect the frames returned by env.render().
env = gym.make("CliffWalking-v0", render_mode="rgb_array")

In [91]:
# Action-value table indexed as q_table[state, action]:
# 48 state rows by 4 action columns, all starting at zero.
q_table = np.zeros((48, 4))

In [92]:
# using ε-greedy policy
def policy(state, explore):
    """Select an action for ``state`` with an ε-greedy rule over ``q_table``.

    Args:
        state: Row index into the global ``q_table``.
        explore: Exploration probability ε. ``0.0`` gives a purely greedy
            choice, ``1.0`` a purely random one.

    Returns:
        int: Column index of the chosen action.
    """
    n_actions = q_table.shape[1]
    # np.random.random() draws from [0, 1), so the strict "<" guarantees that
    # explore=0.0 never explores and explore=1.0 always does.
    if np.random.random() < explore:
        return int(np.random.randint(0, n_actions))
    # Greedy choice; np.argmax returns the first index when values tie.
    return int(np.argmax(q_table[state]))

In [93]:
# PARAMETERS
N_EPISODES = 500  # number of training episodes to run
GAMMA = 0.9       # discount applied to the next state-action value
ALPHA = 0.1       # step size of the Q-value update
EPSILON = 0.1     # exploration probability passed to policy() during training

In [94]:
# SARSA training: each update uses the action actually selected for the next
# state (on-policy), then that action is the one executed on the next step.
for episode in range(N_EPISODES):
    total_reward, episode_len = 0, 0
    state, _ = env.reset()
    action = policy(state, EPSILON)
    done = False
    while not done:
        new_state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over on either signal; the environment must be reset
        # before it is stepped again.
        done = terminated or truncated
        new_action = policy(new_state, EPSILON)
        # A terminal state has no future return, so bootstrap only when the
        # episode did not terminate (a truncated episode still bootstraps).
        next_q = 0.0 if terminated else q_table[new_state, new_action]
        td_error = reward + GAMMA * next_q - q_table[state, action]
        q_table[state, action] += ALPHA * td_error
        state, action = new_state, new_action
        episode_len += 1
        total_reward += reward
    print(f"Episode, Total Reward, Episode Length: {episode}, {total_reward}, {episode_len}")
env.close()

Episode, Total Reward, Episode Length: 0, -108, 108
Episode, Total Reward, Episode Length: 1, -2280, 993
Episode, Total Reward, Episode Length: 2, -199, 199
Episode, Total Reward, Episode Length: 3, -323, 224
Episode, Total Reward, Episode Length: 4, -541, 343
Episode, Total Reward, Episode Length: 5, -165, 165
Episode, Total Reward, Episode Length: 6, -337, 238
Episode, Total Reward, Episode Length: 7, -221, 122
Episode, Total Reward, Episode Length: 8, -397, 298
Episode, Total Reward, Episode Length: 9, -191, 92
Episode, Total Reward, Episode Length: 10, -335, 137
Episode, Total Reward, Episode Length: 11, -390, 192
Episode, Total Reward, Episode Length: 12, -59, 59
Episode, Total Reward, Episode Length: 13, -368, 170
Episode, Total Reward, Episode Length: 14, -101, 101
Episode, Total Reward, Episode Length: 15, -82, 82
Episode, Total Reward, Episode Length: 16, -147, 147
Episode, Total Reward, Episode Length: 17, -317, 119
Episode, Total Reward, Episode Length: 18, -44, 44
Episode, 

In [95]:
#  RENDERING THE ENVIRONMENT
import imageio as io

# Upper bound on the rollout length. A greedy policy that has not converged
# can cycle between states forever, so the rollout is cut off at this many steps.
MAX_RENDER_STEPS = 200

# The training cell already called env.close(), so roll out on a fresh
# environment instead of reusing the closed one.
render_env = gym.make(id='CliffWalking-v0', render_mode="rgb_array")
try:
    state, _ = render_env.reset()
    frames = [render_env.render()]
    done = False
    n_steps = 0
    while not done and n_steps < MAX_RENDER_STEPS:
        # explore=0.0 requests the greedy action from the learned q_table.
        action = policy(state, explore=0.0)
        state, reward, terminated, truncated, _ = render_env.step(action)
        done = terminated or truncated
        frames.append(render_env.render())
        n_steps += 1
finally:
    # Release the environment even if the rollout raises.
    render_env.close()
io.mimsave('cliff_walk_sarsa.gif', frames, fps=1)