## Install required packages

In [None]:
# Install this project in editable mode (-e .) together with the dependencies
# declared in its pyproject.toml; -q keeps pip's output quiet.
# %pip (rather than !pip) targets the running kernel's environment, so this also works on Google Colab.
%pip install -q -e .

## Q-learning on FrozenLake

In [4]:
import gymnasium as gym
import numpy as np
from tqdm import tqdm
import random
import imageio.v3 as iio
import pandas as pd

# Fix the random seeds for reproducibility.
# `random` drives the explore/exploit draw in epsilon_greedy_policy;
# NumPy's global RNG is seeded as well for any NumPy-based randomness.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [5]:
# 4x4 FrozenLake with slipping disabled (is_slippery=False).
# render_mode="rgb_array" lets env.render() hand back image frames, which
# record_video() later collects into a video.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")
# Seed the first reset and the sampling RNG of both spaces so runs are repeatable.
env.reset(seed=SEED)
env.action_space.seed(SEED)
env.observation_space.seed(SEED)

42

**Explore the environment**

In [6]:
# Number of discrete states in the observation space (one per grid cell).
env.observation_space.n

np.int64(16)

In [7]:
# Number of discrete actions the agent can choose from in every state.
env.action_space.n

np.int64(4)

In [8]:
# Q-table: one row per state, one column per action, initialised to zero.
# It is a notebook-level global that train() updates in place and
# greedy_policy() reads from.
state_num = env.observation_space.n
action_num = env.action_space.n
q_table = np.zeros((state_num, action_num))

# Display the (still all-zero) Q-table as a DataFrame, which is easier to read.
pd.DataFrame(q_table)

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0


In [9]:
def greedy_policy(state):
    """Pick the action with the largest Q-value for ``state``.

    Reads the notebook-level ``q_table``. When several actions share the
    maximum value, ``np.argmax`` returns the lowest action index.
    """
    action_values = q_table[state]
    return np.argmax(action_values)

In [10]:
def epsilon_greedy_policy(state, epsilon):
    """Choose an action for ``state`` with epsilon-greedy exploration.

    A uniform draw from [0, 1] is compared against ``epsilon``: when the draw
    is strictly greater, the greedy action is returned; otherwise a random
    action is sampled from the notebook-level ``env.action_space``.
    """
    # Exploit: the draw landed above the exploration threshold.
    if random.uniform(0, 1) > epsilon:
        return greedy_policy(state)

    # Explore: take a random action instead.
    return env.action_space.sample()

In [11]:
learning_rate = 0.7  # step size applied to each temporal-difference error
gamma = 0.95  # discount factor for the bootstrapped next-state value


def train(max_traning_episodes, min_epsilon, max_epsilon, decay_rate, max_steps_per_episode):
    """Run tabular Q-learning on the notebook-level ``env``.

    Updates the notebook-level ``q_table`` in place and returns nothing, so
    re-running the training cell continues from the current table rather than
    starting over.

    Args:
        max_traning_episodes: Number of training episodes to run (the
            parameter name is kept as-is for compatibility with callers).
        min_epsilon: Lower bound on the exploration rate.
        max_epsilon: Exploration rate used for the first episode.
        decay_rate: Exponential decay rate applied to epsilon per episode.
        max_steps_per_episode: Maximum number of steps taken in one episode.
    """
    for episode in tqdm(range(max_traning_episodes)):
        # Exponentially decay exploration, never dropping below min_epsilon.
        epsilon = max(min_epsilon, max_epsilon * np.exp(-decay_rate * episode))

        # Every episode starts from a freshly reset environment.
        state, _ = env.reset()

        for _step in range(max_steps_per_episode):
            action = epsilon_greedy_policy(state, epsilon)

            new_state, reward, terminated, truncated, _ = env.step(action)

            # A terminal state has no future return, so do not bootstrap from
            # it. A merely truncated episode (step limit) still bootstraps,
            # because the underlying task would have continued.
            next_value = 0.0 if terminated else np.max(q_table[new_state])
            td_error = reward + gamma * next_value - q_table[state][action]
            q_table[state][action] += learning_rate * td_error

            if terminated or truncated:
                break

            state = new_state
            

In [12]:
# Arguments, in order: 10,000 episodes, epsilon floor 0.05, starting epsilon 1,
# epsilon decay rate 0.0005, and at most 99 steps per episode.
train(10000, 0.05, 1, 0.0005, 99) 

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:01<00:00, 5591.87it/s]


In [13]:
# Display the trained Q-table as a DataFrame (rows = states, columns = actions),
# which is easier to read than the raw array.
pd.DataFrame(q_table)

Unnamed: 0,0,1,2,3
0,0.735092,0.773781,0.773781,0.735092
1,0.735092,0.0,0.814506,0.773781
2,0.773781,0.857375,0.773781,0.814506
3,0.814506,0.0,0.773781,0.773781
4,0.773781,0.814506,0.0,0.735092
5,0.0,0.0,0.0,0.0
6,0.0,0.9025,0.0,0.814506
7,0.0,0.0,0.0,0.0
8,0.814506,0.0,0.857375,0.773781
9,0.814506,0.9025,0.9025,0.0


In [14]:
def evaluate_agent(max_steps, max_eval_episodes):
    """Roll out the greedy policy and summarise the episode returns.

    Runs ``max_eval_episodes`` episodes on the notebook-level ``env``, each
    capped at ``max_steps`` steps, always acting with ``greedy_policy``.

    Returns:
        A ``(mean, std)`` tuple of the total reward collected per episode.
    """
    episode_returns = []

    for _episode in tqdm(range(max_eval_episodes)):
        state, _ = env.reset()
        total = 0

        for _step in range(max_steps):
            next_state, reward, terminated, truncated, _ = env.step(greedy_policy(state))
            total += reward

            # Keep stepping only while the episode is still running.
            if not (terminated or truncated):
                state = next_state
                continue
            break

        episode_returns.append(total)

    mean_return = np.mean(episode_returns)
    std_return = np.std(episode_returns)
    return mean_return, std_return
            

In [15]:
mean, std = evaluate_agent(99, 100)
mean, std

100%|██████████| 100/100 [00:00<00:00, 4853.79it/s]


(np.float64(1.0), np.float64(0.0))

In [16]:
def record_video(path="./demo.mp4", fps=1):
    """Record one greedy episode on the notebook-level ``env`` as a video.

    The environment is reset, its initial frame is captured, and then the
    greedy policy is followed, capturing one frame after every step, until the
    environment reports ``terminated`` or ``truncated``. There is no step cap
    here: the loop relies on the environment ending the episode.

    Args:
        path: Output file handed to ``iio.imwrite``. Defaults to
            ``"./demo.mp4"``.
        fps: Frames per second of the written video. Defaults to 1.
    """
    state, _ = env.reset()

    # Include the starting position as the first frame.
    frames = [np.array(env.render())]

    done = False
    while not done:
        state, _, terminated, truncated, _ = env.step(greedy_policy(state))
        frames.append(np.array(env.render()))
        done = terminated or truncated

    iio.imwrite(path, frames, fps=fps)

# Roll out one greedy episode and save it as ./demo.mp4 for the playback cell below.
record_video()

## Watch an episode of the agent playing FrozenLake

In [17]:
from IPython.display import Video

# Play back the recorded episode inline; embed=True asks IPython to embed the
# video data in the cell output instead of linking to the file on disk.
Video("./demo.mp4", embed=True)