In [38]:
import gymnasium as gym
import numpy as np
import random

# Create FrozenLake environment (non-slippery)
env = gym.make("FrozenLake-v1", is_slippery=False, map_name="4x4")

state_space = env.observation_space.n   # number of states
action_space = env.action_space.n       # number of actions

# Seed every source of randomness used below so that Restart & Run All
# reproduces the same Q-table.
SEED = 42
random.seed(SEED)             # drives the epsilon-greedy coin flip
env.action_space.seed(SEED)   # drives env.action_space.sample()

# Initialize Q-table: one row per state, one column per action
Q = np.zeros((state_space, action_space))

# Hyperparameters
alpha = 0.8        # learning rate
gamma = 0.95       # discount factor
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995
episodes = 5000

epsilon = epsilon_start

# Training loop (tabular Q-learning with epsilon-greedy exploration)
for episode in range(episodes):
    # Seed the environment's own RNG once, on the very first reset only;
    # re-seeding every episode would replay the same random stream.
    state, _ = env.reset(seed=SEED if episode == 0 else None)
    done = False

    while not done:
        # Epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()   # explore
        else:
            action = np.argmax(Q[state])         # exploit

        # Take action. gymnasium's step() returns two separate end-of-episode
        # flags: `terminated` (the environment reached a terminal state) and
        # `truncated` (the episode was cut off, e.g. by a step limit).
        next_state, reward, terminated, truncated, info = env.step(action)

        # Q-learning update rule. Bootstrap from the next state only when the
        # episode did not terminate; a truncated episode still bootstraps
        # because the next state is not terminal.
        future_value = 0.0 if terminated else gamma * np.max(Q[next_state])
        Q[state, action] = Q[state, action] + alpha * (
            reward + future_value - Q[state, action]
        )

        state = next_state
        # End the episode on either flag; ignoring `truncated` keeps stepping
        # an environment that has already signalled the episode is over.
        done = terminated or truncated

    # Decay epsilon, never dropping below the exploration floor
    epsilon = max(epsilon_end, epsilon * epsilon_decay)


In [39]:
# Persist the learned Q-table to disk in NumPy's .npy format
np.save(file="qtable.npy", arr=Q)

# Read the table back from that same file into a separate array
Q_loaded = np.load(file="qtable.npy")

In [40]:
# Roll out one episode, always taking the highest-valued action in Q_loaded.
state, _ = env.reset()
done = False
steps = 0

while not done:
    action = np.argmax(Q_loaded[state])   # exploit learned policy
    # step() reports `terminated` (terminal state reached) and `truncated`
    # (episode cut off) separately.
    next_state, reward, terminated, truncated, info = env.step(action)
    # NOTE(review): `env` appears to be created without a render_mode, in which
    # case this call likely draws nothing - confirm, and pass render_mode to
    # gym.make if a visual rollout is wanted.
    env.render()
    state = next_state
    steps += 1
    # Stop on either flag. A purely greedy policy that never reaches a
    # terminal state would otherwise keep this loop running forever.
    done = terminated or truncated

print("Episode finished after", steps, "steps")


Episode finished after 6 steps
