In [11]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class MyFrozenLakeEnv(gym.Env):
    """Deterministic 8x8 FrozenLake-style grid world.

    Tiles in ``self.map``: ``S`` start, ``F`` frozen, ``H`` hole, ``G`` goal.
    Holes and the goal are terminal. Entering the goal yields reward 1; every
    other transition yields reward 0.

    Actions: 0 = left, 1 = down, 2 = right, 3 = up. A move that would leave
    the grid keeps the agent where it is.

    Observations are integer state indices ``row * ncol + col``.

    ``self.P[s][a]`` is a list of ``(prob, next_state, reward, done)`` tuples
    describing the dynamics. Terminal states are absorbing there (self-loop,
    reward 0, done True), and ``step`` follows the same rule so that the
    simulator and the planning model always agree.
    """

    metadata = {"render_modes": ["human"]}

    # Tiles that end an episode.
    _TERMINAL_TILES = ("G", "H")

    def __init__(self, render_mode=None):
        super().__init__()
        self.map = [
            "SFFFFFFF",
            "FFFFFFFF",
            "FFFHFFFF",
            "FFFFFHFF",
            "FHFFHFFF",
            "FHFFFFHF",
            "FFFFFFHF",
            "FFFHFFFG"
        ]
        self.nrow = len(self.map)
        self.ncol = len(self.map[0])
        self.nS = self.nrow * self.ncol
        self.nA = 4
        self.render_mode = render_mode
        self.action_space = spaces.Discrete(self.nA)
        self.observation_space = spaces.Discrete(self.nS)
        # Agent position as (row, col); None until reset() is called.
        self.state = None
        self.P = self._build_transition_matrix()

    def pos_to_state(self, r, c):
        """Convert a (row, col) position into its integer state index."""
        return r * self.ncol + c

    def state_to_pos(self, s):
        """Convert an integer state index into its (row, col) position."""
        return (s // self.ncol, s % self.ncol)

    def _move(self, r, c, action):
        """Return the position reached from (r, c) by ``action``, clamped to the grid.

        Any action other than 0-3 leaves the position unchanged.
        """
        if action == 0:
            c = max(c - 1, 0)
        elif action == 1:
            r = min(r + 1, self.nrow - 1)
        elif action == 2:
            c = min(c + 1, self.ncol - 1)
        elif action == 3:
            r = max(r - 1, 0)
        return r, c

    def _build_transition_matrix(self):
        """Build ``P[s][a] -> [(prob, next_state, reward, done)]`` for every state/action."""
        P = {s: {a: [] for a in range(self.nA)} for s in range(self.nS)}
        for r in range(self.nrow):
            for c in range(self.ncol):
                s = self.pos_to_state(r, c)
                terminal = self.map[r][c] in self._TERMINAL_TILES
                for a in range(self.nA):
                    if terminal:
                        # Absorbing state: stay put, no further reward.
                        P[s][a] = [(1.0, s, 0, True)]
                        continue
                    new_r, new_c = self._move(r, c, a)
                    new_tile = self.map[new_r][new_c]
                    reward = 1 if new_tile == "G" else 0
                    done = new_tile in self._TERMINAL_TILES
                    P[s][a].append((1.0, self.pos_to_state(new_r, new_c), reward, done))
        return P

    def reset(self, seed=None, options=None):
        """Place the agent on the top-left tile and return ``(observation, info)``."""
        super().reset(seed=seed)
        self.state = (0, 0)
        return self._get_obs(), {}

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, truncated, info)``.

        Raises:
            RuntimeError: if called before ``reset()``.
        """
        if self.state is None:
            raise RuntimeError("step() called before reset()")
        r, c = self.state
        if self.map[r][c] in self._TERMINAL_TILES:
            # Already on a hole/goal: mirror the absorbing entry in self.P
            # instead of letting the agent walk away or re-collect the reward.
            return self._get_obs(), 0, True, False, {}
        r, c = self._move(r, c, action)
        self.state = (r, c)
        tile = self.map[r][c]
        reward = 1 if tile == "G" else 0
        done = tile in self._TERMINAL_TILES
        return self._get_obs(), reward, done, False, {}

    def _get_obs(self):
        """Return the integer state index of the agent's current position."""
        r, c = self.state
        return self.pos_to_state(r, c)

    def render(self):
        """Print the grid to stdout with the agent's tile shown as ``A``.

        If the environment has not been reset yet, the bare map is printed.
        """
        agent = self.state
        for i in range(self.nrow):
            for j in range(self.ncol):
                symbol = "A" if (i, j) == agent else self.map[i][j]
                print(f" {symbol} ", end="")
            print()
        print()





In [12]:

def value_iteration(env, gamma=0.99, theta=1e-8):
    """Compute a greedy policy and its state values by value iteration.

    Args:
        env: object exposing ``nS`` (number of states), ``nA`` (number of
            actions) and ``P[s][a]``, an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        gamma: discount factor.
        theta: sweeps stop once the largest value change in a sweep is
            below this threshold.

    Returns:
        ``(policy, V)``: ``policy`` is an int array of length ``nS`` holding
        the greedy action per state (lowest index wins ties), and ``V`` is a
        float array of length ``nS`` with the state values.
    """

    def action_values(state, V):
        # One-step lookahead: expected return of every action from `state`.
        q = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                # A transition flagged `done` ends the episode, so nothing
                # is bootstrapped from the state it lands in.
                future = 0.0 if done else V[next_state]
                q[a] += prob * (reward + gamma * future)
        return q

    V = np.zeros(env.nS)
    while True:
        delta = 0.0
        for s in range(env.nS):
            best = np.max(action_values(s, V))
            delta = max(delta, abs(V[s] - best))
            V[s] = best  # in-place update: later states in this sweep see it
        if delta < theta:
            break

    policy = np.zeros(env.nS, dtype=int)
    for s in range(env.nS):
        policy[s] = np.argmax(action_values(s, V))
    return policy, V


In [13]:
def policy_iteration(env, gamma=0.99, theta=1e-8):
    """Compute a policy and its state values by policy iteration.

    Starts from the policy that picks action 0 everywhere and alternates
    iterative policy evaluation with greedy improvement until an improvement
    pass changes no action.

    Args:
        env: object exposing ``nS`` (number of states), ``nA`` (number of
            actions) and ``P[s][a]``, an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        gamma: discount factor.
        theta: policy evaluation stops once the largest value change in a
            sweep is below this threshold.

    Returns:
        ``(policy, V, iterations)``: the final int policy array, the float
        value array, and the number of evaluate/improve rounds performed.
    """

    def q_value(state, action, V):
        # Expected return of taking `action` in `state`, then following V.
        total = 0.0
        for prob, next_state, reward, done in env.P[state][action]:
            # A transition flagged `done` ends the episode, so nothing is
            # bootstrapped from the state it lands in.
            future = 0.0 if done else V[next_state]
            total += prob * (reward + gamma * future)
        return total

    policy = np.zeros(env.nS, dtype=int)
    V = np.zeros(env.nS)
    stable = False
    iterations = 0
    while not stable:
        iterations += 1

        # Policy evaluation (in-place sweeps until the values settle).
        while True:
            delta = 0.0
            for s in range(env.nS):
                val = q_value(s, policy[s], V)
                delta = max(delta, abs(V[s] - val))
                V[s] = val
            if delta < theta:
                break

        # Policy improvement: act greedily with respect to the evaluated V.
        stable = True
        for s in range(env.nS):
            old = policy[s]
            q = np.array([q_value(s, a, V) for a in range(env.nA)])
            new = np.argmax(q)  # lowest action index wins ties
            policy[s] = new
            if new != old:
                stable = False
    return policy, V, iterations


In [14]:
def print_policy_arrows(policy, env):
    """Print the grid with one arrow per tile showing the policy's action.

    Hole and goal tiles are printed as ``H`` and ``G``; every other tile shows
    the arrow for ``policy[state]`` (0 left, 1 down, 2 right, 3 up). Each tile
    occupies four characters and each grid row is printed on its own line.
    """
    glyph_for_action = {0: '←', 1: '↓', 2: '→', 3: '↑'}
    fixed_cells = {"H": " H  ", "G": " G  "}
    for row in range(env.nrow):
        cells = []
        for col in range(env.ncol):
            state = env.pos_to_state(row, col)
            tile = env.map[row][col]
            if tile in fixed_cells:
                cells.append(fixed_cells[tile])
            else:
                cells.append(f" {glyph_for_action[policy[state]]}  ")
        print("".join(cells))


In [16]:
def evaluate_policy(env, policy, episodes=100, max_steps=10_000):
    """Roll out ``policy`` in ``env`` and report success rate and average return.

    Args:
        env: environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        policy: indexable mapping from observation to action.
        episodes: number of episodes to run; must be at least 1.
        max_steps: hard cap on steps per episode, so a policy that never
            reaches a terminal state cannot hang the evaluation.

    Returns:
        ``(success_rate, avg_reward)``. ``success_rate`` is the fraction of
        episodes that terminated with a positive final reward. ``avg_reward``
        is the mean undiscounted return, summed over every step of an episode.

    Raises:
        ValueError: if ``episodes`` is less than 1.
    """
    if episodes < 1:
        raise ValueError("episodes must be at least 1")

    total_reward = 0.0
    successes = 0
    for _ in range(episodes):
        obs, _info = env.reset()
        episode_return = 0.0
        for _step in range(max_steps):
            obs, reward, terminated, truncated, _info = env.step(policy[obs])
            # Accumulate every step's reward, not just the last one.
            episode_return += reward
            if terminated or truncated:
                if terminated and reward > 0:
                    successes += 1
                break
        total_reward += episode_return
    return successes / episodes, total_reward / episodes

def _print_summary(title, success_rate, avg_reward, policy, env, iterations=None):
    """Print one algorithm's evaluation numbers followed by its policy grid."""
    print(title)
    print("Success Rate:", success_rate)
    print("Avg Reward:", avg_reward)
    if iterations is not None:
        print("Iterations:", iterations)
    print_policy_arrows(policy, env)


env = MyFrozenLakeEnv()

# Plan with value iteration, then roll the greedy policy out in the simulator.
vi_policy, vi_V = value_iteration(env)
vi_success, vi_reward = evaluate_policy(env, vi_policy)
_print_summary("Value Iteration", vi_success, vi_reward, vi_policy, env)

# Same procedure with policy iteration, which also reports its round count.
pi_policy, pi_V, pi_iters = policy_iteration(env)
pi_success, pi_reward = evaluate_policy(env, pi_policy)
_print_summary("Policy Iteration", pi_success, pi_reward, pi_policy, env,
               iterations=pi_iters)




Value Iteration
Success Rate: 1.0
Avg Reward: 1.0
 ↓   ↓   ↓   ↓   ↓   ↓   ↓   ↓  
 ↓   ↓   ↓   →   ↓   ↓   ↓   ↓  
 ↓   ↓   ↓   H   →   →   ↓   ↓  
 ↓   →   ↓   ↓   ←   H   ↓   ↓  
 ↓   H   ↓   ↓   H   ↓   →   ↓  
 ↓   H   ↓   ↓   ↓   ↓   H   ↓  
 →   →   →   →   ↓   ↓   H   ↓  
 →   →   ↑   H   →   →   →   G  
Policy Iteration
Success Rate: 1.0
Avg Reward: 1.0
Iterations: 15
 ↓   ↓   ↓   ↓   ↓   ↓   ↓   ↓  
 ↓   ↓   ↓   →   ↓   ↓   ↓   ↓  
 ↓   ↓   ↓   H   →   →   ↓   ↓  
 ↓   →   ↓   ↓   ←   H   ↓   ↓  
 ↓   H   ↓   ↓   H   ↓   →   ↓  
 ↓   H   ↓   ↓   ↓   ↓   H   ↓  
 →   →   →   →   ↓   ↓   H   ↓  
 →   →   ↑   H   →   →   →   G  
