In [2]:
!pip install gymnasium==0.29.1
!pip install gymnasium[toy-text]



In [1]:
import gymnasium as gym
from gymnasium.wrappers.time_limit import TimeLimit
import numpy as np
print(f"GYM Version = [{gym.__version__}]")
print(f"Numpy Version = [{np.__version__}]")

GYM Version = [0.29.1]
Numpy Version = [1.24.4]


## Fronzen Lake

In [2]:
env=gym.make("FrozenLake-v1",render_mode="ansi")

### Model Dynamics

In [3]:
print(f"Action Space = {env.action_space}")
print(f"State Space = {env.observation_space}")
print(f"Reward Range = {env.reward_range}")

Action Space = Discrete(4)
State Space = Discrete(16)
Reward Range = (0, 1)


In [4]:
env.get_wrapper_attr('P')[4]

{0: [(0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 4, 0.0, False),
  (0.3333333333333333, 8, 0.0, False)],
 1: [(0.3333333333333333, 4, 0.0, False),
  (0.3333333333333333, 8, 0.0, False),
  (0.3333333333333333, 5, 0.0, True)],
 2: [(0.3333333333333333, 8, 0.0, False),
  (0.3333333333333333, 5, 0.0, True),
  (0.3333333333333333, 0, 0.0, False)],
 3: [(0.3333333333333333, 5, 0.0, True),
  (0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 4, 0.0, False)]}

In [5]:
env.reset()
print(env.render())


[41mS[0mFFF
FHFH
FFFH
HFFG



In [6]:
# 1. Initialize a policy function with action 0 for each state
# 2. Calculate value function for the policy
# 3. Calculate new Q function with the value function from #2 
# 4. check if new polciy is equal to optimal policy
# 5. If yes break the loop
# 6. calculate optimal policy. 

In [7]:
p = env.get_wrapper_attr("P")

In [8]:
type(env)

gymnasium.wrappers.time_limit.TimeLimit

In [22]:
def value_iteration(policy, e: TimeLimit):
    values_table = np.zeros(e.observation_space.n)
    num_iterations = 10
    gamma = 100
    
    for t in range(num_iterations):
        for s in range(e.observation_space.n):
            actions = p[s]
            action = actions[policy[s]]
            value = 0.0
            for prob, next_action, reward, _ in action:
                value += prob * (reward + gamma * values_table[next_action])
            values_table[s] = value
#         print(values_table)
    return values_table

In [29]:
def calulate_policy(values_table:list):
    gamma = 1
    policy = np.zeros(env.observation_space.n)
    
    for state, value in enumerate(values_table):    
        q_values = np.zeros(env.action_space.n)
        for action in range(env.action_space.n):
            q_value = 0.0
            for prob, next_action, reward, _ in p[state][action]:
                q_value += prob * (reward + gamma * values_table[next_action])
                q_values[action] = q_value
        policy[state] = np.argmax(q_values)
    
    return policy

In [30]:
def policy_iteration(e: TimeLimit):
    optimal_policy = np.zeros(e.observation_space.n)
    num_iterations = 100
    
    for i in range(num_iterations):
        values_table = value_iteration(optimal_policy, e)
        new_policy = calulate_policy(values_table)
        
#         if new_policy == optimal_policy:
#             break
        optimal_policy = new_policy
    return optimal_policy

In [31]:
policy = policy_iteration(env)
print(policy)

[1. 3. 2. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]
