In [1]:
import gym
import numpy as np

from dp import policy_iteration, value_iteration

# Action mappings
action_mapping = {
    0: '\u2191',  # UP
    1: '\u2192',  # RIGHT
    2: '\u2193',  # DOWN
    3: '\u2190',  # LEFT
}


def play_episodes(environment, n_episodes, policy):
    wins = 0
    total_reward = 0

    for episode in range(n_episodes):

        terminated = False
        state = environment.reset()

        while not terminated:

            # Select best action to perform in a current state
            action = np.argmax(policy[state])

            # Perform an action an observe how environment acted in response
            next_state, reward, terminated, info = environment.step(action)

            # Summarize total reward
            total_reward += reward

            # Update current state
            state = next_state

            # Calculate number of wins over episodes
            if terminated and reward == 1.0:
                wins += 1

    average_reward = total_reward / n_episodes

    return wins, total_reward, average_reward


In [2]:
environment = gym.make('FrozenLake8x8-v0')

environment.render()
print(environment.action_space)  # Discrete()
print(environment.observation_space)  # Discrete()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
Discrete(4)
Discrete(64)


In [None]:
#Try policy iteration

In [5]:
iteration_name = 'Policy Iteration'
iteration_func = policy_iteration

    # Search for an optimal policy using policy iteration
policy, V = iteration_func(environment.env)

print(f'\n Final policy derived using {iteration_name}:')
print(' '.join([action_mapping[action] for action in np.argmax(policy, axis=1)]))

Policy evaluated in 203 iterations.
Evaluated 2 policies.

 Final policy derived using Policy Iteration:
← ← ← ↓ ↓ ↓ ↓ ↓ ← ← ← ← ↓ ↓ ↓ ↓ ← ← ↑ ↑ ↓ ← ↓ ↓ ← ← ← → ↑ ↑ ↓ ↓ ← ← ↑ ↑ ↓ → ← ↓ ↑ ↑ ↑ → ← ↑ ↑ ↓ ↑ ↑ → ↑ ↑ ↑ ↑ ↓ ↑ → ↑ ↑ → → → ↑


In [8]:
policy2, V = iteration_func(environment.env, discount_factor = 0.5)

print(f'\n Final policy derived using {iteration_name}:')
print(' '.join([action_mapping[action] for action in np.argmax(policy2, axis=1)]))

Policy evaluated in 18 iterations.
Evaluated 2 policies.

 Final policy derived using Policy Iteration:
→ ↓ ↓ ↓ ↓ ↓ ↓ ↓ → ↓ ↓ ← ↓ ↓ → → → ↓ ↑ ↑ ↓ ← ↓ → ← ↓ ← → ↑ ↑ ↓ → ← ← ↑ ↑ ↓ → ← ↓ ↑ ↑ ↑ → ← ↑ ↑ ↓ ↑ ↑ → ↑ ↑ ↑ ↑ ↓ → → ↑ ↑ → → → ↑


In [11]:
def test_policy_iter(policy, n_episodes = 1000):
    wins, total_reward, average_reward = play_episodes(environment, n_episodes, policy)

    print(f'{iteration_name} :: number of wins over {n_episodes} episodes = {wins}')
    print(f'{iteration_name} :: average reward over {n_episodes} episodes = {average_reward} \n\n')


In [12]:

test_policy_iter(policy, 5000)
test_policy_iter(policy, 10000)

test_policy_iter(policy2, 5000)
test_policy_iter(policy2, 10000)


Policy Iteration :: number of wins over 5000 episodes = 4129
Policy Iteration :: average reward over 5000 episodes = 0.8258 


Policy Iteration :: number of wins over 10000 episodes = 8323
Policy Iteration :: average reward over 10000 episodes = 0.8323 


Policy Iteration :: number of wins over 5000 episodes = 2803
Policy Iteration :: average reward over 5000 episodes = 0.5606 


Policy Iteration :: number of wins over 10000 episodes = 5555
Policy Iteration :: average reward over 10000 episodes = 0.5555 


