In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import gym
from value_iteration import value_iteration
from policy_iteration import policy_iteration
from Q_learning import q_learning
from frozenlake import FrozenLakeEnv
from plotting import *
import random
from time import time

## Set up environment

In [None]:
# Single switch for the lake layout. Both the plot-title prefix (`name`) and
# the environment are derived from it, so they cannot drift apart when the
# map is changed.
MAP_NAME = "5x5"  # set to "10x10" for the larger lake
name = '%s FrozenLake' % MAP_NAME
# Constructor flags are unchanged from the original setup.
# NOTE(review): is_slippery=False presumably makes transitions deterministic
# and hole_reward=True presumably attaches a reward to holes -- confirm in
# frozenlake.py.
env = FrozenLakeEnv(map_name=MAP_NAME, is_slippery=False, hole_reward=True)

## Policy iteration and Value iteration

In [None]:
# Re-seed Python's `random` module right before the run so this cell gives
# the same result whenever it is executed on its own.
random.seed(9001)

# value_iteration returns five items; the second one is not used in this
# notebook. The remaining four feed the plots further down.
# NOTE(review): max_iter=50 with early_stop=False presumably forces all 50
# sweeps regardless of theta -- confirm in value_iteration.py.
(
    policy_v,
    _,
    V_v,
    converge_iter_v,
    time_iter_v,
) = value_iteration(
    env,
    theta=0.0001,
    discount_factor=0.99,
    max_iter=50,
    early_stop=False,
)

In [None]:
# Same seed as the value-iteration cell so both planners start from the same
# `random` state.
random.seed(9001)

# policy_iteration returns five items; the second one is not used in this
# notebook. The remaining four feed the plots further down.
# NOTE(review): max_iter=50 with early_stop=False presumably forces all 50
# iterations -- confirm in policy_iteration.py.
(
    policy_p,
    _,
    V_p,
    converge_iter_p,
    time_iter_p,
) = policy_iteration(
    env,
    discount_factor=0.99,
    max_iter=50,
    early_stop=False,
)

In [None]:
# Overlay the per-iteration convergence traces of both planners.
# Uses the explicit Axes interface instead of the implicit pyplot state, and
# ends with plt.show() so the cell does not echo a stray Text(...) repr.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(converge_iter_v, label='Value Iteration')
ax.plot(converge_iter_p, label='Policy Iteration')
ax.set_title('%s Convergence Value' % name)
ax.set_xlabel('Iterations')
ax.set_ylabel('Convergence value')
ax.legend()
plt.show()

In [None]:
# Overlay the per-iteration timing traces of both planners.
# Uses the explicit Axes interface instead of the implicit pyplot state, and
# ends with plt.show() so the cell does not echo a stray Text(...) repr.
# NOTE(review): the y-label states milliseconds; confirm that time_iter_* are
# actually recorded in ms by the solver modules.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(time_iter_v, label='Value Iteration')
ax.plot(time_iter_p, label='Policy Iteration')
ax.set_title('%s Clock Time' % name)
ax.set_xlabel('Iterations')
ax.set_ylabel('clock time of each iteration (ms) ')
ax.legend()
plt.show()

In [None]:
# Draw the policy found by value iteration. The title is taken from `name`
# rather than a hard-coded "5x5 FrozenLake", so it stays correct when the
# environment is switched to another map.
plot_policy_map(name, policy_v, env.desc, env.colors(), env.directions())
# The policy-iteration result can be drawn the same way by passing policy_p
# (e.g. with the title "Policy Iteration Converged Policy").

In [None]:
# State-value maps for both planners, drawn on the same lake layout and
# colour scheme so they can be compared side by side.
plot_value_map(
    "Value Iteration Optimal Value",
    V_v,
    env.desc,
    env.colors(),
)
plot_value_map(
    "Policy Iteration Optimal Value",
    V_p,
    env.desc,
    env.colors(),
)

## Q-learning

In [None]:
# Re-seed Python's `random` module so the Q-learning run is repeatable.
random.seed(9001)

# Disabled alternative call that selects actions by simulated annealing:
# policy_qs, V_qs, converge_iter_qs, time_iter_qs, reward_iter_qs = q_learning(
#     env, num_episodes=50000, next_action='simulated_annealing',
#     discount_factor=0.99, decay=0.99999, max_iter=1000)

e = 0.8  # forwarded as `epsilon`
l = 0.9  # forwarded as `alpha`

# Epsilon-greedy Q-learning; the five returned items are the policy, the
# value table, and three per-iteration traces (convergence, time, reward).
(
    policy_qe,
    V_qe,
    converge_iter_qe,
    time_iter_qe,
    reward_iter_qe,
) = q_learning(
    env,
    num_episodes=500000,
    next_action='epsilon_greedy',
    discount_factor=0.99,
    alpha=l,
    epsilon=e,
    max_iter=10000,
)

In [None]:
# Convergence trace of the epsilon-greedy Q-learning run.
# Uses the explicit Axes interface instead of the implicit pyplot state, and
# ends with plt.show() so the cell does not echo a stray Text(...) repr.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(converge_iter_qe, label='Q-learning epsilon greedy')
ax.set_title('%s Convergence Value' % name)
ax.set_xlabel('Iterations')
ax.set_ylabel('Convergence value')
ax.legend()
plt.show()

In [None]:
# Draw the policy learned by epsilon-greedy Q-learning. The original call
# passed an empty title, which left the figure unlabeled; the title is now
# built from `name` so the figure identifies both the map and the algorithm.
plot_policy_map('%s Q-learning (epsilon greedy) Policy' % name, policy_qe, env.desc, env.colors(), env.directions())

In [None]:
# State-value map learned by epsilon-greedy Q-learning.
# NOTE(review): the title says "10,000 episodes", but the Q-learning cell is
# called with num_episodes=500000 and max_iter=10000 -- confirm which number
# the title should report.
plot_value_map(
    "Q-learning Values after 10,000 episodes",
    V_qe,
    env.desc,
    env.colors(),
)