In [None]:
from platform import python_version

# Print the running interpreter's version so it is recorded alongside the notebook's outputs.
print(python_version())

In [None]:
import gym
import math
import warnings

import hiive.mdptoolbox as mdptoolbox
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

import pygame
from algorithms.rl import RL
from algorithms.planner import Planner
from examples.test_env import TestEnv

from IPython.display import clear_output, display

# Turn off pandas' chained-assignment check (the source of SettingWithCopyWarning) notebook-wide.
pd.options.mode.chained_assignment = None

# Seed NumPy's global random generator.
# NOTE(review): this seeds np.random only; whether gym, RL or Planner draw from it is not visible here — confirm.
np.random.seed(0)

%matplotlib inline

# Helper Functions

In [None]:
# Matplotlib face colour for each byte tile code looked up from env.desc
# (FrozenLake's S/F/H/G tiles -> blue / white / black / green).
colors = {b'S': 'b', b'F': 'w', b'H': 'k', b'G': 'g'}

# Arrow glyph drawn for each action index 0..3 (left, down, right, up).
directions = dict(enumerate('←↓→↑'))

def plot_lake(env, policy=None, title='Frozen Lake'):
    """Draw the lake grid of ``env`` and, optionally, a policy as arrows.

    Args:
        env: Environment exposing ``nrow`` (used as the side length of a
            square grid) and ``desc`` (2-D array of byte tile codes, each a
            key of the module-level ``colors`` mapping).
        policy: Optional 2-D array indexed as ``policy[row, col]`` whose
            entries are keys of the module-level ``directions`` mapping.
            When given, an arrow is drawn on every tile except goal tiles.
        title: Text used as the figure title.

    Returns:
        None. The figure is created through pyplot and left open so the
        notebook backend displays it.
    """
    squares = env.nrow  # nrow is used for both axes: the map is assumed square
    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111, xlim=(-.01, squares+0.01), ylim=(-.01, squares+0.01))
    plt.title(title, fontsize=16, weight='bold', y=1.01)
    for i in range(squares):
        for j in range(squares):
            # Row 0 of desc is drawn at the top, so flip the row index into plot coordinates.
            x, y = j, squares - i - 1
            tile = env.desc[i, j]
            p = plt.Rectangle([x, y], 1, 1, linewidth=0.5, edgecolor='k')
            p.set_facecolor(colors[tile])
            ax.add_patch(p)

            # No arrow on the goal tile. The tile code is checked instead of the
            # fixed (7, 7) position so maps of any size are handled.
            if policy is not None and tile != b'G':
                ax.text(x+0.5, y+0.5, directions[policy[i, j]],
                        horizontalalignment='center', size=20, verticalalignment='center',
                        color='k')

    plt.axis('off')
    # plt.savefig('./frozen/' + title + '.png', dpi=400)
    
# plot_lake(env, policy)


# Setup Problem

In [None]:
# Setup 8x8

# Build the 8x8 Frozen Lake environment with render_mode=None; it is shared by every experiment cell below.
env = gym.make('FrozenLake8x8-v1', render_mode=None)

# Draw the bare map (no policy arrows, default title).
plot_lake(env)

In [None]:
# calculate number of iterations

def number_iteration(V_track, max_iteration):
    """Count the non-zero rows among the first ``max_iteration`` rows of ``V_track``.

    Args:
        V_track: Sequence (or 2-D array) of per-iteration value vectors.
            Rows that are entirely zero are treated as "not written".
        max_iteration: Upper bound on how many leading rows are inspected.
            It may exceed ``len(V_track)``; only existing rows are examined.

    Returns:
        int: ``1 +`` the number of inspected rows containing at least one
        non-zero entry (the count starts at 1, as in the original helper).
    """
    # Slicing clamps to the rows that exist, so a too-large max_iteration no
    # longer raises IndexError; max(..., 0) keeps a negative bound from
    # turning into a "from the end" slice.
    rows = V_track[:max(max_iteration, 0)]
    return 1 + sum(1 for row in rows if np.any(np.asarray(row) != 0))

In [None]:
gammas   = [0.1, 0.3, 0.6, 0.9, 0.99]
epsilons = [1e-2, 1e-5, 1e-8, 1e-12]

y = []

# Sweep every (gamma, epsilon) pair: solve with VI, then PI, and print the mean
# test score of each resulting policy.
# NOTE(review): the VI and PI result lines are printed in the same unlabelled
# format; VI is scored over 100 episodes and PI over 50; render=True is passed
# on every evaluation — confirm all three are intended.
for gamma, epsilon in ((g, e) for g in gammas for e in epsilons):

    # value iteration -> mean score of its policy
    V, V_track, pi = Planner(env.P).value_iteration(gamma=gamma, n_iters=10000, theta=epsilon)
    test_scores = TestEnv.test_env(env=env, render=True, n_iters=100, user_input=False, pi=pi)
    print(gamma, epsilon, test_scores.mean())

    # policy iteration -> mean score of its policy
    V, V_track, pi = Planner(env.P).policy_iteration(gamma=gamma, n_iters=10000, theta=epsilon)
    test_scores = TestEnv.test_env(env=env, render=True, n_iters=50, user_input=False, pi=pi)
    print(gamma, epsilon, test_scores.mean())

In [None]:
gammas   = [0.1, 0.3, 0.6, 0.9, 0.99]
epsilons = [1e-2, 1e-5, 1e-8, 1e-12]

# For every (gamma, epsilon) pair, solve with VI and then PI and print the
# value returned by number_iteration for each solver's V_track (VI line first).
for gamma, epsilon in ((g, e) for g in gammas for e in epsilons):

    # value iteration
    V, V_track, pi = Planner(env.P).value_iteration(gamma=gamma, n_iters=10000, theta=epsilon)
    iteration = number_iteration(V_track, 10000)
    print(gamma, epsilon, iteration)

    # policy iteration
    V, V_track, pi = Planner(env.P).policy_iteration(gamma=gamma, n_iters=10000, theta=epsilon)
    iteration = number_iteration(V_track, 10000)
    print(gamma, epsilon, iteration)


In [None]:
gammas   = [0.1, 0.3, 0.6, 0.9, 0.99]
epsilons = [1e-2, 1e-5, 1e-8, 1e-12]

# For every (gamma, epsilon) pair, plot the policy found by value iteration
# and the policy found by policy iteration on the lake map.
for gamma in gammas:
    for epsilon in epsilons:

        # run the VI algorithm
        V, V_track, pi = Planner(env.P).value_iteration(gamma=gamma, n_iters=100000, theta=epsilon)
        # pi is queried once per state index 0..63; the actions are laid out as the 8x8 grid
        new_pi = [pi(s) for s in range(64)]
        new_pi = np.around(np.array(new_pi).reshape((8, 8)), 2)

        # plot the policy
        title='Frozen Lake VI Optimal Policy ({}, {})'.format(gamma, epsilon)
        plot_lake(env, new_pi, title)

        # run the PI algorithm
        # (this must be policy_iteration: the figure below is titled as the PI policy)
        V, V_track, pi = Planner(env.P).policy_iteration(gamma=gamma, n_iters=100000, theta=epsilon)
        new_pi = [pi(s) for s in range(64)]
        new_pi = np.around(np.array(new_pi).reshape((8, 8)), 2)

        # plot the policy
        title='Frozen Lake PI Optimal Policy ({}, {})'.format(gamma, epsilon)
        plot_lake(env, new_pi, title)


## Q-Learning

In [None]:
gammas   = [0.8, 0.9, 0.99]
alphas   = [0.01, 0.1, 0.2]
# alpha_decays = [0.9, 0.999]
# epsilon_decays = [0.9, 0.999]
iterations = [500_000, 1_000_000, 5_000_000]

# Train one Q-learning agent per (gamma, alpha, episode-count) combination and
# print the combination once its run finishes. `iteration` is passed as
# n_episodes.
# NOTE(review): the learned Q / policy are not used in this cell.
for gamma, alpha, iteration in (
    (g, a, n) for g in gammas for a in alphas for n in iterations
):
    Q, V, pi, Q_track, pi_track = RL(env).q_learning(gamma=gamma, init_alpha=alpha, n_episodes=iteration)

    print(gamma, alpha, iteration)

In [None]:
gammas   = [0.8, 0.9, 0.99]
alphas   = [0.01, 0.1, 0.2]
# NOTE(review): alpha_decays and epsilon_decays are defined but never passed to q_learning below.
alpha_decays = [0.9, 0.999]
epsilon_decays = [0.9, 0.999]
iterations = [500_000, 1_000_000, 5_000_000]

# For each (gamma, alpha, episode-count) combination: train with Q-learning,
# plot the learned policy on the lake, then print its mean test score.
for gamma, alpha, iteration in (
    (g, a, n) for g in gammas for a in alphas for n in iterations
):
    Q, V, pi, Q_track, pi_track = RL(env).q_learning(gamma=gamma, init_alpha=alpha, n_episodes=iteration)

    # query the policy for state indices 0..63 and lay the actions out as the 8x8 grid
    new_pi = [pi(s) for s in range(64)]
    new_pi = np.around(np.array(new_pi).reshape((8, 8)), 2)

    # draw the policy arrows
    title = 'FL Q-Learning ({}, {}, {})'.format(gamma, alpha, iteration)
    plot_lake(env, new_pi, title)

    # mean score over 100 test episodes
    test_scores = TestEnv.test_env(env=env, render=True, n_iters=100, user_input=False, pi=pi)
    print(gamma, alpha, iteration, test_scores.mean())


