In [None]:
# Import Modules
import numpy as np
import pandas as pd
import seaborn as sns
from hiive.mdptoolbox.mdp import PolicyIteration
from hiive.mdptoolbox.mdp import ValueIteration
from hiive.mdptoolbox.mdp import QLearning
from hiive.mdptoolbox.example import forest
from hiive.mdptoolbox.openai import OpenAI_MDPToolbox 

In [None]:
# Define FrozenLake environment
# Wrap the 'FrozenLake-v1' environment with the toolbox adapter. Later cells
# read `frozen_lake.P` and `frozen_lake.R` and pass them as the first two
# arguments to the ValueIteration, PolicyIteration and QLearning solvers.
frozen_lake = OpenAI_MDPToolbox('FrozenLake-v1')

In [None]:
# FrozenLake Value Iteration Experiments
def vi_frozen_lake_exp(gamma_vec, epsilon_vec=0.01):
    """Run Value Iteration sweeps on the FrozenLake MDP and save run stats as CSV.

    Uses the notebook-level ``frozen_lake`` object (its ``P`` and ``R``
    attributes). The sweep mode is selected by the type of ``gamma_vec``:

    * list / tuple / NumPy array: one solver run per gamma value; the result
      is written to ``frozen_lake_vi_gamma.csv`` with a ``Gamma`` column.
      ``epsilon_vec`` is not passed to the solver in this mode.
    * anything else (a scalar): ``gamma_vec`` is used as the fixed gamma and
      one solver run is made per value in ``epsilon_vec``; the result is
      written to ``frozen_lake_vi_epsilon.csv`` with an ``Epsilon`` column.

    Parameters
    ----------
    gamma_vec : float or sequence of float
        Gamma value(s) handed to ``ValueIteration``.
    epsilon_vec : float or iterable of float, default 0.01
        Value(s) handed to ``ValueIteration`` as ``epsilon`` in the epsilon
        sweep. A scalar is treated as a one-element sweep instead of raising
        ``TypeError`` when iterated.

    Returns
    -------
    None
        The only output is the CSV file.
    """
    stat_columns = ['Iteration', 'Reward']

    if isinstance(gamma_vec, (list, tuple, np.ndarray)):
        label, out_path = 'Gamma', 'frozen_lake_vi_gamma.csv'
        runs = [(gamma, {'gamma': gamma}) for gamma in gamma_vec]
    else:
        label, out_path = 'Epsilon', 'frozen_lake_vi_epsilon.csv'
        try:
            epsilons = list(epsilon_vec)
        except TypeError:
            # Scalar (e.g. the default 0.01): run a single-value sweep.
            epsilons = [epsilon_vec]
        runs = [(eps, {'gamma': gamma_vec, 'epsilon': eps}) for eps in epsilons]

    frames = []
    for value, solver_kwargs in runs:
        vi = ValueIteration(frozen_lake.P, frozen_lake.R, **solver_kwargs)
        vi.run()
        stats = pd.DataFrame(vi.run_stats)[stat_columns]
        frames.append(stats.assign(**{label: value}))

    # Concatenate once instead of growing a frame inside the loop. The per-run
    # row index is kept (no ignore_index) so the CSV layout is unchanged.
    columns = stat_columns + [label]
    df = pd.concat(frames)[columns] if frames else pd.DataFrame(columns=columns)
    df.to_csv(out_path)

In [None]:
# FrozenLake Policy Iteration Experiments
def pi_frozen_lake_exp(gamma_vec):
    """Run a Policy Iteration gamma sweep on the FrozenLake MDP and save run stats.

    Uses the notebook-level ``frozen_lake`` object (its ``P`` and ``R``
    attributes). One ``PolicyIteration`` run (``eval_type=1``) is made per
    gamma value, and the ``Iteration`` / ``Reward`` run statistics of all runs
    are written to ``frozen_lake_pi_gamma.csv`` with a ``Gamma`` column.

    Parameters
    ----------
    gamma_vec : float or iterable of float
        Gamma value(s) to sweep. A scalar is treated as a one-element sweep;
        previously any non-list argument made the function silently do nothing.

    Returns
    -------
    None
        The only output is the CSV file.
    """
    try:
        gammas = list(gamma_vec)
    except TypeError:
        # Scalar gamma: run a single-value sweep rather than skipping the work.
        gammas = [gamma_vec]

    frames = []
    for gamma in gammas:
        pi = PolicyIteration(frozen_lake.P, frozen_lake.R, gamma=gamma, eval_type=1)
        pi.run()
        stats = pd.DataFrame(pi.run_stats)[['Iteration', 'Reward']]
        frames.append(stats.assign(Gamma=gamma))

    # Concatenate once; the per-run row index is kept so the CSV layout matches
    # what the loop-and-concat version produced.
    columns = ['Iteration', 'Reward', 'Gamma']
    df = pd.concat(frames)[columns] if frames else pd.DataFrame(columns=columns)
    df.to_csv('frozen_lake_pi_gamma.csv')

In [None]:
# Parameter grids for the FrozenLake planning experiments
gamma_vec = [0.9, 0.95, 0.99]
epsilon_vec = [0.1, 0.01, 0.001]

# Gamma sweeps: Value Iteration first, then Policy Iteration
vi_frozen_lake_exp(gamma_vec=gamma_vec)
pi_frozen_lake_exp(gamma_vec=gamma_vec)

# Epsilon sweep for Value Iteration at a fixed gamma of 0.99
vi_frozen_lake_exp(epsilon_vec=epsilon_vec, gamma_vec=0.99)

In [None]:
# Forest Value Iteration Experiments
def vi_forest_exp(gamma_vec, num_states=1000):
    """Run Value Iteration sweeps on the forest MDP and save run stats as CSV.

    The sweep mode is selected by the type of ``gamma_vec``:

    * list / tuple / NumPy array: a single forest problem is built with
      ``forest(num_states)`` and solved once per gamma value; the result is
      written to ``forest_vi_gamma.csv`` with a ``Gamma`` column.
    * anything else (a scalar): ``gamma_vec`` is used as the fixed gamma and
      one forest problem is built and solved per entry of ``num_states``; the
      result is written to ``forest_vi_size.csv`` with ``States``,
      ``Runtime`` (``vi.time``) and ``Iterations`` (``vi.iter``) columns.

    Parameters
    ----------
    gamma_vec : float or sequence of float
        Gamma value(s) handed to ``ValueIteration``.
    num_states : int or iterable of int, default 1000
        Size argument(s) for ``forest``. In the size sweep a scalar is treated
        as a one-element sweep instead of raising ``TypeError`` when iterated.

    Returns
    -------
    None
        The only output is the CSV file.
    """
    stat_columns = ['Iteration', 'Reward']
    frames = []

    if isinstance(gamma_vec, (list, tuple, np.ndarray)):
        columns = stat_columns + ['Gamma']
        out_path = 'forest_vi_gamma.csv'
        forest_P, forest_R = forest(num_states)
        for gamma in gamma_vec:
            vi = ValueIteration(forest_P, forest_R, gamma=gamma)
            vi.run()
            stats = pd.DataFrame(vi.run_stats)[stat_columns]
            frames.append(stats.assign(Gamma=gamma))
    else:
        columns = stat_columns + ['States', 'Runtime', 'Iterations']
        out_path = 'forest_vi_size.csv'
        try:
            sizes = list(num_states)
        except TypeError:
            # Scalar (e.g. the default 1000): run a single-size sweep.
            sizes = [num_states]
        for size in sizes:
            forest_P, forest_R = forest(size)
            vi = ValueIteration(forest_P, forest_R, gamma=gamma_vec)
            vi.run()
            stats = pd.DataFrame(vi.run_stats)[stat_columns]
            frames.append(stats.assign(States=size, Runtime=vi.time, Iterations=vi.iter))

    # Concatenate once instead of growing a frame inside the loop. The per-run
    # row index is kept (no ignore_index) so the CSV layout is unchanged.
    df = pd.concat(frames)[columns] if frames else pd.DataFrame(columns=columns)
    df.to_csv(out_path)

In [None]:
# Forest Policy Iteration Experiments
def pi_forest_exp(gamma_vec, num_states=1000):
    """Run Policy Iteration sweeps on the forest MDP and save run stats as CSV.

    All runs use ``PolicyIteration(..., eval_type=1)``. The sweep mode is
    selected by the type of ``gamma_vec``:

    * list / tuple / NumPy array: a single forest problem is built with
      ``forest(num_states)`` and solved once per gamma value; the result is
      written to ``forest_pi_gamma.csv`` with a ``Gamma`` column.
    * anything else (a scalar): ``gamma_vec`` is used as the fixed gamma and
      one forest problem is built and solved per entry of ``num_states``; the
      result is written to ``forest_pi_size.csv`` with ``States``,
      ``Runtime`` (``pi.time``) and ``Iterations`` (``pi.iter``) columns.

    Parameters
    ----------
    gamma_vec : float or sequence of float
        Gamma value(s) handed to ``PolicyIteration``.
    num_states : int or iterable of int, default 1000
        Size argument(s) for ``forest``. In the size sweep a scalar is treated
        as a one-element sweep instead of raising ``TypeError`` when iterated.

    Returns
    -------
    None
        The only output is the CSV file.
    """
    stat_columns = ['Iteration', 'Reward']
    frames = []

    if isinstance(gamma_vec, (list, tuple, np.ndarray)):
        columns = stat_columns + ['Gamma']
        out_path = 'forest_pi_gamma.csv'
        forest_P, forest_R = forest(num_states)
        for gamma in gamma_vec:
            pi = PolicyIteration(forest_P, forest_R, gamma=gamma, eval_type=1)
            pi.run()
            stats = pd.DataFrame(pi.run_stats)[stat_columns]
            frames.append(stats.assign(Gamma=gamma))
    else:
        columns = stat_columns + ['States', 'Runtime', 'Iterations']
        out_path = 'forest_pi_size.csv'
        try:
            sizes = list(num_states)
        except TypeError:
            # Scalar (e.g. the default 1000): run a single-size sweep.
            sizes = [num_states]
        for size in sizes:
            forest_P, forest_R = forest(size)
            pi = PolicyIteration(forest_P, forest_R, gamma=gamma_vec, eval_type=1)
            pi.run()
            stats = pd.DataFrame(pi.run_stats)[stat_columns]
            frames.append(stats.assign(States=size, Runtime=pi.time, Iterations=pi.iter))

    # Concatenate once instead of growing a frame inside the loop. The per-run
    # row index is kept (no ignore_index) so the CSV layout is unchanged.
    df = pd.concat(frames)[columns] if frames else pd.DataFrame(columns=columns)
    df.to_csv(out_path)

In [None]:
# Parameter grids for the forest planning experiments
gamma_vec = [0.9, 0.95, 0.99]
num_states_vec = [20, 200, 1000, 2000, 5000, 10000]

# Gamma sweeps at the default problem size
vi_forest_exp(gamma_vec=gamma_vec)
pi_forest_exp(gamma_vec=gamma_vec)

# Problem-size sweeps at a fixed gamma of 0.99
vi_forest_exp(num_states=num_states_vec, gamma_vec=0.99)
pi_forest_exp(num_states=num_states_vec, gamma_vec=0.99)

In [None]:
# Compare Frozen Lake policies
# Build both planners on the same FrozenLake MDP (gamma=0.99), solve each,
# and display the result of comparing their policies with `==`.
vi = ValueIteration(frozen_lake.P, frozen_lake.R, gamma=0.99)
pi = PolicyIteration(frozen_lake.P, frozen_lake.R, eval_type=1, gamma=0.99)

vi.run()
pi.run()

vi.policy == pi.policy

In [None]:
# Compare Forest policies
# Build one forest problem from forest(1000), solve it with both planners
# (gamma=0.99), and display the result of comparing their policies with `==`.
forest_P, forest_R = forest(1000)

vi = ValueIteration(forest_P, forest_R, gamma=0.99)
pi = PolicyIteration(forest_P, forest_R, eval_type=1, gamma=0.99)

vi.run()
pi.run()

vi.policy == pi.policy

In [None]:
# FrozenLake Q-Learning Experiments
def ql_frozen_lake_exp(alpha_vec=0.1, epsilon_vec=1.0):
    """Run Q-Learning sweeps on the FrozenLake MDP and save run stats as CSV.

    Uses the notebook-level ``frozen_lake`` object (its ``P`` and ``R``
    attributes). Every run uses ``gamma=0.99`` and ``n_iter=1000000``. The
    sweep mode is selected by the type of ``alpha_vec``:

    * list / tuple / NumPy array: one run per value, passed to ``QLearning``
      as ``alpha_decay``; the result is written to
      ``frozen_lake_ql_alpha_decay.csv`` with an ``Alpha Decay`` column.
    * anything else (a scalar): one run per value in ``epsilon_vec``, passed
      as ``epsilon``; the result is written to ``frozen_lake_ql_epsilon.csv``
      with an ``Epsilon`` column. The scalar ``alpha_vec`` itself is not
      passed to the solver in this mode.

    Parameters
    ----------
    alpha_vec : float or sequence of float, default 0.1
        Alpha-decay values to sweep when given as a sequence.
    epsilon_vec : float or iterable of float, default 1.0
        Epsilon value(s) for the epsilon sweep. A scalar is treated as a
        one-element sweep instead of raising ``TypeError`` when iterated.

    Returns
    -------
    None
        The only output is the CSV file.
    """
    stat_columns = ['Iteration', 'Mean V', 'Max V']

    if isinstance(alpha_vec, (list, tuple, np.ndarray)):
        label, out_path = 'Alpha Decay', 'frozen_lake_ql_alpha_decay.csv'
        runs = [(decay, {'alpha_decay': decay}) for decay in alpha_vec]
    else:
        label, out_path = 'Epsilon', 'frozen_lake_ql_epsilon.csv'
        try:
            epsilons = list(epsilon_vec)
        except TypeError:
            # Scalar (e.g. the default 1.0): run a single-value sweep.
            epsilons = [epsilon_vec]
        runs = [(eps, {'epsilon': eps}) for eps in epsilons]

    frames = []
    for value, sweep_kwargs in runs:
        ql = QLearning(frozen_lake.P, frozen_lake.R, gamma=0.99, n_iter=1000000, **sweep_kwargs)
        ql.run()
        stats = pd.DataFrame(ql.run_stats)[stat_columns]
        frames.append(stats.assign(**{label: value}))

    # Concatenate once instead of growing a frame inside the loop. The per-run
    # row index is kept (no ignore_index) so the CSV layout is unchanged.
    columns = stat_columns + [label]
    df = pd.concat(frames)[columns] if frames else pd.DataFrame(columns=columns)
    df.to_csv(out_path)

In [None]:
# Parameter grids for the FrozenLake Q-Learning experiments
alpha_vec = [0.99, 0.95, 0.9]
epsilon_vec = [1.0, 0.5, 0.25]

# Alpha-decay sweep (first positional parameter is alpha_vec)
ql_frozen_lake_exp(alpha_vec)

# Epsilon sweep; alpha_vec keeps its scalar default
ql_frozen_lake_exp(epsilon_vec=epsilon_vec)

In [None]:
# Compare Frozen Lake policies
# Build the Q-Learning, Value Iteration and Policy Iteration solvers on the
# same FrozenLake MDP (gamma=0.99), run each, then print every policy and
# whether the Q-Learning policy equals each planner's policy.
ql = QLearning(frozen_lake.P, frozen_lake.R, gamma=0.99, n_iter=3000000, epsilon=1.0)
vi = ValueIteration(frozen_lake.P, frozen_lake.R, gamma=0.99)
pi = PolicyIteration(frozen_lake.P, frozen_lake.R, eval_type=1, gamma=0.99)

ql.run()
vi.run()
pi.run()

print(ql.policy)
print(vi.policy)
print(pi.policy)

print('PI = QL? ', ql.policy == pi.policy)
print('VI = QL? ', ql.policy == vi.policy)

In [None]:
# Forest Q-Learning Experiments
def ql_forest_exp(epsilon_vec=1.0, num_states=1000):
    """Run Q-Learning sweeps on the forest MDP and save run stats as CSV.

    Every run uses ``gamma=0.99`` and ``n_iter=1000000``. The sweep mode is
    selected by the type of ``epsilon_vec``:

    * list / tuple / NumPy array: a single forest problem is built with
      ``forest(num_states)`` and solved once per epsilon value; the result is
      written to ``forest_ql_epsilon.csv`` with columns
      ``Iteration, Max V, Epsilon, Time``.
    * anything else (a scalar): one forest problem is built and solved per
      entry of ``num_states`` using ``epsilon_vec`` as the fixed epsilon; the
      result is written to ``forest_ql_size.csv`` with columns
      ``Iteration, Max V, Time, States``.

    Parameters
    ----------
    epsilon_vec : float or sequence of float, default 1.0
        Epsilon value(s) handed to ``QLearning``. In the size sweep the scalar
        is now passed through instead of a hard-coded 1.0; the default is
        1.0, so calls that omit it behave as before.
    num_states : int or iterable of int, default 1000
        Size argument(s) for ``forest``. In the size sweep a scalar is treated
        as a one-element sweep instead of raising ``TypeError`` when iterated.

    Returns
    -------
    None
        The only output is the CSV file.
    """
    stat_columns = ['Iteration', 'Max V', 'Time']
    frames = []

    if isinstance(epsilon_vec, (list, tuple, np.ndarray)):
        # Column order matches the original output file (Epsilon before Time).
        columns = ['Iteration', 'Max V', 'Epsilon', 'Time']
        out_path = 'forest_ql_epsilon.csv'
        forest_P, forest_R = forest(num_states)
        for epsilon in epsilon_vec:
            ql = QLearning(forest_P, forest_R, gamma=0.99, epsilon=epsilon, n_iter=1000000)
            ql.run()
            stats = pd.DataFrame(ql.run_stats)[stat_columns]
            frames.append(stats.assign(Epsilon=epsilon))
    else:
        columns = ['Iteration', 'Max V', 'Time', 'States']
        out_path = 'forest_ql_size.csv'
        try:
            sizes = list(num_states)
        except TypeError:
            # Scalar (e.g. the default 1000): run a single-size sweep.
            sizes = [num_states]
        for size in sizes:
            forest_P, forest_R = forest(size)
            ql = QLearning(forest_P, forest_R, gamma=0.99, epsilon=epsilon_vec, n_iter=1000000)
            ql.run()
            stats = pd.DataFrame(ql.run_stats)[stat_columns]
            frames.append(stats.assign(States=size))

    # Concatenate once instead of growing a frame inside the loop. The per-run
    # row index is kept (no ignore_index) so the CSV layout is unchanged.
    df = pd.concat(frames)[columns] if frames else pd.DataFrame(columns=columns)
    df.to_csv(out_path)

In [None]:
# Parameter grids for the forest Q-Learning experiments
epsilon_vec = [1.0, 0.5, 0.25]
num_states_vec = [20, 200, 1000, 2000, 5000, 10000]

# Epsilon sweep (first positional parameter is epsilon_vec)
ql_forest_exp(epsilon_vec)

# Problem-size sweep; epsilon_vec keeps its scalar default
ql_forest_exp(num_states=num_states_vec)

In [None]:
# Compare Forest policies
# Build one forest problem from forest(1000), solve it with Q-Learning,
# Value Iteration and Policy Iteration (gamma=0.99), then print every policy
# and whether the Q-Learning policy equals each planner's policy.
forest_P, forest_R = forest(1000)

ql = QLearning(forest_P, forest_R, gamma=0.99, n_iter=5000000, epsilon=1.0)
vi = ValueIteration(forest_P, forest_R, gamma=0.99)
pi = PolicyIteration(forest_P, forest_R, eval_type=1, gamma=0.99)

ql.run()
vi.run()
pi.run()

print(ql.policy)
print(vi.policy)
print(pi.policy)

print('PI = QL? ', ql.policy == pi.policy)
print('VI = QL? ', ql.policy == vi.policy)
