In [262]:
import gym
from hiive.mdptoolbox import mdp, example
import numpy as np
import pandas as pd

In [128]:
def forest_starter(S=3, r=1, r2=2, p=0.01):
    P = np.zeros((2, S, S))
    P[0, :, :] = (1 - p) * np.diag(np.ones(S - 1), 1)
    P[0, :, 0] = p
    P[0, S - 1, S - 1] = (1 - p)
    P[1, :, :] = np.zeros((S, S))
    P[1, :, 0] = 1
    
    R = np.zeros((S, 2))
    R[S - 1, 0] = r1
    R[:, 1] = np.ones(S)
    R[0, 1] = 0
    R[S - 1, 1] = r2
    
    return P,R


In [240]:
S_list = [3,25,100,500,1000]
gamma_list = [0.5,.75,0.99]
VI_stats_list_forest = []
PI_stats_list_forest = []
QL_stats_list_forest = []

for S in S_list:
    P, R = forest_starter(S=S)
    for gamma in gamma_list:
        
        vi = mdp.ValueIteration(P, R, gamma=gamma)
        vi.run()
        VI_stats_list_forest.append([S,gamma,vi.run_stats[-1]['Iteration'],vi.run_stats[-1]['Time'],vi.run_stats[-1]['Mean V']])
        
        pi = mdp.PolicyIteration(P, R, gamma=gamma)
        pi.run()
        PI_stats_list_forest.append([S,gamma,pi.run_stats[-1]['Iteration'],pi.run_stats[-1]['Time'],pi.run_stats[-1]['Mean V']])
        
        ql = mdp.QLearning(P, R, gamma=gamma)
        ql.run()
        QL_stats_list_forest.append([S,gamma,ql.run_stats[-1]['Iteration'],ql.run_stats[-1]['Time'],ql.run_stats[-1]['Mean V']])
              

In [241]:
QL_stats_list_forest

[[3, 0.5, 10000, 0.4997525215148926, 3.7546964161547662],
 [3, 0.75, 10000, 0.484025239944458, 4.406299175155886],
 [3, 0.99, 10000, 0.48139119148254395, 22.714962429088313],
 [25, 0.5, 10000, 0.4812495708465576, 0.1421340407885137],
 [25, 0.75, 10000, 0.4913136959075928, 0.22299780523138632],
 [25, 0.99, 10000, 0.4930129051208496, 0.41297385091059097],
 [100, 0.5, 10000, 0.492908239364624, 0.030570751642004762],
 [100, 0.75, 10000, 0.49370288848876953, 0.04696913063806102],
 [100, 0.99, 10000, 0.5045018196105957, 0.08796903079915384],
 [500, 0.5, 10000, 0.7370357513427734, 0.006558670061683734],
 [500, 0.75, 10000, 0.6631848812103271, 0.009351954833361108],
 [500, 0.99, 10000, 0.6650564670562744, 0.01794276447686283],
 [1000, 0.5, 10000, 0.9553797245025635, 0.003117135492847514],
 [1000, 0.75, 10000, 0.9470133781433105, 0.004617596607378818],
 [1000, 0.99, 10000, 1.0079026222229004, 0.008590038380876061]]

In [242]:
S=25
gamma=0.99
q_iters_list = [10000,25000,100000,250000,1000000]
P, R = forest_starter(S=S)
QL_stats_list_iters_forest = []

for iters in q_iters_list:
    ql_iter = mdp.QLearning(P, R, gamma=gamma, n_iter=iters)
    ql_iter.run()
    QL_stats_list_iters_forest.append([S,gamma,ql_iter.run_stats[-1]['Iteration'],ql_iter.run_stats[-1]['Time'],ql_iter.run_stats[-1]['Mean V']])

In [243]:
QL_stats_list_iters_forest

[[25, 0.99, 10000, 0.5012941360473633, 0.34801419638504916],
 [25, 0.99, 25000, 0.9049642086029053, 0.7070143145645967],
 [25, 0.99, 100000, 2.468418598175049, 2.785970885603726],
 [25, 0.99, 250000, 5.729984998703003, 7.041617720185118],
 [25, 0.99, 1000000, 22.135645627975464, 19.193275705657726]]

In [248]:
def PR_maker(env):
    s = len(env.P)
    a = len(env.P[0])

    P = np.zeros((a,s,s))
    R = np.zeros((s,a))

    for state_start, action_dict in env.P.items():
        for action, transition_matrix in action_dict.items():
                 for transition_tuple in transition_matrix:
                    #print(transition_tuple[0])
                    P[action][state_start][transition_tuple[1]] += transition_tuple[0]
                    R[state_start][action] += transition_tuple[0]*transition_tuple[2]
                    
    return P, R

In [249]:
env = gym.make('FrozenLake-v0')
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [252]:
env_list = ['FrozenLake-v0','FrozenLake8x8-v0','FrozenLake20x20-v0']
gamma_list = [0.01, 0.25, 0.5, 0.75, 0.99]
VI_stats_list_lake = []
PI_stats_list_lake = []
QL_stats_list_lake = []

for _env in env_list:
    env = gym.make(_env)
    P, R = PR_maker(env)
    for gamma in gamma_list:
        
        vi = mdp.ValueIteration(P, R, gamma=gamma)
        vi.run()
        VI_stats_list_lake.append([_env,gamma,vi.run_stats[-1]['Iteration'],vi.run_stats[-1]['Time'],vi.run_stats[-1]['Mean V']])
        
        pi = mdp.PolicyIteration(P, R, gamma=gamma)
        pi.run()
        PI_stats_list_lake.append([_env,gamma,pi.run_stats[-1]['Iteration'],pi.run_stats[-1]['Time'],pi.run_stats[-1]['Mean V']])
        
        ql = mdp.QLearning(P, R, gamma=gamma)
        ql.run()
        QL_stats_list_lake.append([_env,gamma,ql.run_stats[-1]['Iteration'],ql.run_stats[-1]['Time'],ql.run_stats[-1]['Mean V']])
              

In [257]:
QL_stats_list_lake

[['FrozenLake-v0', 0.01, 10000, 0.523979663848877, 0.00031033662618355995],
 ['FrozenLake-v0', 0.25, 10000, 0.5147817134857178, 0.0005279905375929539],
 ['FrozenLake-v0', 0.5, 10000, 0.52162766456604, 0.0],
 ['FrozenLake-v0', 0.75, 10000, 0.5224766731262207, 0.0011573395105686937],
 ['FrozenLake-v0', 0.99, 10000, 0.514765739440918, 0.0],
 ['FrozenLake8x8-v0', 0.01, 10000, 0.6458983421325684, 5.208333333333333e-06],
 ['FrozenLake8x8-v0', 0.25, 10000, 0.6379959583282471, 0.00020774661075364213],
 ['FrozenLake8x8-v0', 0.5, 10000, 0.6551291942596436, 2.6020838541666664e-05],
 ['FrozenLake8x8-v0', 0.75, 10000, 0.6352622509002686, 2.082161783675203e-05],
 ['FrozenLake8x8-v0', 0.99, 10000, 0.646143913269043, 2.082286973499948e-05],
 ['FrozenLake20x20-v0', 0.01, 10000, 1.4214210510253906, 0.0],
 ['FrozenLake20x20-v0',
  0.25,
  10000,
  1.5332906246185303,
  8.333333333333333e-07],
 ['FrozenLake20x20-v0', 0.5, 10000, 1.6004881858825684, 0.0],
 ['FrozenLake20x20-v0',
  0.75,
  10000,
  1.584281

In [254]:
env_val = 'FrozenLake20x20-v0'
env = gym.make(env_val)
P, R = PR_maker(env)
gamma=0.25
q_iters_list = [10000,100000,1000000]
QL_stats_list_iters_lake = []

for iters in q_iters_list:
    ql_iter = mdp.QLearning(P, R, gamma=gamma, n_iter=iters)
    ql_iter.run()
    QL_stats_list_iters_lake.append([S,gamma,ql_iter.run_stats[-1]['Iteration'],ql_iter.run_stats[-1]['Time'],ql_iter.run_stats[-1]['Mean V']])

In [255]:
QL_stats_list_iters_lake

[[25, 0.25, 10000, 1.518669605255127, 8.333333333333333e-07],
 [25, 0.25, 100000, 12.255282878875732, 2.4991666666666663e-06],
 [25, 0.25, 1000000, 123.86499166488647, 4.2372542480187345e-05]]

In [273]:
pd.DataFrame(PI_stats_list_lake)

Unnamed: 0,0,1,2,3,4
0,FrozenLake-v0,0.01,4,0.0,0.021044
1,FrozenLake-v0,0.25,4,0.008062,0.027544
2,FrozenLake-v0,0.5,4,0.0,0.039839
3,FrozenLake-v0,0.75,4,0.002052,0.070726
4,FrozenLake-v0,0.99,1000,0.302321,0.396239
5,FrozenLake8x8-v0,0.01,8,0.0,0.010487
6,FrozenLake8x8-v0,0.25,8,0.0,0.012664
7,FrozenLake8x8-v0,0.5,8,0.002052,0.016791
8,FrozenLake8x8-v0,0.75,8,0.008058,0.027477
9,FrozenLake8x8-v0,0.99,1000,0.383411,0.337006


In [258]:
VI_stats_list_forest
PI_stats_list_forest
QL_stats_list_forest

VI_stats_list_lake
PI_stats_list_lake
QL_stats_list_lake

QL_stats_list_iters_forest
QL_stats_list_iters_lake





[[25, 0.25, 10000, 1.518669605255127, 8.333333333333333e-07],
 [25, 0.25, 100000, 12.255282878875732, 2.4991666666666663e-06],
 [25, 0.25, 1000000, 123.86499166488647, 4.2372542480187345e-05]]