In [2]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
# import hiive_mdptoolbox.example
# import hiive_mdptoolbox
import gym
import numpy as np
import sys
import os
from numpy.random import choice
import pandas as pd
import seaborn as sns
np.random.seed(44)

In [3]:
def running_mean(x, N):
    """Return the moving average of ``x`` over a sliding window of ``N`` samples.

    A zero is prepended before taking the cumulative sum so that every window
    total is the difference of two prefix sums. For a 1-D input of length
    ``n`` the result holds ``n - N + 1`` values.
    """
    prefix_sums = np.insert(x, 0, 0).cumsum()
    window_totals = prefix_sums[N:] - prefix_sums[:-N]
    return window_totals / float(N)

In [4]:
def test_policy(P, R, policy, test_count=100, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
                # take step
                action = policy[state]
                # get next step using P
                probs = P[action][state]
                candidates = list(range(len(P[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                # get the reward
                reward = R[state][action] * disc_rate
                episode_reward += reward
                # when go back to 0 ended
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode


In [5]:
def trainVI(P, R, discount=0.9, epsilon=[1e-9]):
    """Run value iteration once per stopping threshold and tabulate the results.

    Parameters
    ----------
    P, R : transition and reward models passed to ``ValueIteration`` and
        ``test_policy``.
    discount : discount factor used both to solve the MDP and to score the
        resulting policy.
    epsilon : iterable of stopping thresholds, one run each. The default list
        is only iterated, never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns Epsilon, Policy, Iteration, Time,
        Reward and Value Function.
    """
    columns = ["Epsilon", "Policy", "Iteration",
               "Time", "Reward", "Value Function"]
    records = []
    for eps in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        # Score with the same discount the solver used; test_policy's own
        # default would silently disagree whenever discount != 0.9.
        reward = test_policy(P, R, vi.policy, gamma=discount)
        records.append({
            "Epsilon": float(eps),
            "Policy": vi.policy,
            "Iteration": vi.iter,
            "Time": vi.time,
            "Reward": reward,
            "Value Function": vi.V,
        })
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)

### 20 State Forest Management

In [6]:
# Build the 20-state forest-management MDP used in this section.
# P and R are the transition and reward models consumed by the solvers below.
# NOTE(review): r1/r2/p are passed straight to the toolbox's forest example —
# presumably the wait/cut rewards in the oldest state and the per-step
# wildfire probability; confirm against the mdptoolbox docs.
P, R = forest(S=20, r1=10, r2=6, p=0.1)

In [7]:
# Sweep the value-iteration stopping threshold from loose (1e-1) to very
# tight (1e-15); trainVI returns one row per threshold.
vi_df = trainVI(P, R, epsilon=[1e-1, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33,0.002351,2.789945,"(4.328504830081768, 4.881518644971712, 4.88151..."
1,0.001,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",55,0.002257,2.887394,"(4.460720290173723, 5.013211594807497, 5.01321..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",87,0.00368,2.87422,"(4.474643139169861, 5.027129333047953, 5.02712..."
3,1e-09,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",120,0.005053,2.909106,"(4.475122825121185, 5.027609012960728, 5.02760..."
4,1e-12,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",153,0.006857,3.151855,"(4.475137648839068, 5.027623836684378, 5.02762..."
5,1e-15,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",186,0.008051,2.905897,"(4.4751381069387985, 5.027624294784101, 5.0276..."


In [8]:
# Solve the same MDP with policy iteration, then score the resulting policy
# with the Monte Carlo evaluator so it is comparable with the VI rows above.
pi = PolicyIteration(P, R, gamma=0.9, max_iter=1e6)
pi.run()
pi_pol, pi_iter, pi_time = pi.policy, pi.iter, pi.time
pi_reward = test_policy(P, R, pi_pol)
# Displayed as (iterations, wall-clock seconds, mean simulated reward).
(pi_iter, pi_time, pi_reward)

(14, 0.007139921188354492, 2.7201342014146013)

### 20 State Forest Management: Q-Learning

In [9]:
def trainQ(P, R, discount=0.9, alpha_dec=[0.99], alpha_min=[0.001],
           epsilon=[1.0], epsilon_decay=[0.99], n_iter=[1000000]):
    """Grid-search Q-learning hyper-parameters and tabulate every run.

    One ``QLearning`` model is trained for each combination of ``n_iter``,
    ``epsilon``, ``epsilon_decay``, ``alpha_dec`` and ``alpha_min`` (iterated
    in that nesting order, ``n_iter`` outermost). After each run the learned
    policy is scored with ``test_policy`` and a progress line
    ``"<run number>: <reward>"`` is printed.

    Parameters
    ----------
    P, R : transition and reward models passed to ``QLearning`` and
        ``test_policy``.
    discount : discount factor used both for learning and for scoring.
    alpha_dec, alpha_min, epsilon, epsilon_decay, n_iter : iterables of
        candidate values. The default lists are only iterated, never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the hyper-parameters, the simulated
        Reward, training Time, learned Policy, Value Function and the
        per-entry 'Reward' values taken from ``run_stats``.
    """
    from itertools import product

    columns = ["Iterations", "Alpha Decay", "Alpha Min",
               "Epsilon", "Epsilon Decay", "Reward",
               "Time", "Policy", "Value Function",
               "Training Rewards"]
    records = []

    # product() yields combinations in the same order as the equivalent
    # nested loops, so run numbering and row order are unchanged.
    grid = product(n_iter, epsilon, epsilon_decay, alpha_dec, alpha_min)
    for count, (i, eps, eps_dec, a_dec, a_min) in enumerate(grid, start=1):
        q = QLearning(P, R, discount, alpha_decay=a_dec,
                      alpha_min=a_min, epsilon=eps,
                      epsilon_decay=eps_dec, n_iter=i)
        q.run()
        # Score with the learning discount; test_policy's own default would
        # silently disagree whenever discount != 0.9.
        reward = test_policy(P, R, q.policy, gamma=discount)
        print("{}: {}".format(count, reward))
        rews = [s['Reward'] for s in q.run_stats]
        records.append({
            "Iterations": i,
            "Alpha Decay": a_dec,
            "Alpha Min": a_min,
            "Epsilon": eps,
            "Epsilon Decay": eps_dec,
            "Reward": reward,
            "Time": q.time,
            "Policy": q.policy,
            "Value Function": q.V,
            "Training Rewards": rews,
        })
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)

In [10]:
# Q-learning hyper-parameter grid: two candidates for each of five settings,
# i.e. 32 training runs in total.
alpha_decs = [0.99, 0.999]
alpha_mins = [0.001, 0.0001]
# NOTE(review): 10.0 is above 1; if epsilon is an exploration probability the
# library may clamp it, making both values equivalent — confirm.
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [1000000, 10000000]
q_df = trainQ(
    P,
    R,
    discount=0.9,
    n_iter=iters,
    epsilon=eps,
    epsilon_decay=eps_dec,
    alpha_dec=alpha_decs,
    alpha_min=alpha_mins,
)


1: 3.4468680630895006
2: 3.4808212726097816
3: 3.043212580639407
4: 3.1920685212850013
5: 3.216424339503616
6: 3.541600673973948
7: 3.19357482093787
8: 3.351518036404763
9: 2.9959005887587
10: 3.3269017226434356
11: 3.1594923626720077
12: 3.4180675475494153
13: 1.05
14: 3.384818937203864
15: 3.381091152158147
16: 0.85
17: 3.3454151999510167
18: 3.1799928406423756
19: 3.01683529971716
20: 3.0023630070891034
21: 3.3410796056740866
22: 3.36688410288134
23: 3.1406228533708913
24: 3.2046558470614963
25: 3.3421390942373663
26: 3.3616673129709334
27: 3.0499745264372353
28: 3.161056796760002
29: 3.416622192964107
30: 1.15
31: 0.9
32: 3.448767259348544


In [11]:
# Check, per value-iteration run, whether its policy equals the
# policy-iteration policy (one boolean per row of vi_df).
vi_df.Policy == pi_pol

0    True
1    True
2    True
3    True
4    True
5    True
Name: Policy, dtype: bool

In [12]:
# Re-score the policy stored in row 18 (printed as run 19 during training).
# The evaluator samples transitions, so the value differs slightly from the
# reward recorded at training time.
test_policy(P,R,q_df.Policy[18])

3.0937564391332626

In [17]:
# Display the Q-learning sweep results: one row per hyper-parameter combination.
q_df

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,1000000,0.99,0.001,10.0,0.99,3.446868,33.382288,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, ...","(4.4731702784148695, 5.024992018188736, 5.0237...","[10.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,1000000,0.99,0.0001,10.0,0.99,3.480821,33.031124,"(0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","(4.4338116315389575, 4.986632955062145, 2.2955...","[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1000000,0.999,0.001,10.0,0.99,3.043213,32.262838,"(0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, ...","(4.472009355992276, 5.0253345462254195, 5.0313...","[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1000000,0.999,0.0001,10.0,0.99,3.192069,32.340923,"(0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, ...","(4.474616747874139, 5.0276630137059986, 4.8830...","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1000000,0.99,0.001,10.0,0.999,3.216424,32.56526,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, ...","(4.459844339494238, 5.016525904443956, 5.02329...","[0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
5,1000000,0.99,0.0001,10.0,0.999,3.541601,31.731805,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(4.435679568763774, 4.9878600371666035, 4.0249...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,1000000,0.999,0.001,10.0,0.999,3.193575,31.661077,"(0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, ...","(4.4787574062064435, 5.0300810478330655, 5.030...","[1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, ..."
7,1000000,0.999,0.0001,10.0,0.999,3.351518,31.66199,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, ...","(4.4718269377290705, 5.023795812787034, 4.8770...","[1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, ..."
8,1000000,0.99,0.001,1.0,0.99,2.995901,31.845467,"(0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, ...","(4.476150196867361, 5.026950579793138, 5.02688...","[1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 10.0, 0.0,..."
9,1000000,0.99,0.0001,1.0,0.99,3.326902,31.443331,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, ...","(4.435011074363516, 4.987596197739208, 4.06223...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 10.0, 0.0,..."


In [14]:
# Average the numeric columns per iteration budget. numeric_only=True keeps
# the tuple/list-valued columns (Policy, Value Function, Training Rewards)
# out of the aggregation; pandas >= 2.0 raises on them instead of silently
# dropping them.
q_df.groupby("Iterations").mean(numeric_only=True)

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000000,0.9945,0.00055,5.5,0.9945,3.002023,31.967909
10000000,0.9945,0.00055,5.5,0.9945,2.964255,1580.403143


In [15]:
# Average the numeric columns per epsilon-decay setting. numeric_only=True
# keeps the tuple/list-valued columns (Policy, Value Function, Training
# Rewards) out of the aggregation; pandas >= 2.0 raises on them instead of
# silently dropping them.
q_df.groupby("Epsilon Decay").mean(numeric_only=True)

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,3.220174,414.168476
0.999,0.9945,0.00055,5.5,2.746104,1198.202575


### 500 State Forest Management

In [2]:
# Build the 500-state forest-management MDP; this rebinds P and R, so every
# cell below works on the large problem.
# NOTE(review): r1/r2/p are passed straight to the toolbox's forest example —
# presumably the wait/cut rewards in the oldest state and the per-step
# wildfire probability; confirm against the mdptoolbox docs.
P, R = forest(S=500, r1=100, r2= 15, p=0.01)

In [6]:
# Same stopping-threshold sweep as for the 20-state problem, now on the
# 500-state MDP; vi_df is overwritten with the new results.
vi_df = trainVI(P, R, epsilon=[1e-1, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",79,0.012341,2.790057,"(4.710556185449387, 5.239434944489701, 5.23943..."
1,0.001,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",119,0.013518,2.726258,"(4.7117745667154995, 5.240595870281114, 5.2405..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",179,0.019228,2.737932,"(4.711792669916437, 5.240613400253226, 5.24061..."
3,1e-09,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",239,0.028688,2.801772,"(4.711792702216012, 5.240613431989174, 5.24061..."
4,1e-12,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",299,0.033417,2.736673,"(4.711792702273827, 5.240613432046434, 5.24061..."
5,1e-15,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",349,0.031081,2.728691,"(4.7117927022739305, 5.240613432046538, 5.2406..."


In [7]:
# Solve the 500-state MDP with policy iteration, then score the resulting
# policy with the Monte Carlo evaluator for comparison with the VI rows.
pi = PolicyIteration(P, R, gamma=0.9, max_iter=1e6)
pi.run()
pi_pol, pi_iter, pi_time = pi.policy, pi.iter, pi.time
pi_reward = test_policy(P, R, pi_pol)
# Displayed as (iterations, wall-clock seconds, mean simulated reward).
(pi_iter, pi_time, pi_reward)

(46, 0.14986801147460938, 2.7411655523985554)

### 500 State Forest Management: Q-Learning

In [9]:
# Same Q-learning hyper-parameter grid as for the 20-state problem
# (two candidates for each of five settings, 32 runs), on the 500-state MDP.
alpha_decs = [0.99, 0.999]
alpha_mins = [0.001, 0.0001]
# NOTE(review): 10.0 is above 1; if epsilon is an exploration probability the
# library may clamp it, making both values equivalent — confirm.
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [1000000, 10000000]
q_df = trainQ(
    P,
    R,
    discount=0.9,
    n_iter=iters,
    epsilon=eps,
    epsilon_decay=eps_dec,
    alpha_dec=alpha_decs,
    alpha_min=alpha_mins,
)

1: 2.650183487415402
2: 2.6363744404711285
3: 2.6072561523039504
4: 2.6117818590299353
5: 2.57748663663716
6: 2.67399605909375
7: 2.6278609803234336
8: 2.626661209650027
9: 2.5297012242759016
10: 2.655280375155995
11: 2.6601539567558454
12: 2.608396759527677
13: 2.6790831093841865
14: 2.6850597811143957
15: 0.822
16: 2.676218333466832
17: 2.7135200868055835
18: 2.8117761382427333
19: 2.764109280376738
20: 2.765967761281483
21: 2.7403159646717272
22: 2.861893456347953
23: 2.656581016247709
24: 2.8327739465709403
25: 2.812239418250017
26: 2.7756577123354447
27: 2.7755963806781256
28: 2.7871103415832237
29: 2.753035768577412
30: 2.850375830314547
31: 2.7556116413006757
32: 2.8514372672801582


In [10]:
# Display the 500-state Q-learning sweep: one row per hyper-parameter combination.
q_df

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,1000000,0.99,0.001,10.0,0.99,2.650183,40.425692,"(0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...","(4.712210801507589, 5.241277084192105, 5.24121...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1000000,0.99,0.0001,10.0,0.99,2.636374,40.048326,"(0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...","(4.672188906786895, 5.200874840421224, 4.37178...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1000000,0.999,0.001,10.0,0.99,2.607256,47.949863,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, ...","(4.711828804944402, 5.241218072985039, 5.24151...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,1000000,0.999,0.0001,10.0,0.99,2.611782,41.661012,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, ...","(4.710461366899375, 5.239581376346029, 5.09524...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,1000000,0.99,0.001,10.0,0.999,2.577487,43.996562,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, ...","(4.71057876795351, 5.240113535858624, 5.240027...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, ..."
5,1000000,0.99,0.0001,10.0,0.999,2.673996,791.700858,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, ...","(4.666537393022336, 5.1952190427092955, 4.3410...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
6,1000000,0.999,0.001,10.0,0.999,2.627861,40.808036,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(4.713478207004994, 5.242419702750562, 5.24130...","[1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, ..."
7,1000000,0.999,0.0001,10.0,0.999,2.626661,42.122573,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(4.708203589152052, 5.237402752385588, 5.13867...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
8,1000000,0.99,0.001,1.0,0.99,2.529701,40.108953,"(0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","(4.711667962492642, 5.240921762076372, 5.24098...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,1000000,0.99,0.0001,1.0,0.99,2.65528,41.491017,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, ...","(4.67205487152553, 5.200652330776339, 4.374608...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [11]:
# Check, per Q-learning run, whether the learned policy equals the
# policy-iteration policy (one boolean per row of q_df).
pi_pol == q_df.Policy

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
Name: Policy, dtype: bool

In [12]:
# Average the numeric columns per iteration budget. numeric_only=True keeps
# the tuple/list-valued columns (Policy, Value Function, Training Rewards)
# out of the aggregation; pandas >= 2.0 raises on them instead of silently
# dropping them.
q_df.groupby("Iterations").mean(numeric_only=True)

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000000,0.9945,0.00055,5.5,0.9945,2.520468,89.188617
10000000,0.9945,0.00055,5.5,0.9945,2.78175,532.726505


In [13]:
# Average the numeric columns per epsilon-decay setting. numeric_only=True
# keeps the tuple/list-valued columns (Policy, Value Function, Training
# Rewards) out of the aggregation; pandas >= 2.0 raises on them instead of
# silently dropping them.
q_df.groupby("Epsilon Decay").mean(numeric_only=True)

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,2.697819,345.814748
0.999,0.9945,0.00055,5.5,2.604399,276.100374


In [14]:
# Average the numeric columns per alpha-decay setting. numeric_only=True
# keeps the tuple/list-valued columns (Policy, Value Function, Training
# Rewards) out of the aggregation; pandas >= 2.0 raises on them instead of
# silently dropping them.
q_df.groupby("Alpha Decay").mean(numeric_only=True)

Unnamed: 0_level_0,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Alpha Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.00055,5.5,0.9945,2.712874,276.001729
0.999,0.00055,5.5,0.9945,2.589345,345.913393


In [16]:
# Average the numeric columns per alpha-min setting. numeric_only=True keeps
# the tuple/list-valued columns (Policy, Value Function, Training Rewards)
# out of the aggregation; pandas >= 2.0 raises on them instead of silently
# dropping them.
q_df.groupby("Alpha Min").mean(numeric_only=True)

Unnamed: 0_level_0,Alpha Decay,Epsilon,Epsilon Decay,Reward,Time
Alpha Min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0001,0.9945,5.5,0.9945,2.731923,278.397922
0.001,0.9945,5.5,0.9945,2.570296,343.5172
