In [1]:
import numpy as np
from acrobot_env_extended import AcrobotEnvExtended
import random 

In [2]:
# Create the single environment instance shared by the cells below.
# render_mode='rgb_array' presumably makes env.render() return image frames
# instead of opening a window (Gymnasium convention) -- TODO confirm for this subclass.
env = AcrobotEnvExtended(render_mode='rgb_array')

In [3]:
# Bin edges used to discretize each of the six observation components.
# The four trigonometric components are bounded by [-1, 1].
cost1_space = np.linspace(start=-1, stop=1, num=10)
sint1_space = np.linspace(start=-1, stop=1, num=10)
cost2_space = np.linspace(start=-1, stop=1, num=20)
sint2_space = np.linspace(start=-1, stop=1, num=20)
# 12.57 and 28.27 look like 4*pi and 9*pi rounded to two decimals, presumably
# the environment's angular-velocity limits -- TODO confirm against the env.
velt1_space = np.linspace(start=-12.57, stop=12.57, num=40)
velt2_space = np.linspace(start=-28.27, stop=28.27, num=30)

In [10]:
def get_state(obs):
    """Discretize a 6-component observation into a tuple of bin indices.

    Each component is passed through ``np.digitize`` with its own module-level
    bin-edge array, so every index lies in ``0 .. len(edges)`` inclusive.
    The components are presumably (cos t1, sin t1, cos t2, sin t2, vel t1,
    vel t2), judging by the names of the bin arrays -- TODO confirm.

    Args:
        obs: iterable of exactly six numbers (unpacking fails otherwise).

    Returns:
        A 6-tuple of integer bin indices, usable directly as a Q-table index.
    """
    cos1, sin1, cos2, sin2, vel1, vel2 = obs
    # Pair every component with the edges it is binned against, in order.
    components_and_edges = (
        (cos1, cost1_space),
        (sin1, sint1_space),
        (cos2, cost2_space),
        (sin2, sint2_space),
        (vel1, velt1_space),
        (vel2, velt2_space),
    )
    return tuple(np.digitize(value, edges) for value, edges in components_and_edges)

In [11]:
# Sanity check: discretize a hand-written observation and display the
# resulting tuple of bin indices.
state = get_state(np.array([-0.4, 0.2, 0.3, 0.4, 0.5, 0.6]))
state

(3, 6, 13, 14, 21, 15)

In [12]:
# Enumerate the discrete action ids 0 .. n-1 exposed by the environment's
# action space; the length of this list sizes the last axis of the Q-table.
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

In [13]:
# Zero-initialised Q-table: one axis per discretized observation component
# (len(edges) + 1 slots, because np.digitize can return any index in
# 0 .. len(edges)), followed by one axis over the actions.
Q = np.zeros(
    tuple(
        len(bin_edges) + 1
        for bin_edges in (
            cost1_space,
            sint1_space,
            cost2_space,
            sint2_space,
            velt1_space,
            velt2_space,
        )
    )
    + (len(actions),)
)
Q

array([[[[[[[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.],
            ...,
            [0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]],

           [[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.],
            ...,
            [0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]],

           [[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.],
            ...,
            [0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]],

           ...,

           [[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.],
            ...,
            [0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]],

           [[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.],
            ...,
            [0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]],

           [[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.],
            ...,
            [

In [14]:
def optimal_policy(state, Q):
    """Return the greedy action for ``state``.

    Args:
        state: index (e.g. tuple of bin indices) selecting a row of action
            values in ``Q``.
        Q: table of action values indexed as ``Q[state]``.

    Returns:
        The index of the largest entry of ``Q[state]``; on ties ``np.argmax``
        yields the first (lowest) index.
    """
    action_values = Q[state]
    return np.argmax(action_values)

In [15]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Pick an action epsilon-greedily and print which branch was taken.

    A single Bernoulli(epsilon) draw decides between exploring (a random
    action sampled from the module-level ``env``'s action space) and
    exploiting (the arg-max of ``Q[state]``). The branch name is printed
    after the action has been chosen.

    Args:
        state: index selecting the action values in ``Q``.
        Q: table of action values indexed as ``Q[state]``.
        epsilon: probability of exploring.

    Returns:
        The chosen action.
    """
    if np.random.binomial(1, epsilon):
        chosen, branch = env.action_space.sample(), 'explore'
    else:
        chosen, branch = np.argmax(Q[state]), 'exploit'
    print(branch)
    return chosen

In [16]:
# Roll out one episode with the epsilon-greedy policy (epsilon = 0.5),
# printing every transition and accumulating the total reward.
# Q is only read here; no learning update is performed in this cell.
obs, _ = env.reset()
print(obs)
done = False
episode_reward = 0
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    # Gymnasium-style step() returns (obs, reward, terminated, truncated, info).
    # The episode must stop on either flag; ignoring `truncated` would keep the
    # loop running after a time-limit cut-off.
    # Assumes AcrobotEnvExtended follows that ordering -- TODO confirm.
    obs, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    episode_reward += reward
    print('->', state, action, reward, obs, done)
    # The value returned by render() is discarded here.
    env.render()
episode_reward

[ 0.9997409  -0.02276181  0.9986918  -0.05113351 -0.04480985  0.02408327]
exploit
-> (9, 5, 19, 10, 20, 15) 0 -1.0 [ 0.99986804 -0.01624532  0.9969682  -0.07781032  0.1075906  -0.28534907] False
exploit
-> (9, 5, 19, 9, 20, 15) 0 -1.0 [ 0.99984205  0.01777169  0.9872377  -0.1592537   0.22287437 -0.51518494] False
exploit
-> (9, 5, 19, 8, 20, 15) 0 -1.0 [ 0.9976973   0.06782368  0.9627354  -0.270445    0.26437593 -0.59658664] False
exploit
-> (9, 5, 19, 7, 20, 15) 0 -1.0 [ 0.993081    0.11743157  0.92601335 -0.37749082  0.22078456 -0.50988024] False
exploit
-> (9, 6, 19, 6, 20, 15) 0 -1.0 [ 0.98854464  0.15092862  0.89220065 -0.4516392   0.10880009 -0.28834796] False
explore
-> (9, 6, 18, 6, 20, 15) 0 -1.0 [ 0.9873944   0.15827934  0.8786859  -0.4774004  -0.0362333   0.00229987] False
explore
-> (9, 6, 18, 5, 20, 15) 2 -1.0 [ 0.9937031   0.11204494  0.9202084  -0.3914289  -0.42015797  0.9345407 ] False
exploit
-> (9, 6, 19, 6, 19, 15) 0 -1.0 [ 0.9996934   0.02476085  0.97879404 -0.20484

-535.0