In [14]:
#pip install gym
import gym
import time
from random import randint
import numpy as np
import math
from typing import Tuple
import sys

# Cargo el entorno

In [15]:
env = gym.make('CartPole-v1', render_mode='human')

# Policies

In [16]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
        # print('explore')
    else:
        action = np.argmax(Q[state])
        # print('exploit')
    return action

In [17]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

In [18]:
# bins = np.linspace(-0.000001, 100., 2)
# bins

array([-1.e-06,  1.e+02])

In [19]:
bins_ang = np.linspace(-0.418, 0.418, 5)
bins_ang

array([-0.418, -0.209,  0.   ,  0.209,  0.418])

In [20]:
bins_pole_ang = np.linspace(-sys.maxsize, sys.maxsize, 5)
bins_pole_ang

array([-9.22337204e+18, -4.61168602e+18,  0.00000000e+00,  4.61168602e+18,
        9.22337204e+18])

In [21]:
def get_state(obs):
    # d = np.digitize(obs, bins)
    # state = tuple(d)
    pole_ang = obs[2]
    pole_vel = obs[3]
    d1 = np.digitize(pole_ang, bins_ang)
    d2 = np.digitize(pole_vel, bins_pole_ang)
    # state = tuple
    return (d1, d2)

In [22]:
np.array([-1.4, -2., 0.23, 1.2])

array([-1.4 , -2.  ,  0.23,  1.2 ])

In [23]:
state = get_state(np.array([-1.4, -2., 0.23, 1.2]))
state

(4, 3)

In [12]:
# Q = np.random.random((2,2,2,2,2))
# Q = np.random.random((2,2,2))
# Q

array([[[0.1433992 , 0.09291489],
        [0.96891188, 0.15985174]],

       [[0.38525869, 0.26707061],
        [0.6799751 , 0.84042969]]])

In [24]:
Q = np.zeros((5, 5) + (env.action_space.n,))
Q.shape

(5, 5, 2)

In [25]:
def learning_rate(n : int , min_rate=0.01 ) -> float  :
    """Decaying learning rate"""
    return max(min_rate, min(1.0, 1.0 - math.log10((n + 1) / 25)))

In [26]:
def new_Q_value( reward : float ,  new_state : tuple , discount_factor=1 ) -> float:
    """Temperal diffrence for updating Q-value of state-action pair"""
    future_optimal_value = np.max(Q[new_state])
    learned_value = reward + discount_factor * future_optimal_value
    return learned_value

# Simulación

In [27]:
rounds = 0
for i in range(1000):
    obs,_ = env.reset()
    done = False
    print(i - 1, '  ->  ', rounds)
    rounds = 0
    while not done:
        state = get_state(obs)
        policy = epsilon_greedy_policy(state, Q, 0.2)
        print(policy)
        obs, reward, done, info, _ = env.step(policy)
        rounds += 1

        new_state = get_state(obs)
        
        lr = learning_rate(i)
        print("lr value ", lr)

        learnt_value = new_Q_value(reward , new_state )
        print("Learnt value ", learnt_value)

        old_value = Q[state][policy]
        print("Old value ", old_value)

        new_pol = (1-lr)*old_value + lr*learnt_value
        print("new_pol value ", new_pol)

        Q[state][policy] =new_pol

        # print(obs)
        # print('->', state, policy, reward, obs, done, info)
        env.render()
        time.sleep(0.02)
        if done:
            env.reset()

obs,_ = env.reset()
done = False
rounds = 0
env.close()

while not done:
    state = get_state(obs)
    policy = optimal_policy(state, Q)
    obs, reward, done, info, _ = env.step(policy)
    rounds += 1
    env.render()
    time.sleep(0.05)
    if done:
        env.reset()
print('Optimal  ->  ', rounds)

-1   ->   0
0
lr value  1.0
Learnt value  1.0
Old value  0.0
new_pol value  1.0
0
lr value  1.0
Learnt value  1.0
Old value  1.0
new_pol value  1.0
0
lr value  1.0
Learnt value  1.0
Old value  0.0
new_pol value  1.0
0
lr value  1.0
Learnt value  2.0
Old value  1.0
new_pol value  2.0
0
lr value  1.0
Learnt value  3.0
Old value  2.0
new_pol value  3.0
0
lr value  1.0
Learnt value  4.0
Old value  3.0
new_pol value  4.0
0
lr value  1.0
Learnt value  5.0
Old value  4.0
new_pol value  5.0
0
lr value  1.0
Learnt value  6.0
Old value  5.0
new_pol value  6.0
0
lr value  1.0
Learnt value  7.0
Old value  6.0
new_pol value  7.0
0
lr value  1.0
Learnt value  1.0
Old value  7.0
new_pol value  1.0
0   ->   10
0
lr value  1.0
Learnt value  2.0
Old value  0.0
new_pol value  2.0
0
lr value  1.0
Learnt value  2.0
Old value  1.0
new_pol value  2.0
0
lr value  1.0
Learnt value  3.0
Old value  2.0
new_pol value  3.0
0
lr value  1.0
Learnt value  4.0
Old value  3.0
new_pol value  4.0
0
lr value  1.0
Learnt v

KeyboardInterrupt: 