In [50]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

In [51]:
env = gym.make('CartPole-v1')
print(env.observation_space.low, env.observation_space.high)

[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38] [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


In [60]:
max_obs_values = np.array([2.4, 5, 0.2099 , 4])
min_obs_values = np.array([-2.4, -5, -0.2099, -4])
print(min_obs_values, max_obs_values)

[-2.4    -5.     -0.2099 -4.    ] [2.4    5.     0.2099 4.    ]


In [61]:
DISCRETE_OBS_SPACE_SIZE = [10]* len(max_obs_values) # these are the what the max values will corresponds to
discrete_obs_space_step_size = (max_obs_values - min_obs_values) / DISCRETE_OBS_SPACE_SIZE

def discretizer(obs):
    # obs = np.array([obs[0], obs[2]])
    discrete_obs = (obs - min_obs_values)/discrete_obs_space_step_size
    return tuple(discrete_obs.astype(np.int16)) # tuple to make indexing easier


In [62]:
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EXPLORATION_RATE = 1
EXPLORATION_DECAY_RATE = 0.00001

In [63]:
q_table = np.zeros(DISCRETE_OBS_SPACE_SIZE + [env.action_space.n])
q_table.shape

(10, 10, 10, 10, 2)

In [64]:
print(q_table[0][0][0][0])
print(q_table[(0,0,0,0)]) 

[0. 0.]
[0. 0.]


In [65]:
observation, info = env.reset()
# obs = np.array([observation[0], observation[2]])
obs = observation
print(min_obs_values, max_obs_values)
print(obs)
discrete_obs = (obs - min_obs_values)/discrete_obs_space_step_size
discrete_obs = tuple(discrete_obs.astype(np.int16))
discrete_obs

[-2.4    -5.     -0.2099 -4.    ] [2.4    5.     0.2099 4.    ]
[-0.0440576   0.02953474  0.02037831  0.04577639]


(4, 5, 5, 5)

In [66]:
action = 1
q_table[discrete_obs]
# q_table[discrete_obs + (action,)]

array([0., 0.])

In [None]:
num_episodes = 100000
average_scores = []
for e in range(num_episodes):
    observation, info = env.reset()
    current_obs = discretizer(observation) 
    done = False; score_per_episode = 0
    scores = [] # list for 100 episodes
    while not done:
        action = np.argmax(q_table[current_obs])
        if EXPLORATION_RATE > np.random.random():
             action = env.action_space.sample() 
        observation, reward, done, _, _ = env.step(action)
        discrete_obs = discretizer(observation)
        # print(discrete_obs, done)
        if not done:
            max_future_q = np.max(q_table[discrete_obs])
            current_q = q_table[current_obs + (action,)]
            new_q = (1-LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[current_obs + (action,)] = new_q
            # print(new_q, current_q) # new_q isn't changing
        current_obs = discrete_obs
        score_per_episode += reward
    EXPLORATION_RATE = max(EXPLORATION_RATE - EXPLORATION_DECAY_RATE, 0)
    scores.append(score_per_episode)
        
    if e % 1000 == 0:
        average_score = np.mean(scores)
        average_scores.append(average_score)
        scores = []
        print(f"Episode {e}, exploration rate {EXPLORATION_RATE}, average score {average_score}")
    if score_per_episode > 10000:
        break
env.close()

# cart position is not really changing much for random actions, maybe i should use all the observations


Episode 0, exploration rate 0.999999, average score 14.0
Episode 1000, exploration rate 0.9989989999999712, average score 40.0
Episode 2000, exploration rate 0.9979989999999425, average score 12.0
Episode 3000, exploration rate 0.9969989999999137, average score 12.0
Episode 4000, exploration rate 0.995998999999885, average score 53.0
Episode 5000, exploration rate 0.9949989999998562, average score 25.0
Episode 6000, exploration rate 0.9939989999998274, average score 26.0
Episode 7000, exploration rate 0.9929989999997987, average score 13.0
Episode 8000, exploration rate 0.9919989999997699, average score 13.0
Episode 9000, exploration rate 0.9909989999997412, average score 9.0
Episode 10000, exploration rate 0.9899989999997124, average score 20.0
Episode 11000, exploration rate 0.9889989999996837, average score 13.0
Episode 12000, exploration rate 0.9879989999996549, average score 36.0
Episode 13000, exploration rate 0.9869989999996261, average score 24.0
Episode 14000, exploration rate

In [None]:
plt.plot(average_scores)