In [3]:
import gym
import numpy as np

# Create the CartPole-v1 environment and read the sizes of its spaces.
env = gym.make('CartPole-v1')
n_actions = env.action_space.n  # number of discrete actions; sizes the last axis of q_table
n_states = env.observation_space.shape[0]  # length of the observation vector (not used in the code visible here)

# Q-learning parameters
alpha = 0.1  # learning rate: fraction of the TD error applied on each update
gamma = 0.99  # discount factor applied to the best next-state value
epsilon = 0.1  # probability of taking a random (exploratory) action
episodes = 500  # number of training episodes to run

# Discretize state space
def discretize(obs, bins=(6, 12, 6, 12)):
    """Convert a continuous observation into a tuple of integer bin indices.

    Each component is rescaled linearly from its [low, high] range onto
    ``0 .. bins[i] - 1``, rounded to the nearest integer and clamped to
    that range, so out-of-range values land in the first or last bin.

    Args:
        obs: Indexable sequence of observation components.
        bins: Number of bins for each component, by position.

    Returns:
        A tuple with one bin index per component of ``obs``; it can be used
        directly as an index into the leading axes of ``q_table``.
    """
    space = env.observation_space
    # Components 0 and 2 take their limits from the observation space;
    # components 1 and 3 use fixed limits (+-0.5 and +-50 degrees in radians).
    highs = [space.high[0], 0.5, space.high[2], np.radians(50)]
    lows = [space.low[0], -0.5, space.low[2], -np.radians(50)]

    cells = []
    for i, value in enumerate(obs):
        low = lows[i]
        top = bins[i] - 1  # highest valid index for this component
        # abs(low) shifts the value so that `low` maps to 0; this assumes
        # each lower bound is <= 0 - TODO confirm for the space-derived ones.
        fraction = (value + abs(low)) / (highs[i] - low)
        cell = int(round(top * fraction))
        # Clamp into [0, top].
        if cell < 0:
            cell = 0
        if cell > top:
            cell = top
        cells.append(cell)
    return tuple(cells)

# Tabular action values: one entry per (discretised state, action) pair.
# The first four dimensions must match the default `bins` of `discretize`.
q_table = np.zeros((6, 12, 6, 12, n_actions))

for episode in range(episodes):
    # reset() returns (observation, info); only the observation is needed.
    state = discretize(env.reset()[0])
    done = False
    while not done:
        # Epsilon-greedy policy: explore with probability `epsilon`,
        # otherwise take the action with the highest current estimate.
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        obs, reward, terminated, truncated, _ = env.step(action)
        next_state = discretize(obs)
        done = terminated or truncated

        # Q-learning target.  When the episode has *terminated* there is no
        # future return, so the target is the immediate reward alone;
        # bootstrapping here would leak value from non-terminal visits to the
        # same discretised cell.  A merely *truncated* episode (time limit)
        # still bootstraps, because the underlying task would have continued.
        if terminated:
            td_target = reward
        else:
            td_target = reward + gamma * np.max(q_table[next_state])

        q_table[state + (action,)] += alpha * (td_target - q_table[state + (action,)])
        state = next_state

print("Training finished.")

Training finished.
