In [1]:
!pip install gym numpy


Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting gym-notices>=0.0.4
  Downloading gym_notices-0.0.8-py3-none-any.whl (3.0 kB)
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gym: filename=gym-0.26.2-py3-none-any.whl size=827633 sha256=9e19593dce829eb46e66b1acae53b3ee88fe3bbcd44b1e66f9986e1b7fe7c829
  Stored in directory: /home/codespace/.cache/pip/wheels/b9/22/6d/3e7b32d98451b4cd9d12417052affbeeeea012955d437da1da
Successfully built gym
Installing collected packages: gym-notices, gym
Successfully installed gym-0.26.2 gym-notices-0.0.8

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new rele

In [10]:
import gym
import numpy as np


To implement Q-learning, we need to create a Q-table for state-action pairs and update the Q-values iteratively based on the agent's experiences.

In [11]:
def init_q_table(num_states, num_actions):
    return np.zeros((num_states, num_actions))

def update_q_table(q_table, state, action, reward, next_state, alpha, gamma):
    q_table[state, action] += alpha * (reward + gamma * np.max(q_table[next_state]) - q_table[state, action])


Let's discretize the state space to reduce the number of state-action pairs:

In [12]:
def discretize_state(obs, bins):
    state = 0
    for i in range(len(obs)):
        state *= len(bins[i])
        state += np.digitize(obs[i], bins[i]) - 1
    return state


Now let's create an epsilon-greedy exploration strategy:

In [13]:
def choose_action(q_table, state, epsilon, num_actions):
    if np.random.random() < epsilon:
        return np.random.randint(num_actions)
    else:
        return np.argmax(q_table[state])


Now we're ready to train the agent using Q-learning:

In [14]:
# Initialize environment and parameters
env = gym.make('CartPole-v0')
bins = [np.linspace(-2.4, 2.4, 9), np.linspace(-3, 3, 9), np.linspace(-0.5, 0.5, 9), np.linspace(-2, 2, 9)]


num_states = np.prod([len(b) - 1 for b in bins])
num_actions = env.action_space.n

alpha = 0.1
gamma = 0.99
epsilon = 1
epsilon_decay = 0.995
episodes = 1000

q_table = init_q_table(num_states, num_actions)

# Train the agent
for episode in range(episodes):
    obs = env.reset()
    state = discretize_state(obs, bins)
    done = False

    while not done:
        action = choose_action(q_table, state, epsilon, num_actions)
        next_obs, reward, done, _ = env.step(action)
        next_state = discretize_state(next_obs, bins)

        update_q_table(q_table, state, action, reward, next_state, alpha, gamma)

        state = next_state

    epsilon *= epsilon_decay

    if episode % 100 == 0:
        print(f'Episode {episode}: epsilon={epsilon}')


TypeError: '<' not supported between instances of 'dict' and 'dict'

In [9]:
!pip uninstall -y numpy
!pip install numpy==1.19.5


Found existing installation: numpy 1.23.5
Uninstalling numpy-1.23.5:
  Successfully uninstalled numpy-1.23.5
Collecting numpy==1.19.5
  Downloading numpy-1.19.5.zip (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: numpy
  Building wheel for numpy (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for numpy [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[930 lines of output][0m
  [31m   [0m Running from numpy source directory.
  [31m   [0m Cythonizing sources
  [31m   [0m numpy/random/_bounded_integers.pxd.in h

After training, the agent can now balance the pole for longer periods of time. You can visualize the agent's performance by rendering the environment:

In [None]:
for _ in range(5):
    obs = env.reset()
    done = False
    while not done:
        env.render()
        state = discretize_state(obs, bins)
        action = choose_action(q_table, state, 0, num_actions)
        obs, _, done, _ = env.step(action)

env.close()
