# Cartpole via Gymnasium


Imagine a pole hinged to a cart that moves along a horizontal track. The goal of Cartpole is to keep the pole balanced upright by moving the cart left or right.

<p align="center">
    <img src="gym_media/cartpole.gif" alt="Cartpole"/>
</p>

## Environment Dynamics (CartPole-v1)
* State space (observations): a 4-dimensional vector of continuous values:
    * Cart position (x)
    * Cart velocity (ẋ)
    * Pole angle (θ)
    * Pole angular velocity (θ̇)
* Action space: two discrete actions:
    * 0 = move cart to the left
    * 1 = move cart to the right
* Reward:
    * +1 for every timestep the pole remains upright (i.e., the episode hasn't ended).
* Episode ends when:
    * Pole falls too far (|θ| > 12°, about 0.209 rad, from vertical)
    * Cart moves too far from the center (|x| > 2.4 units)
    * Episode reaches 500 timesteps (in CartPole-v1); this is a time-limit truncation rather than a failure

## Why is Cartpole Popular?
* Simple dynamics but requires real-time decision making.
* Ideal for testing and debugging RL algorithms like Q-learning, DQN, PPO, etc.
* Deterministic dynamics (only the initial state is randomized) and quick to simulate.


In [None]:
import gymnasium as gym
import numpy as np
import math
import time
import matplotlib.pyplot as plt

In [None]:
# Training environment. render_mode=None requests no rendering, so training
# steps are not slowed down by drawing a window.
env = gym.make("CartPole-v1", render_mode=None)

In [None]:
# NUM_BINS gives the number of discretization bins used for each observation
# dimension, in this order:
# [cart position, cart velocity, pole angle, pole angular velocity]

NUM_BINS = (12, 12, 18, 18)

# Tuned Q-learning hyperparameters for convergence
alpha = 0.1  # learning rate: scales each Q-value update
gamma = 0.99  # discount factor applied to the next state's best Q-value
epsilon = 1.0  # initial probability of taking a random (exploratory) action
epsilon_decay = 0.999  # epsilon is multiplied by this once per episode
epsilon_min = 0.01  # epsilon stops decaying once it is no longer above this
episodes = 10000  # number of training episodes
max_steps = 500  # upper bound on steps taken within one episode

# Binning helper
def create_bins(low, high, num_bins):
    """Return the interior edges that split [low, high] into equal-width bins.

    The interval is divided into ``num_bins`` bins, which needs
    ``num_bins + 1`` evenly spaced edges. The two outer edges (``low`` and
    ``high``) are dropped, leaving ``num_bins - 1`` interior edges. Passed to
    ``np.digitize``, these produce indices ``0 .. num_bins - 1``, with values
    outside [low, high] landing in the first or last bin.
    """
    all_edges = np.linspace(low, high, num_bins + 1)
    interior_edges = all_edges[1:-1]
    return interior_edges

# Interior bin edges for each observation dimension. Every range is symmetric
# about zero, so only the positive bound is listed; bounds are paired with
# NUM_BINS in the order [position, velocity, angle, angular velocity].
# NOTE(review): episodes end once |x| > 2.4 or |θ| > 12° (~0.209 rad), so the
# position (±4.8) and angle (±0.418) ranges span about twice the reachable
# region and the outermost bins are rarely, if ever, visited. Consider
# tightening these ranges to gain resolution.
(
    cart_position_bins,
    cart_velocity_bins,
    pole_angle_bins,
    pole_velocity_bins,
) = (
    create_bins(-bound, bound, count)
    for bound, count in zip((4.8, 3.0, 0.418, 3.5), NUM_BINS)
)

def discretize_state(state):
    """Map a continuous observation to a 4-tuple of bin indices.

    ``state`` must unpack into exactly four values, ordered as
    (cart position, cart velocity, pole angle, pole angular velocity).
    Each value is binned with ``np.digitize`` against the module-level edge
    array for its dimension. The returned tuple can be used directly as an
    index into the Q-table.
    """
    cart_pos, cart_vel, pole_angle, pole_vel = state
    values_with_edges = (
        (cart_pos, cart_position_bins),
        (cart_vel, cart_velocity_bins),
        (pole_angle, pole_angle_bins),
        (pole_vel, pole_velocity_bins),
    )
    return tuple(np.digitize(value, edges) for value, edges in values_with_edges)

# Initialize Q-table and logging
# q_table has one axis per discretized observation dimension (sizes taken
# from NUM_BINS) plus a final axis over the actions; all entries start at 0.
q_table = np.zeros(NUM_BINS + (env.action_space.n,))
# Total reward of each training episode, appended in episode order.
reward_history = []

# Training loop: tabular Q-learning with an epsilon-greedy behaviour policy.
# Each episode runs until the environment reports termination/truncation or
# max_steps is reached; the episode's total reward is logged in reward_history.
for episode in range(episodes):
    state, _ = env.reset()
    state_disc = discretize_state(state)
    total_reward = 0

    for step in range(max_steps):
        # Explore with probability epsilon, otherwise act greedily on the Q-table.
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state_disc])

        next_state, reward, terminated, truncated, _ = env.step(action)
        next_state_disc = discretize_state(next_state)
        total_reward += reward

        # A terminated episode has no future return, so its target is just the
        # reward. Bootstrapping there would leak value from non-terminal states
        # that share the same coarse bin. Truncation (time limit) is not a true
        # terminal state, so it still bootstraps from the next state.
        best_next_value = 0.0 if terminated else np.max(q_table[next_state_disc])
        q_index = state_disc + (action,)
        td_error = reward + gamma * best_next_value - q_table[q_index]
        q_table[q_index] += alpha * td_error
        state_disc = next_state_disc

        if terminated or truncated:
            break

    # Decay exploration once per episode without dropping below epsilon_min.
    if epsilon > epsilon_min:
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

    reward_history.append(total_reward)

    # Every 100 episodes, report progress and replay one greedy episode in a
    # separate environment that renders to a window.
    if (episode + 1) % 100 == 0:
        print(f"Episode {episode+1}: reward={total_reward:.0f}, epsilon={epsilon:.3f}")
        env_vis = gym.make("CartPole-v1", render_mode="human")
        try:
            vis_state, _ = env_vis.reset()
            vis_state_disc = discretize_state(vis_state)

            for _ in range(max_steps):
                # Pure exploitation: no exploration and no learning during replay.
                vis_action = np.argmax(q_table[vis_state_disc])
                vis_state, _, vis_terminated, vis_truncated, _ = env_vis.step(vis_action)
                vis_state_disc = discretize_state(vis_state)
                time.sleep(0.01)
                if vis_terminated or vis_truncated:
                    break
            time.sleep(0.02)
        finally:
            # Release the render window even if the replay raises or is interrupted.
            env_vis.close()

env.close()
print("Training complete.")

In [None]:
# Plot the raw per-episode rewards together with a moving average
window = 50
averaging_kernel = np.ones(window) / window
rolling_avg = np.convolve(reward_history, averaging_kernel, mode='valid')
# mode='valid' yields no output for the first window - 1 episodes, so the
# averaged curve is drawn starting at episode index window - 1.
rolling_x = range(window - 1, len(reward_history))

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(reward_history, label="Reward per Episode", alpha=0.3)
ax.plot(rolling_x, rolling_avg, label=f"{window}-episode Moving Average", linewidth=2)
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.set_title("Q-Learning Performance on CartPole-v1")
ax.legend()
ax.grid(True)
plt.show()