
# CartPole (Gym/Gymnasium): Tabular Q-Learning with Discretization

If neither Gymnasium nor Gym is installed, install one of:
- `pip install gymnasium[classic-control]`
- `pip install gym[classic_control]`


In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Build a CartPole-v1 environment, preferring Gymnasium and falling back to
# Gym. Both `env` and `GYMNASIUM` stay None when neither attempt succeeds.
env = None
GYMNASIUM = None

try:
    import gymnasium as gym
    env = gym.make("CartPole-v1")
    GYMNASIUM = True
except Exception:
    # Deliberate best-effort: any failure here just triggers the Gym fallback.
    pass

if env is None:
    try:
        import gym
        env = gym.make("CartPole-v1")
        GYMNASIUM = False
    except Exception:
        print("Gym/Gymnasium not available. Install to run training.")


In [None]:

# Number of discrete cells per observation component; this tuple is also the
# leading shape of the Q-table.
# NOTE(review): component order is presumably (cart position, cart velocity,
# pole angle, pole angular velocity) -- confirm against the environment docs.
num_bins = (6, 6, 12, 12)

# Symmetric clipping range applied to observations before discretization.
obs_low = np.array([-4.8, -5.0, -0.418, -5.0])
obs_high = -obs_low

def create_bins(low, high, bins):
    """Return the interior bin edges for each observation dimension.

    Dimension ``i`` is split into ``bins[i]`` equal-width cells spanning
    ``[low[i], high[i]]``. Only the ``bins[i] - 1`` interior edges are
    returned, so ``np.digitize(x, edges)`` yields an index in
    ``0 .. bins[i] - 1`` and every cell is reachable for values clipped to
    the range.

    Including ``low`` and ``high`` themselves as edges would leave index 0
    unreachable for clipped values and reserve the top index for
    ``x == high`` only.

    Args:
        low: Per-dimension lower bounds.
        high: Per-dimension upper bounds.
        bins: Per-dimension number of cells.

    Returns:
        A list with one 1-D array of interior edges per dimension.
    """
    return [
        np.linspace(lo, hi, n + 1)[1:-1]
        for lo, hi, n in zip(low, high, bins)
    ]

# Per-dimension bin edges that discretize() uses to map observations to cells.
bins_list = create_bins(low=obs_low, high=obs_high, bins=num_bins)

def discretize(obs, bins_list):
    """Map a continuous observation to a tuple of integer bin indices.

    Args:
        obs: Indexable observation; component ``i`` is looked up in
            ``bins_list[i]``.
        bins_list: One sequence of bin edges per component.

    Returns:
        A tuple of Python ints, one ``np.digitize`` index per entry of
        ``bins_list``.
    """
    cell = []
    for axis, edges in enumerate(bins_list):
        cell.append(int(np.digitize(obs[axis], edges)))
    return tuple(cell)


In [None]:

# Q-learning hyperparameters.
alpha = 0.1                     # learning rate for the TD update
gamma = 0.99                    # discount factor
eps_start, eps_end = 1.0, 0.05  # exploration rate at the start / after decay
eps_decay_episodes = 300        # episodes over which epsilon is annealed
episodes = 600                  # total training episodes


def epsilon(ep):
    """Return the exploration rate for episode index ``ep``.

    Epsilon is interpolated linearly from ``eps_start`` (at ``ep == 0``) to
    ``eps_end`` (at ``ep == eps_decay_episodes``) and held at ``eps_end``
    afterwards.

    A non-positive ``eps_decay_episodes`` means "no annealing" and returns
    ``eps_end`` directly rather than dividing by zero. Negative ``ep`` is
    clamped so the result never leaves the ``[eps_end, eps_start]`` segment.
    """
    if eps_decay_episodes <= 0:
        return eps_end
    frac = min(1.0, max(0.0, ep / eps_decay_episodes))
    return eps_start * (1 - frac) + eps_end * frac


In [None]:

# Tabular Q-learning on the discretized observation space, followed by a plot
# of per-episode returns and their moving average.
if env is not None:
    # One axis per discretized observation component plus a final action axis.
    Q = np.zeros(num_bins + (env.action_space.n,), dtype=float)
    returns = []

    for ep in range(episodes):
        # reset() returns (obs, info) in the newer API and a bare observation
        # in the older one; detect it from the return value instead of from
        # which package was imported.
        reset_out = env.reset()
        s = reset_out[0] if isinstance(reset_out, tuple) else reset_out
        s = np.clip(s, obs_low, obs_high)
        ds = discretize(s, bins_list)

        done = False
        ep_ret = 0.0
        e = epsilon(ep)

        while not done:
            # Epsilon-greedy action selection.
            if np.random.random() < e:
                a = env.action_space.sample()
            else:
                a = int(np.argmax(Q[ds]))

            step_out = env.step(a)
            if len(step_out) == 5:
                ns, r, terminated, truncated, info = step_out
            else:
                # Older 4-tuple API: a single `done` flag.
                # NOTE(review): legacy gym's TimeLimit wrapper is expected to
                # mark time-limit endings in info["TimeLimit.truncated"] --
                # confirm for the installed version.
                ns, r, done_flag, info = step_out
                truncated = bool(info.get("TimeLimit.truncated", False))
                terminated = bool(done_flag) and not truncated
            done = terminated or truncated

            ns = np.clip(ns, obs_low, obs_high)
            dns = discretize(ns, bins_list)

            # Only a true termination has zero future value. A time-limit
            # truncation ends the episode but must still bootstrap from the
            # next state.
            td_target = r + (0.0 if terminated else gamma * np.max(Q[dns]))
            Q[ds + (a,)] += alpha * (td_target - Q[ds + (a,)])

            ds = dns
            ep_ret += r

        returns.append(ep_ret)

    window = 20
    plt.figure()
    plt.plot(returns, label="Return per episode")
    if len(returns) >= window:
        movavg = np.convolve(returns, np.ones(window) / window, mode="valid")
        plt.plot(np.arange(window - 1, len(returns)), movavg, label=f"Moving average (window={window})")
    else:
        # Too few episodes for a full window; skip the smoothed curve.
        movavg = np.array([])
    plt.xlabel("Episode")
    plt.ylabel("Return")
    plt.title("CartPole Q-learning (tabular)")
    plt.legend()
    plt.show()
else:
    print("Environment unavailable; skip this cell after installing gym/gymnasium.")



**Try Next**
- Adjust bins, α, γ, and ε‑schedule
- Compare with SARSA
- Replace table with a tiny NN (DQN‑lite) for function approximation
