In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
# Instantiate MountainCar with no render mode requested, then cache the
# observation-space bounds and the number of discrete actions for later cells.
env = gym.make("MountainCar-v0", render_mode=None)
obs_low, obs_high = env.observation_space.low, env.observation_space.high
n_actions = env.action_space.n

In [None]:
# Bare expression: the notebook shows the environment's repr as this cell's output.
env

In [None]:
# Bin count per observation dimension. The Q-table gets one axis per
# observation dimension plus a trailing axis indexed by action, all zeros.
bins = np.array([18, 14])
Q = np.zeros((*bins, n_actions))

In [None]:
# Map a continuous observation onto integer bin indices usable as a Q-table index.
def discretize(obs, low=None, high=None, n_bins=None):
    """Convert a continuous observation into a tuple of integer bin indices.

    Each component is rescaled to [0, 1] using the given bounds, clipped to
    [0, 0.999] so that out-of-range values and values sitting exactly on the
    upper bound still land in a valid bin, then multiplied by the bin count
    and truncated to an integer.

    Args:
        obs: Array-like observation with one entry per dimension.
        low: Per-dimension lower bounds. Defaults to the notebook-level
            ``obs_low``.
        high: Per-dimension upper bounds. Defaults to the notebook-level
            ``obs_high``.
        n_bins: Per-dimension bin counts. Defaults to the notebook-level
            ``bins``.

    Returns:
        A tuple of integer indices, one per dimension, each non-negative and
        strictly below the corresponding bin count.
    """
    # Resolve defaults at call time so later changes to the globals are honoured.
    low = obs_low if low is None else low
    high = obs_high if high is None else high
    n_bins = bins if n_bins is None else n_bins

    ratios = (obs - low) / (high - low)
    # Cap just below 1 so ratio * n_bins never reaches n_bins itself.
    ratios = np.clip(ratios, 0, 0.999)
    return tuple((ratios * n_bins).astype(int))

In [None]:
# Q-learning hyperparameters used by the training loop.
alpha = 0.1        # learning rate
gamma = 0.99       # discount factor
epsilon = 0.1      # probability of taking a random action (epsilon-greedy)
episodes = 10_000  # number of training episodes

### Q-learning Update Rule

The core update formula of Q-learning is:

$$
Q(s, a) \leftarrow Q(s, a) + \alpha \cdot \left( r + \gamma \cdot \max_{a'} Q(s', a') - Q(s, a) \right)
$$

---
**Where:**

- $Q(s, a)$: Current estimate of the Q-value for taking action $a$ in state $s$
- $\alpha$: Learning rate
- $r$: Reward received after taking action $a$
- $\gamma$: Discount factor
- $s'$: Next state
- $\max_{a'} Q(s', a')$: Maximum estimated Q-value over the actions available in the next state
---



In [None]:
# Total reward of every training episode, in order; plotted in a later cell.
reward_log = []

# Train the agent with tabular Q-learning and an epsilon-greedy behaviour policy.
print("Training the agent...")
for ep in range(episodes):
    obs, _ = env.reset()
    state = discretize(obs)
    total_reward = 0
    done = False
    count = 0
    while not done:
        # Explore with probability epsilon, otherwise act greedily w.r.t. Q.
        if np.random.rand() < epsilon:
            action = np.random.randint(n_actions)
        else:
            action = np.argmax(Q[state])

        next_obs, reward, terminated, truncated, _ = env.step(action)
        next_state = discretize(next_obs)
        done = terminated or truncated

        # TD target = immediate reward + discounted best value of the next state.
        # A terminated episode has no future, so the target is the reward alone;
        # a truncated episode (time limit) is not a true end state and still bootstraps.
        if terminated:
            target = reward
        else:
            target = reward + gamma * np.max(Q[next_state])
        Q[state][action] += alpha * (target - Q[state][action])

        state = next_state
        total_reward += reward
        count += 1

    reward_log.append(total_reward)
    # Report progress periodically; one line per episode would flood the cell output.
    if (ep + 1) % 500 == 0:
        print(f"Episode {ep+1}: Steps = {count}, Total Reward = {total_reward}")

In [None]:
# === Training curve: total reward obtained in each episode ===
fig, ax = plt.subplots()
ax.plot(reward_log)
ax.set_xlabel("Episode")
ax.set_ylabel("Total Reward")
ax.set_title("MountainCar Q-Learning Training Curve")
ax.grid(True)
plt.show()

In [None]:
# Persist the learned Q-table to disk with pickle so it can be reloaded later.
q_table_path = "q_table.pkl"
with open(q_table_path, "wb") as q_file:
    pickle.dump(Q, q_file)
print("✅ Q save as q_table.pkl")

In [None]:

# === Inference: play one episode greedily from the trained Q-table, rendered on screen ===
env = gym.make("MountainCar-v0", render_mode="human")
try:
    obs, _ = env.reset()
    state = discretize(obs)
    done = False
    total_steps = 0
    total_reward = 0

    while not done:
        # Pure exploitation: always take the action with the highest Q-value.
        action = np.argmax(Q[state])
        obs, reward, terminated, truncated, _ = env.step(action)
        state = discretize(obs)
        done = terminated or truncated
        total_steps += 1
        total_reward += reward

    # Report the episode's accumulated reward, not just the last step's reward.
    print(f"  infer steps: {total_steps}  reward: {total_reward}")
finally:
    # Always release the render window, even if the episode is interrupted.
    env.close()