In [None]:
import numpy as np
import gymnasium as gym
from gymnasium.utils.save_video import save_video, capped_cubic_video_schedule
from progressbar import progressbar
import matplotlib.pyplot as plt

In [None]:
# Plain (non-rendering) MountainCar environment shared by the cells below.
env = gym.make(id='MountainCar-v0')

In [None]:
# Resolution of the discretised state grid and the number of discrete actions.
position_count = velocity_count = 32
action_space = 3

# (low, high) range covered by the bin edges of each observation component
# (presumably MountainCar-v0's observation bounds - confirm against the env).
position_space = (-1.2, 0.6)
velocity_space = (-0.07, 0.07)

# Evenly spaced bin edges, both range endpoints included.
position_low, position_high = position_space
position_bins = np.linspace(position_low, position_high, num=position_count)

velocity_low, velocity_high = velocity_space
velocity_bins = np.linspace(velocity_low, velocity_high, num=velocity_count)

In [None]:
# Tabular action-value estimates, indexed as Q[position_bin, velocity_bin, action];
# every entry starts at zero.
q_table_shape = (position_count, velocity_count, action_space)
Q = np.zeros(q_table_shape)

In [None]:
def get_state(observation, position_edges=None, velocity_edges=None):
    """Map a continuous (position, velocity) observation to discrete bin indices.

    Args:
        observation: Pair ``(position, velocity)``.
        position_edges: Monotonically increasing bin edges for the position.
            Defaults to the module-level ``position_bins``.
        velocity_edges: Monotonically increasing bin edges for the velocity.
            Defaults to the module-level ``velocity_bins``.

    Returns:
        Tuple ``(position_index, velocity_index)`` of Python ints, each in
        ``[0, len(edges) - 1]``, suitable for indexing the Q-table.
    """
    if position_edges is None:
        position_edges = position_bins
    if velocity_edges is None:
        velocity_edges = velocity_bins

    position, velocity = observation

    # np.digitize returns 0 for values below the first edge, so the raw
    # "digitize - 1" index becomes -1 there and would silently wrap around to
    # the LAST bin when used as an array index. This is reachable in practice
    # if observations are float32 (as Gymnasium envs commonly return them):
    # float32(-1.2) is slightly smaller than the float64 edge -1.2.
    # Clamping keeps every observation inside the table.
    position_index = np.clip(np.digitize(position, position_edges) - 1,
                             0, len(position_edges) - 1)
    velocity_index = np.clip(np.digitize(velocity, velocity_edges) - 1,
                             0, len(velocity_edges) - 1)
    return int(position_index), int(velocity_index)

In [None]:
# --- Hyperparameters ---------------------------------------------------------
number_of_episodes = 5000
max_steps = 1000  # per-episode step budget; also used as the env time limit below

alpha = 0.1    # learning rate of the tabular update
epsilon = 1.0  # exploration probability, decayed linearly once per episode
gamma = 0.99   # discount factor

epsilon_min = 0.01                      # exploration floor
epsilon_decay = 2 / number_of_episodes  # floor is reached roughly halfway through training

step_sum = 0  # environment steps taken so far, summed over all episodes

rewards = []  # total (undiscounted) reward of every finished episode


def make_env(record=False):
    """Create a MountainCar-v0 env whose time limit equals ``max_steps``.

    Passing ``max_episode_steps`` makes the env's own truncation agree with the
    step budget of the training loop, so ``step()`` is never called on an
    episode the env already reported as truncated.

    Args:
        record: When True the env is created with
            ``render_mode='rgb_array_list'`` so that ``env.render()`` returns
            the frames of the episode for ``save_video``.
    """
    if record:
        return gym.make('MountainCar-v0', render_mode='rgb_array_list',
                        max_episode_steps=max_steps)
    return gym.make('MountainCar-v0', max_episode_steps=max_steps)


# Replace the env created in the earlier cell by one that uses the time limit above.
env.close()
env = make_env()

for episode in progressbar(range(number_of_episodes)):
    record = capped_cubic_video_schedule(episode)
    if record:
        # Close the current env before replacing it so it is not leaked.
        env.close()
        env = make_env(record=True)

    observation, _ = env.reset()
    state = get_state(observation)

    # Linear decay that never drops below the floor.
    epsilon = max(epsilon_min, epsilon - epsilon_decay)
    reward_sum = 0
    episode_start_step = step_sum  # global index of this episode's first step

    for step in range(max_steps):
        # Epsilon-greedy action selection.
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])

        new_observation, reward, terminated, truncated, _ = env.step(action)
        new_state = get_state(new_observation)

        # Q-learning update. A terminal state has no future return, so do not
        # bootstrap from it: its bin may be shared with non-terminal positions
        # whose Q-values are non-zero. Truncation (time limit) is not terminal,
        # so bootstrapping is kept in that case.
        future_value = 0.0 if terminated else gamma * np.max(Q[new_state])
        td_error = reward + future_value - Q[state][action]
        Q[state][action] += alpha * td_error

        step_sum += 1
        reward_sum += reward

        state = new_state

        if terminated or truncated:
            break

    if record:
        save_video(env.render(), "videos", fps=env.metadata["render_fps"],
                   step_starting_index=episode_start_step,
                   episode_index=episode)
        # Drop the rendering env and go back to the cheaper non-rendering one.
        env.close()
        env = make_env()

    # Append first so the running average below is never taken over an empty
    # list (which yields nan plus a RuntimeWarning at episode 0).
    rewards.append(reward_sum)
    if episode % 100 == 0:
        print(np.average(rewards[-100:]))

In [None]:
# Learning curve: total reward collected in each training episode.
plt.plot(rewards)
plt.xlabel("episode")
plt.ylabel("total reward")
plt.title("Reward per training episode");

In [None]:
# Greedy state value max_a Q[s, a] for every discretised state.
# Q is indexed [position_bin, velocity_bin, action], so rows are position bins
# and columns are velocity bins.
plt.imshow(np.max(Q, axis=2))
plt.colorbar(label="max over actions of Q")
plt.xlabel("velocity bin")
plt.ylabel("position bin")
plt.title("Learned state values");

In [None]:
# Greedy policy: index of the highest-valued action in every discretised state.
# Q is indexed [position_bin, velocity_bin, action], so rows are position bins
# and columns are velocity bins.
plt.imshow(np.argmax(Q, axis=2))
plt.colorbar(label="greedy action index")
plt.xlabel("velocity bin")
plt.ylabel("position bin")
plt.title("Greedy policy");