https://gymnasium.farama.org/environments/classic_control/mountain_car/

In [None]:
!pip install gymnasium

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gymnasium
  Downloading gymnasium-0.27.1-py3-none-any.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 KB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jax-jumpy>=0.2.0
  Downloading jax_jumpy-0.2.0-py3-none-any.whl (11 kB)
Collecting gymnasium-notices>=0.0.1
  Downloading gymnasium_notices-0.0.1-py3-none-any.whl (2.8 kB)
Installing collected packages: gymnasium-notices, jax-jumpy, gymnasium
Successfully installed gymnasium-0.27.1 gymnasium-notices-0.0.1 jax-jumpy-0.2.0


In [None]:
import gymnasium as gym
import numpy as np

In [None]:
# Create the MountainCar-v0 environment; this single `env` object is reused by every cell below.
env = gym.make("MountainCar-v0")
# Initial reset; as the last expression of the cell, its return value is shown as the cell output.
env.reset()

(array([-0.51335526,  0.        ], dtype=float32), {})

In [None]:
# Number of discretization buckets per observation dimension (20 for each dimension).
DISC_OBS_SIZE = [20 for _ in env.observation_space.high]
# Width of one bucket along each dimension: the observation range divided evenly by the bucket count.
disc_obs_win_size = np.divide(
    env.observation_space.high - env.observation_space.low,
    DISC_OBS_SIZE,
)

In [None]:
# Q-table with one axis per observation dimension plus a trailing axis over actions,
# initialised to zero everywhere.
q_table = np.zeros((*DISC_OBS_SIZE, env.action_space.n))
q_table.shape

(20, 20, 3)

## Greedy policy

### Training

In [None]:
# Settings for the greedy training run.
EPISODES = 5_000       # number of training episodes
GAMMA = 0.9            # discount applied to the best next-state Q-value in update_q_table
ALPHA = 0.5            # step size of each Q-table update
n_success_train = 0    # incremented by the training loop for every terminated episode

In [None]:
#pos = np.linspace(env.observation_space.low[0], env.observation_space.high[0], 20)
#vel = np.linspace(env.observation_space.low[1], env.observation_space.high[1], 20)

In [None]:
def get_state(obs):
  """Convert a continuous observation into a tuple of Q-table bucket indices.

  Args:
    obs: observation array as returned by `env.reset()[0]` / `env.step(...)[0]`.

  Returns:
    A tuple with one integer index per observation dimension, each guaranteed to lie in
    `[0, DISC_OBS_SIZE[i] - 1]` so it can index the leading axes of `q_table`.
  """
  # Position of the observation inside its range, measured in bucket widths.
  scaled = (obs - env.observation_space.low) / disc_obs_win_size
  # Rounding maps values near the upper bound of a range to DISC_OBS_SIZE[i], which is one
  # past the last valid index; clip so such observations fall into the last bucket instead
  # of raising IndexError when used on q_table.
  last_bucket = np.array(DISC_OBS_SIZE) - 1
  buckets = np.clip(scaled.round().astype(int), 0, last_bucket)
  return tuple(buckets)

In [None]:
def policy_greedy(state):
  """Return the action with the highest Q-value for `state`.

  `state` is used to index the global `q_table`; the result is whatever `np.argmax`
  returns for the selected action values (the first maximum on ties).
  """
  action_values = q_table[state]
  return np.argmax(action_values)

In [None]:
def update_q_table(state, new_state, action, reward):
  """Apply one Q-learning update to the global `q_table`, in place.

  Args:
    state: bucket indices of the state the action was taken in.
    new_state: bucket indices of the state reached after the action.
    action: index of the action that was taken.
    reward: reward to learn from for this transition.
  """
  current = q_table[state][action]
  # Target = reward plus the discounted best Q-value available from the new state.
  target = reward + GAMMA * np.max(q_table[new_state])
  # Move the stored value a fraction ALPHA of the way towards the target.
  q_table[state][action] = current + ALPHA * (target - current)

In [None]:
# Train the Q-table for EPISODES episodes, always acting greedily w.r.t. the current table.
for _ in range(EPISODES):
  # reset() returns (observation, info); only the observation is discretized.
  state = get_state(env.reset()[0])
  term = False
  trunc = False

  # Step until the environment reports termination or truncation.
  while not term and not trunc:
    # Previously `policy(state)`: no `policy` is defined anywhere in the notebook, so the
    # cell raised NameError on a fresh kernel. This section trains with the greedy policy.
    action = policy_greedy(state)

    obs, reward, term, trunc, info = env.step(action)
    new_state = get_state(obs)

    if not term:
      update_q_table(state, new_state, action, reward)
    else:
      # Terminated step: learn from a reward of 0 instead of the environment's reward
      # and count the episode as a success.
      # NOTE(review): update_q_table still adds GAMMA * max(q_table[new_state]) here —
      # confirm bootstrapping from the terminal state is intended.
      update_q_table(state, new_state, action, 0)
      n_success_train += 1

    state = new_state

# NOTE(review): `env` is used again by later cells after this close() — confirm this is intended.
env.close()

In [None]:
# Share of training episodes that terminated, as a percentage rounded to one decimal.
success_rate_pct = round(n_success_train / EPISODES * 100, 1)
print(f"Success rate =  {success_rate_pct}%")

Success rate =  81.0%


In [None]:
# Persist the Q-table learned with the greedy policy.
np.save(file="mountain_car_q_table_greedy.npy", arr=q_table)

### Test

In [None]:
# Settings for the evaluation run.
n_success_test = 0   # incremented by the test loop for every terminated episode
EPISODES = 500       # number of evaluation episodes (rebinds the training value)

In [None]:
# Evaluate the trained Q-table: run EPISODES episodes without any learning updates.
for _ in range(EPISODES):
  # reset() returns (observation, info); only the observation is discretized.
  state = get_state(env.reset()[0])
  term = False
  trunc = False

  # Step until the environment reports termination or truncation.
  while not term and not trunc:
    # Previously `policy(state)`: no `policy` is defined anywhere in the notebook, so the
    # cell raised NameError on a fresh kernel. Evaluation acts greedily on the Q-table.
    action = policy_greedy(state)

    obs, reward, term, trunc, info = env.step(action)
    new_state = get_state(obs)

    # An episode counts as a success only when it terminated (not when it was truncated).
    if term:
      n_success_test += 1

    state = new_state

# NOTE(review): `env` is used again by later cells after this close() — confirm this is intended.
env.close()

In [None]:
# Share of evaluation episodes that terminated, as a percentage rounded to one decimal.
success_rate_pct = round(n_success_test / EPISODES * 100, 1)
print(f"Success rate =  {success_rate_pct}%")

Success rate =  100.0%


### Single-episode inference

In [None]:
# Settings for the single-episode rollout.
action_seq = list()   # actions chosen during the rollout, in order
EPISODES = 1          # run exactly one episode

In [None]:
# Roll out the trained Q-table and record every action that is taken.
for _ in range(EPISODES):
  # reset() returns (observation, info); only the observation is discretized.
  state = get_state(env.reset()[0])
  term = False
  trunc = False

  # Step until the environment reports termination or truncation.
  while not term and not trunc:
    # Previously `policy(state)`: no `policy` is defined anywhere in the notebook, so the
    # cell raised NameError on a fresh kernel. Inference acts greedily on the Q-table.
    action = policy_greedy(state)
    action_seq.append(action)

    obs, reward, term, trunc, info = env.step(action)
    new_state = get_state(obs)
    state = new_state

# NOTE(review): `env` is used again by later cells after this close() — confirm this is intended.
env.close()

In [None]:
# Report how many actions the rollout took, followed by the full sequence.
n_actions = len(action_seq)
print(f"Actions ({n_actions}): {action_seq}")

Actions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 0]


## Epsilon-greedy policy with epsilon decay

### Training

In [None]:
# Settings for the epsilon-greedy training run.
# NOTE(review): q_table is not re-created in this section, so training appears to continue
# from the table learned in the greedy section — confirm this is intended.
EPISODES = 5_000                  # number of training episodes
GAMMA = 0.9                       # discount applied to the best next-state Q-value
ALPHA = 0.5                       # step size of each Q-table update
n_success_train = 0               # incremented by the training loop for every terminated episode
epsilon = 1.0                     # initial probability of taking a random action
eps_decay = epsilon / EPISODES    # amount subtracted from epsilon after every episode

In [None]:
def policy_esp_greedy(state):
  """Pick an action for `state` using the epsilon-greedy rule.

  A uniform random number is drawn on every call. If it is below the global `epsilon`,
  a random action is sampled from the environment's action space; otherwise the action
  with the highest Q-value for `state` is returned.
  """
  # Exploit when the draw is not below epsilon ...
  if np.random.random() >= epsilon:
    return np.argmax(q_table[state])
  # ... otherwise explore with a randomly sampled action.
  return env.action_space.sample()

In [None]:
# Train the Q-table for EPISODES episodes with an epsilon-greedy policy whose epsilon
# shrinks by eps_decay after every episode.
for _ in range(EPISODES):
  # reset() returns (observation, info); only the observation is discretized.
  state = get_state(env.reset()[0])
  term = False
  trunc = False

  # Step until the environment reports termination or truncation.
  while not term and not trunc:
    # Previously `policy(state)`: no `policy` is defined anywhere in the notebook, so the
    # cell raised NameError on a fresh kernel. This section trains with epsilon-greedy.
    action = policy_esp_greedy(state)

    obs, reward, term, trunc, info = env.step(action)
    new_state = get_state(obs)

    if not term:
      update_q_table(state, new_state, action, reward)
    else:
      # Terminated step: learn from a reward of 0 instead of the environment's reward
      # and count the episode as a success.
      update_q_table(state, new_state, action, 0)
      n_success_train += 1

    state = new_state

  # Linear decay, floored at 0 so epsilon never becomes negative.
  epsilon = max(epsilon - eps_decay, 0)

# NOTE(review): `env` is used again by later cells after this close() — confirm this is intended.
env.close()

In [None]:
# Share of training episodes that terminated, as a percentage rounded to one decimal.
success_rate_pct = round(n_success_train / EPISODES * 100, 1)
print(f"Success rate =  {success_rate_pct}%")

Success rate =  57.7%


In [None]:
# Persist the Q-table learned with the epsilon-greedy policy.
np.save(file="mountain_car_q_table_eps_greedy.npy", arr=q_table)

### Test

In [None]:
# Settings for the evaluation run.
n_success_test = 0   # incremented by the test loop for every terminated episode
EPISODES = 500       # number of evaluation episodes (rebinds the training value)

In [None]:
# Evaluate the trained Q-table: act greedily for EPISODES episodes, with no learning updates.
for _ in range(EPISODES):
  term = trunc = False
  # reset() returns (observation, info); only the observation is discretized.
  state = get_state(env.reset()[0])

  # Keep stepping until the environment reports termination or truncation.
  while not (term or trunc):
    action = policy_greedy(state)

    obs, reward, term, trunc, info = env.step(action)
    new_state = get_state(obs)

    # Only terminated episodes count as successes; truncated ones add nothing.
    n_success_test += 1 if term else 0

    state = new_state

env.close()

In [None]:
# Share of evaluation episodes that terminated, as a percentage rounded to one decimal.
success_rate_pct = round(n_success_test / EPISODES * 100, 1)
print(f"Success rate =  {success_rate_pct}%")

Success rate =  80.0%


### Single-episode inference

In [None]:
# Settings for the single-episode rollout.
action_seq = list()   # actions chosen during the rollout, in order
EPISODES = 1          # run exactly one episode

In [None]:
# Roll out the trained Q-table greedily and record every action that is taken.
for _ in range(EPISODES):
  term = trunc = False
  # reset() returns (observation, info); only the observation is discretized.
  state = get_state(env.reset()[0])

  # Keep stepping until the environment reports termination or truncation.
  while not (term or trunc):
    action = policy_greedy(state)
    action_seq.append(action)

    obs, reward, term, trunc, info = env.step(action)
    # Discretize the new observation and make it the current state for the next step.
    state = new_state = get_state(obs)

env.close()

In [None]:
# Report how many actions the rollout took, followed by the full sequence.
n_actions = len(action_seq)
print(f"Actions ({n_actions}): {action_seq}")

Actions (200): [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1]
