# RL for MountainCar @ OpenAI gym

importing requisite modules

In [11]:
import numpy as np
from tqdm import tqdm
import gymnasium as gym

initialising gym environment

In [12]:
env = gym.make("MountainCar-v0")
print('Action Space:', env.action_space.n)
print('Observation Space:', env.observation_space.low, '->', env.observation_space.high)

Action Space: 3
Observation Space: [-1.2  -0.07] -> [0.6  0.07]


### TABULAR Q-LEARNING

initialising Q table

In [13]:
N_x = 20
N_v = 20
Qtable = np.zeros((N_x, N_v, env.action_space.n))
print('Q-table size:', Qtable.shape)

Q-table size: (20, 20, 3)


discretised Q-table index of continuously distributed observation

In [14]:
def index_state(state):
    index = (state-env.observation_space.low)//((env.observation_space.high-env.observation_space.low)/(N_x,N_v))
    return tuple(index.astype(int))

Q-Learning hyperparameters

In [15]:
EPISODES = 30_000
rewardLog = np.zeros(EPISODES)

ALPHA    = 0.18
GAMMA    = 0.99

EPS_MAX = 0.90
EPS_MIN = 0.09

training

In [16]:
def train(s, a, r, obs):
    Qtable[index_state(s)][a] += ALPHA*(r + GAMMA*np.max(Qtable[index_state(obs)]) - Qtable[index_state(s)][a])

running episodes

In [17]:
episode = 0
for episode in tqdm(range(EPISODES), desc=f"ep: {episode}", leave=True, ncols=69):
    done = False
    state, info = env.reset(seed = 42)

    # exploration rate decay
    EPSILON = EPS_MAX - (EPS_MAX - EPS_MIN)*(episode/EPISODES)

    while not done:
        # epsilon-greedy agent
        if np.random.uniform() < EPSILON:
            action = env.action_space.sample()
        else:
            action = np.argmax(Qtable[index_state(state)])
        
        # take action
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        
        # Q-learning
        if not done:
            train(state, action, reward, new_state)
        elif new_state[0] >= env.goal_position:
            # print('success in episode', episode)
            Qtable[index_state(state)][action] = 0
        
        # updates
        state = new_state
        rewardLog[episode] += reward

env.close()

  logger.warn(
ep: 0: 100%|██████████████████| 30000/30000 [04:47<00:00, 104.33it/s]


In [18]:
env.close()

In [19]:
N = EPISODES // 10

for k in range(EPISODES//N):
    print(N*(k+1), np.mean(rewardLog[N*k:N*(k+1)]))

3000 -200.0
6000 -200.0
9000 -200.0
12000 -200.0
15000 -200.0
18000 -199.86166666666668
21000 -198.55633333333333
24000 -194.75733333333332
27000 -189.47666666666666
30000 -181.57933333333332


In [20]:
TEST = 5
test = gym.make("MountainCar-v0", render_mode='human')

for k in range(TEST):
    done = False
    state, info = test.reset(seed = 69)

    while not done:
        # greedy agent
        action = np.argmax(Qtable[index_state(state)])
        
        # take action
        state, reward, terminated, truncated, info = test.step(action)
        done = terminated or truncated

test.close()