In [1]:
# Imports 
import gym
import numpy as np


In [2]:
# Hyperparameters for the Q-learning agent, gathered in one place for tuning.

# Training schedule: the learning loop runs this many episodes and renders
# (and announces) every SHOW_EVERY-th one.
TRAINING_EPISODES = 25_000
SHOW_EVERY = 2_000

# Update rule: step size and discount passed to the Q-value update.
LEARNING_RATE = 0.1
DISCOUNT_FACTOR = 0.95

In [3]:
# Create the mountain car environment; every later cell shares this `env`.
env = gym.make("MountainCar-v0")

  logger.warn(


In [4]:
# Keep handles on the environment's observation and action spaces, then
# report them so their bounds and sizes are visible in the notebook.
obs_space, action_space = env.observation_space, env.action_space

print(f"The observation space: {obs_space}")
print(f"The action space: {action_space}")

The observation space: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
The action space: Discrete(3)


We now want to define a Q table, which stores the quality (expected return) of every possible action in every possible state. In most cases we would not know the bounds of the observation space, but this gym environment tells us what they are.

In [5]:
# Show the upper bounds (position, momentum) on the first line and the
# lower bounds on the second.
print(obs_space.high, obs_space.low, sep="\n")

[0.6  0.07]
[-1.2  -0.07]


This observation space is continuous, thus we cannot make a table out of it. We must discretise the observation space to make a Q table.

In [6]:
# Number of bins along the position axis and the width of one position bin
pos_obs = 20
dx = (obs_space.high[0] - obs_space.low[0]) / pos_obs

# Number of bins along the momentum axis and the width of one momentum bin
mom_obs = 20
dp = (obs_space.high[1] - obs_space.low[1]) / mom_obs

# Bin edges along each axis after discretisation.
# np.linspace is used instead of np.arange: with a floating-point step,
# arange can return one element more or fewer than intended and overshoot
# the upper bound, whereas linspace always returns exactly n_bins + 1 edges
# ending on the upper bound.
disc_pos = np.linspace(obs_space.low[0], obs_space.high[0], pos_obs + 1)
disc_mom = np.linspace(obs_space.low[1], obs_space.high[1], mom_obs + 1)

# Every (position, momentum) edge combination, stacked into one array of
# shape (2, mom_obs + 1, pos_obs + 1)
disc_space = np.array(np.meshgrid(disc_pos, disc_mom))

# The Q table has one entry per (position bin, momentum bin, action).
# Initialise it with zeros.
q_table = np.zeros((pos_obs,
                    mom_obs,
                    env.action_space.n))

In [7]:
def discrete_state_index(
    continuous_state: tuple,
    dx: float,
    dp: float,
    low=None,
    high=None,
):
    """Map a continuous (position, momentum) state to its discrete bin index.

    Each component is offset by the lower bound of the observation space and
    floor-divided by the bin width. The result is then clipped to the valid
    bin range so it can always be used to index the Q table: a value sitting
    exactly on the upper bound would otherwise land one past the last bin,
    and a value below the lower bound would produce a negative index that
    silently wraps around.

    Args:
        continuous_state: The (position, momentum) observation.
        dx: Width of one position bin.
        dp: Width of one momentum bin.
        low: Lower bounds of (position, momentum). Defaults to the lower
            bounds of the global ``env`` observation space.
        high: Upper bounds of (position, momentum). Defaults to the upper
            bounds of the global ``env`` observation space.

    Returns:
        A 2-tuple of integer bin indices (position bin, momentum bin).
    """
    if low is None:
        low = env.observation_space.low
    if high is None:
        high = env.observation_space.high
    low = np.asarray(low)
    high = np.asarray(high)
    step = np.asarray([dx, dp])

    # Number of bins per axis, recovered from the bounds and the bin widths.
    # Rounding absorbs floating-point error in (high - low) / step.
    n_bins = np.maximum(np.rint((high - low) / step).astype(int), 1)

    raw_index = ((np.asarray(continuous_state) - low) // step).astype(int)
    # Keep the index inside [0, n_bins - 1] on both axes
    disc_state_index = tuple(np.clip(raw_index, 0, n_bins - 1))

    return disc_state_index

In [8]:
def new_q_value(
    q_table: np.array,
    current_state_index: float,
    new_state_index: float,
    action,
    reward: float,
    learning_rate: float,
    discount_factor: float,
):
    """Return the updated Q value for one (state, action) pair.

    Applies the Q-learning update

        Q(s, a) + learning_rate * (reward + discount_factor * max Q(s') - Q(s, a))

    with no epsilon/exploration term. The table is only read; the caller is
    responsible for writing the returned value back.

    Args:
        q_table: Table of Q values, indexed first by state index and then
            by action.
        current_state_index: Index into ``q_table`` of the state the action
            was taken from.
        new_state_index: Index into ``q_table`` of the state reached.
        action: The action that was taken.
        reward: Reward received for the step.
        learning_rate: Step size of the update.
        discount_factor: Weight given to the best Q value of the new state.

    Returns:
        The new Q value for ``(current_state_index, action)``.
    """
    old_estimate = q_table[current_state_index][action]
    best_next = np.max(q_table[new_state_index])
    td_target = reward + discount_factor * best_next

    return old_estimate + learning_rate * (td_target - old_estimate)

In [9]:
# Begin learning
for episode in range(TRAINING_EPISODES):
    # Only every SHOW_EVERY-th episode is announced and rendered
    render = episode % SHOW_EVERY == 0
    if render:
        print(f"This is episode {episode}")

    # Iterate over the episode
    done = False
    # Find what state we start in
    current_state_index = discrete_state_index(continuous_state=env.reset(),
                                               dx=dx,
                                               dp=dp)

    while not done:
        # Find the best estimate action currently (purely greedy, no epsilon)
        action = np.argmax(q_table[current_state_index])
        # Take a step, find the new state, reward and if we are done
        new_state, reward, done, _ = env.step(action)
        # Render
        if render:
            env.render()

        # Check we didn't succeed
        if not done:
            # Find the index in the q table corresponding to the new state
            new_state_index = discrete_state_index(continuous_state=new_state,
                                                   dx=dx,
                                                   dp=dp)
            # Update the q table
            q_table[current_state_index][action] = new_q_value(
                q_table=q_table,
                current_state_index=current_state_index,
                new_state_index=new_state_index,
                action=action,
                reward=reward,
                learning_rate=LEARNING_RATE,
                discount_factor=DISCOUNT_FACTOR
            )
            # Move on to the new state
            current_state_index = new_state_index

        # The episode ended by reaching the goal: assign the (state, action)
        # pair that got us there a Q value of 0. This is written through
        # current_state_index, the state the action was taken from;
        # new_state_index is not recomputed on the terminating step and
        # would be stale (or unbound) here.
        elif new_state[0] >= env.goal_position:
            q_table[current_state_index][action] = 0
        # If the episode ended without reaching the goal, no update is made.

This is episode 0
This is episode 2000
This is episode 4000
This is episode 6000
This is episode 8000
This is episode 10000


KeyboardInterrupt: 