In [1]:
import gym
import numpy as np
from tqdm import tnrange, tqdm_notebook

# Mountain Car v0

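## State variables

Each observation returned by the environment has two continuous components:

1. $x$ coordinate (horizontal position of the car)
2. velocity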

In [2]:
# Create the MountainCar-v0 environment. `env` is the shared handle used by the
# later cells to read observation bounds, the action count, and to step/render.
env = gym.make("MountainCar-v0")    

In [3]:
# Pull the lower/upper bounds of the two observation components out of the
# environment; the binning code relies on these module-level names.
obs_space = env.observation_space
x_low, vel_low = obs_space.low
x_high, vel_high = obs_space.high

print(f"Range of x: {x_low} - {x_high}")
print(f"Range of velocity: {vel_low} - {vel_high}")

Range of x: -1.2000000476837158 - 0.6000000238418579
Range of velocity: -0.07000000029802322 - 0.07000000029802322


In [4]:
# Number of discrete bins per observation component; these also fix the first
# two dimensions of the Q-table.
X_BIN_COUNT = 20
VEL_BIN_COUNT = 20
# Number of discrete actions reported by the environment (last Q-table axis).
action_space_dim = env.action_space.n
# Weight applied to the best next-state Q-value in the Q-learning target.
DISCOUNT_FACTOR = 0.95
# Blend factor between the old Q-value and the new target on each update.
LEARNING_RATE = 0.1
# Exploration probability: training starts at MAX_EPSILON and moves linearly
# toward MIN_EPSILON as the episode index grows.
MAX_EPSILON = 1
MIN_EPSILON = 0.5


In [5]:
def round_to_nearest_bin(state):
    """Map a continuous (x, velocity) state to discrete Q-table bin indices.

    Each component is rescaled from its [low, high] range onto
    [0, BIN_COUNT] and truncated to an integer. The result is then clamped
    to [0, BIN_COUNT - 1] so it is always a valid Q-table index.

    Parameters
    ----------
    state : sequence of two numbers
        The (x, velocity) pair.

    Returns
    -------
    numpy.ndarray
        Array ``[x_ind, vel_ind]`` of integer bin indices.
    """
    x, vel = state
    x_ind = int((x - x_low) / (x_high - x_low) * X_BIN_COUNT)
    vel_ind = int((vel - vel_low) / (vel_high - vel_low) * VEL_BIN_COUNT)

    # A value exactly on the upper bound scales to BIN_COUNT, which is one past
    # the last valid index; a value below the lower bound can go negative and
    # would silently wrap around. Clamp both ends into the valid range.
    x_ind = min(max(x_ind, 0), X_BIN_COUNT - 1)
    vel_ind = min(max(vel_ind, 0), VEL_BIN_COUNT - 1)

    return np.array([x_ind, vel_ind])

In [6]:
def update_q_table(q_table, quartet):
    """Apply one Q-learning update to ``q_table`` in place.

    ``quartet`` is ``(current_state, action, reward, next_state)`` where both
    states are binned index pairs. The entry for ``(current_state, action)``
    is blended with the target ``reward + DISCOUNT_FACTOR * max_a Q(next_state, a)``
    using ``LEARNING_RATE``. Nothing is returned.
    """
    state, action, reward, successor = quartet

    # Row of action values for the current state; writes go through to q_table.
    action_values = q_table[tuple(state)]
    previous = action_values[action]

    td_target = reward + DISCOUNT_FACTOR * np.max(q_table[tuple(successor)])

    action_values[action] = (1 - LEARNING_RATE) * previous + LEARNING_RATE * td_target
    
    

In [7]:
def decide_action(q_table, state, epsilon):
    """Pick an action epsilon-greedily.

    A uniform random number is drawn first; if it is below ``epsilon`` a
    random action index in ``[0, action_space_dim)`` is returned, otherwise
    the index of the largest Q-value for ``state`` is returned.
    """
    explore = np.random.uniform() < epsilon
    if not explore:
        # Greedy choice: best known action for this binned state.
        return np.argmax(q_table[tuple(state)])
    return np.random.randint(action_space_dim)

In [9]:
# Training run: tabular Q-learning with an epsilon that moves linearly from
# MAX_EPSILON toward MIN_EPSILON across the episodes.
episode_count = 15000
SHOW_EVERY = 3000  # render one episode out of every SHOW_EVERY

# Q-values start as uniform random numbers in [-2, 0).
q_table = np.random.uniform(
    low=-2, high=0, size=(X_BIN_COUNT, VEL_BIN_COUNT, action_space_dim)
)
epsilon = MAX_EPSILON

for episode in tnrange(episode_count, desc="Episode"):
    render_this_episode = episode % SHOW_EVERY == 0
    epsilon = MAX_EPSILON - (MAX_EPSILON - MIN_EPSILON) / episode_count * episode

    state = env.reset()
    done = False
    while not done:
        state_binned = tuple(round_to_nearest_bin(state))
        action = decide_action(q_table, state_binned, epsilon)
        new_state, reward, done, _ = env.step(action)
        new_state_binned = tuple(round_to_nearest_bin(new_state))

        if render_this_episode:
            env.render()

        if done:
            # Episode over: only a finish at the goal changes the table, by
            # pinning the Q-value of the final (state, action) to 0. Episodes
            # that end any other way get no update for their last transition.
            if new_state[0] >= env.goal_position:
                q_table[state_binned][action] = 0
        else:
            update_q_table(q_table, (state_binned, action, reward, new_state_binned))

        state = new_state

    if render_this_episode:
        env.close()
    
        
        


HBox(children=(IntProgress(value=0, description='Episode', max=15000, style=ProgressStyle(description_width='i…

In [15]:
# Evaluation rollout: epsilon is 0, so every action is the greedy one from the
# learned Q-table. `score` counts environment steps taken until `done`.
epsilon = 0
score = 0
state = env.reset()
done = False

while not done:
    binned = round_to_nearest_bin(state)
    action = decide_action(q_table, binned, epsilon)
    new_state, reward, done, _ = env.step(action)
    score += 1
    env.render()

    state = new_state

env.close()
print(f"Score: {score}")

Score: 158
