In [3]:
import gym
import pylab
import numpy as np

# Inverse Reinforcement Learning

The Mountain Car problem is "on a one-dimensional track, positioned between two “mountains”. The goal is to drive up the mountain on the right; however, the car’s engine is not strong enough to scale the mountain in a single pass. Therefore, the only way to succeed is to drive back and forth to build up momentum."

<img src='https://miro.medium.com/max/1104/1*JjBfoFrKCoBxlraVZaEshw.jpeg'>

The car’s state, at any point in time, is given by a vector containing its horizontal position and velocity. The car commences each episode stationary, at the bottom of the valley between the hills (at position, x, approximately -0.5), and the episode ends when either the car reaches the flag (position > 0.5) or after 200 moves.

Your state, s, is a 2-dim vector (position, velocity), or (x, v), which is roughly (-0.5, 0) in the beginning: the velocity is 0 and x ≈ -0.5 is the valley between the two mountains, which is left of center (the track extends about 1.1 in front of you and about 0.7 behind you). Only the x-axis is part of the state; the elevation (y-axis) and gravity are implicit.

The velocity is positive if you are moving forward to the right, and negative if you are moving to the left.

action 0 is run the car in reverse towards the left mountain located at `state[0]=-1.2`, action 1 is do nothing,
action 2 is run the car forward towards `state[0]=0.6`


In [4]:
# Action codes passed to env.step() for MountainCar-v0
Push_Left, No_Push, Push_Right = 0, 1, 2

In [5]:
# Create the environment; reset() puts the car near the valley floor with zero velocity
env = gym.make('MountainCar-v0')
env.reset() # e.g. array([-0.44943939,  0.        ])

# Hold one fixed action for 100 steps and print every 5th transition to see what happens
const_action = No_Push
for t in range(100):
    update = env.step(const_action)
    if t % 5:
        continue
    print(update)

# Observation: if you do nothing you just swing side to side; position stays in (-0.64,-0.4)
# and velocity stays in (-0.01,+0.01)

(array([-4.76850926e-01, -3.52075133e-04]), -1.0, False, {})
(array([-0.48371095, -0.00202197]), -1.0, False, {})
(array([-0.49792094, -0.00332095]), -1.0, False, {})
(array([-0.51686825, -0.00400851]), -1.0, False, {})
(array([-0.53705828, -0.00395614]), -1.0, False, {})
(array([-0.5547636 , -0.00317364]), -1.0, False, {})
(array([-0.56672165, -0.00180714]), -1.0, False, {})
(array([-5.70736411e-01, -1.09287782e-04]), -1.0, False, {})
(array([-0.56607247,  0.00160857]), -1.0, False, {})
(array([-0.55358426,  0.00303159]), -1.0, False, {})
(array([-0.53556574,  0.003897  ]), -1.0, False, {})
(array([-0.51533794,  0.00404339]), -1.0, False, {})
(array([-0.49663536,  0.00344328]), -1.0, False, {})
(array([-0.48290684,  0.00220896]), -1.0, False, {})
(array([-0.47667582,  0.00056926]), -1.0, False, {})
(array([-0.4790842 , -0.00117466]), -1.0, False, {})
(array([-0.48969092, -0.0027034 ]), -1.0, False, {})
(array([-0.50654931, -0.0037354 ]), -1.0, False, {})
(array([-0.52655467, -0.004078

In [6]:
# Fresh environment; reset() starts the car near the valley floor with zero velocity
env = gym.make('MountainCar-v0')
env.reset() # e.g. array([-0.44943939,  0.        ])

# Experiment: constantly do nothing for 100 steps, printing every 5th transition
const_action = No_Push
for t in range(100):
    update = env.step(const_action)
    if not t % 5:
        print(update)

# Observation: if you do nothing you just swing side to side; position stays in (-0.64,-0.4)
# and velocity stays in (-0.01,+0.01)

(array([-5.53310824e-01,  2.24220318e-04]), -1.0, False, {})
(array([-0.54894264,  0.00128738]), -1.0, False, {})
(array([-0.53989785,  0.00211342]), -1.0, False, {})
(array([-0.52784398,  0.0025496 ]), -1.0, False, {})
(array([-0.51500612,  0.00251499]), -1.0, False, {})
(array([-0.50375501,  0.002016  ]), -1.0, False, {})
(array([-0.49616677,  0.00114521]), -1.0, False, {})
(array([-4.93639728e-01,  6.35227098e-05]), -1.0, False, {})
(array([-0.49663907, -0.00102986]), -1.0, False, {})
(array([-0.50461262, -0.0019336 ]), -1.0, False, {})
(array([-0.51609091, -0.00248081]), -1.0, False, {})
(array([-0.52895565, -0.00256999]), -1.0, False, {})
(array([-0.54083112, -0.00218458]), -1.0, False, {})
(array([-0.54952534, -0.00139613]), -1.0, False, {})
(array([-5.53435615e-01, -3.50484997e-04]), -1.0, False, {})
(array([-0.551842  ,  0.00075966]), -1.0, False, {})
(array([-0.54503784,  0.00172997]), -1.0, False, {})
(array([-0.53427667,  0.0023814 ]), -1.0, False, {})
(array([-0.52154378,  

In [7]:
# Fresh environment; reset() starts the car near the valley floor with zero velocity
env = gym.make('MountainCar-v0')
env.reset() # e.g. array([-0.44943939,  0.        ])

# Experiment: constantly push right for 100 steps, printing every 5th transition
const_action = Push_Right
for t in range(100):
    update = env.step(const_action)
    if t % 5:
        continue
    print(update)

# Observation: if you constantly go right, your position stays in (-0.52,-0.24)
# and your velocity stays in (-0.011,+0.011)

(array([-4.02942532e-01,  1.15520119e-04]), -1.0, False, {})
(array([-0.40068835,  0.00066509]), -1.0, False, {})
(array([-0.39599541,  0.0011    ]), -1.0, False, {})
(array([-0.38967038,  0.00134621]), -1.0, False, {})
(array([-0.38279361,  0.00136316]), -1.0, False, {})
(array([-0.37653034,  0.00114992]), -1.0, False, {})
(array([-0.37193303,  0.00074409]), -1.0, False, {})
(array([-3.69768642e-01,  2.14460993e-04]), -1.0, False, {})
(array([-3.70396588e-01, -3.50736679e-04]), -1.0, False, {})
(array([-0.3737127 , -0.00085774]), -1.0, False, {})
(array([-0.37916541, -0.00122184]), -1.0, False, {})
(array([-0.38584234, -0.00138091]), -1.0, False, {})
(array([-0.39261739, -0.00130631]), -1.0, False, {})
(array([-0.39833835, -0.00100881]), -1.0, False, {})
(array([-0.40202502, -0.000538  ]), -1.0, False, {})
(array([-4.03042392e-01,  2.56395926e-05]), -1.0, False, {})
(array([-0.40121477,  0.00058485]), -1.0, False, {})
(array([-0.39685764,  0.00104317]), -1.0, False, {})
(array([-0.390

In [8]:
# Fresh environment; reset() starts the car near the valley floor with zero velocity
env = gym.make('MountainCar-v0')
env.reset() # e.g. array([-0.44943939,  0.        ])

# Experiment: constantly push left for 100 steps, printing every 5th transition
const_action = Push_Left
for t in range(100):
    update = env.step(const_action)
    if not t % 5:
        print(update)

# Observation: if you constantly go left, your position stays in (-0.85,-0.48)
# and your velocity stays in (-0.015,+0.011)

(array([-0.52770694, -0.00097651]), -1.0, False, {})
(array([-0.54672927, -0.0056059 ]), -1.0, False, {})
(array([-0.58611381, -0.00920342]), -1.0, False, {})
(array([-0.63866887, -0.0111318 ]), -1.0, False, {})
(array([-0.69506712, -0.01111902]), -1.0, False, {})
(array([-0.74585421, -0.00929901]), -1.0, False, {})
(array([-0.783204  , -0.00610299]), -1.0, False, {})
(array([-0.80187871, -0.0020843 ]), -1.0, False, {})
(array([-0.7994467 ,  0.00219923]), -1.0, False, {})
(array([-0.77621717,  0.00620306]), -1.0, False, {})
(array([-0.73525548,  0.00936888]), -1.0, False, {})
(array([-0.68242697,  0.01114462]), -1.0, False, {})
(array([-0.62604412,  0.01110483]), -1.0, False, {})
(array([-0.57570754,  0.00912572]), -1.0, False, {})
(array([-0.54042938,  0.00549046]), -1.0, False, {})
(array([-0.52667313,  0.00084439]), -1.0, False, {})
(array([-0.53697607, -0.00395756]), -1.0, False, {})
(array([-0.56943731, -0.0080297 ]), -1.0, False, {})
(array([-0.61809934, -0.01063427]), -1.0, Fals

In each update, the tuple returned by `env.step`, e.g. (array([-0.66511328, -0.01531442]), -1.0, False, {}),
 contains (state, reward, end_of_episode_bool, meta_data).
The meta_data is empty for this environment, and there is a reward of -1 for every timestep in which
 you have not reached the goal yet.

In [10]:
# Load the 20 expert demonstrations from disk.
# NOTE(review): this is a hard-coded absolute path; it only resolves on the original machine.
expert_demo = np.load('/shared/Carson/RL/lets-do-irl/mountaincar/maxent/expert_demo/expert_demo.npy')

# Axes: (number of demonstrations, length of demonstrations, states and actions of demonstrations)
print(expert_demo.shape)

# Step 60 of the first demonstration, plus the shape of a single step.
# As that step shows, the best strategy is to first accelerate backwards into the < -0.8 range
# in order to gain speed going right into the valley.
print(expert_demo[0][60], expert_demo[0][0].shape)

(20, 130, 3)
[-0.90691623 -0.02983074  0.        ] (3,)


In [11]:
# To discretize the state space, the range of possible continuous positions x and the
# range of possible continuous velocities v are each split into `one_feature` (20) bins.

one_feature = 20 # number of bins per state feature

env_low = env.observation_space.low     # lower bound of each state feature
env_high = env.observation_space.high   # upper bound of each state feature
env_distance = (env_high - env_low) / one_feature  # width of one bin, per feature

# Bin width times the number of bins recovers the full range of each feature.
# Use one_feature rather than a literal 20 so the check stays right if the bin count changes.
print(env_low, env_high, env_distance, env_distance[0]*one_feature, env_distance[1]*one_feature)
# the range of x is 1.8 and the range of velocity is 0.14

[-1.2  -0.07] [0.6  0.07] [0.09  0.007] 1.8000000715255737 0.14000000432133675


In [13]:
'''
Using this bin size we discretize the expert demonstrations.
The difference between expert_demo and demonstrations is that
demonstrations consists of discrete integers instead of continuous values
for its states.



Here n is the sample (demonstration) index and t is the timestep index.
'''
def idx_state(state, low=None, distance=None, n_bins=None):
    '''
    Convert a continuous state vector (position, velocity) into a single discrete index.

    Each feature is assigned a bin index and the pair is flattened as
    state_idx = position_idx + velocity_idx * n_bins, which makes sure that
    (position_idx, velocity_idx) = (2,3) and (3,2) map to different integers.
    The first n_bins values of state_idx cover position_idx = 0 .. n_bins-1 with
    velocity_idx = 0, and so on.

    Bin indices are clamped to [0, n_bins-1]. Without the clamp, a state sitting on (or
    beyond) the upper bound yields bin index n_bins, which either aliases to a cell in the
    next velocity row or produces a state_idx outside 0 .. n_bins**2 - 1.

    Parameters
        state    : sequence whose first two entries are position and velocity
                   (any further entries, e.g. an action, are ignored)
        low      : per-feature lower bounds; defaults to the global env_low
        distance : per-feature bin widths; defaults to the global env_distance
        n_bins   : number of bins per feature; defaults to the global one_feature

    Returns
        int state index in 0 .. n_bins**2 - 1
    '''
    # Fall back to the notebook-level discretization settings when not given explicitly
    if low is None:
        low = env_low
    if distance is None:
        distance = env_distance
    if n_bins is None:
        n_bins = one_feature

    last_bin = n_bins - 1
    position_idx = int((state[0] - low[0]) / distance[0])
    velocity_idx = int((state[1] - low[1]) / distance[1])

    # Keep boundary / out-of-range values inside the valid bin range
    position_idx = min(max(position_idx, 0), last_bin)
    velocity_idx = min(max(velocity_idx, 0), last_bin)

    state_idx = position_idx + velocity_idx * n_bins
    return state_idx

# One entry per (demonstration n, timestep t): column 0 receives the discrete state index,
# column 1 the expert's action; column 2 is never written and stays at zero.
demonstrations = np.zeros((len(expert_demo), len(expert_demo[0]), 3))

for n, trajectory in enumerate(expert_demo):
    for t, transition in enumerate(trajectory):
        state_idx = idx_state(transition)
        demonstrations[n, t, 0] = state_idx
        demonstrations[n, t, 1] = transition[2]

Q-Learning is a “model-free, off-policy” RL algorithm.

“model-free” because we are not trying to build a pre-determined model of our environment (which may or may not be accurate)

“off-policy” because the behavior we use to interact with (or explore) the environment may be unrelated to the action we believe to be optimal at the time of taking the action.

You might think of Q-learning as updating a table where each row is a different state and each column is an action that can be taken in that state, and where they intersect is the Q-value for that state-action pair. The Q-table maps each (state, action) pair to its Q-value (the expected sum of discounted future rewards when taking action a in state s and behaving optimally thereafter).

It doesnt have to be a square table like this though, Q just has to map some state or state-action to some Q-value

In the above example you can see that the pattern is that we observe some state, s, take an action a, and receive a new state s’, then repeat. 

The update rule is the weighted average of the old Q(s,a) value and the Q value implied by the new observation. That is, the sum of (i) the immediate reward and (ii) the expected discounted reward received from the new state onwards, assuming you always choose the optimal action.

Q’(s,a) = (1 — alpha) * Q(s, a) + alpha *(r + gamma * Q(s’, argmax a’ : Q(s’, a’)))

where argmax a’ : Q(s’, a’) is the action a’ that has the highest current Q-value in the table row for state s’

This equation for update is equivalently written 

𝑄(𝑠,𝑎)←𝑄(𝑠,𝑎)+𝛼(𝑟+𝛾*max_𝑎′𝑄(𝑠′,𝑎′)−𝑄(𝑠,𝑎)) 


In [14]:
def update_q_table(state, action, reward, next_state):
    '''
    Apply one Q-learning update to the global q_table, in place:
    Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))

    state, next_state : row indices into q_table
    action            : column index of the action taken in `state`
    reward            : reward received for that transition

    Reads the globals gamma (discount) and q_learning_rate (alpha); returns nothing.
    '''
    current_estimate = q_table[state][action]
    # Bootstrapped target: immediate reward plus the discounted best value of the next state
    td_target = reward + gamma * max(q_table[next_state])
    td_error = td_target - current_estimate
    q_table[state][action] = current_estimate + q_learning_rate * td_error