# The Bellman expectation equation for value functions

$V_{\pi}(s(t)) = \mathbf{E}[r(t) + \gamma V_{\pi}(s(t+1)) | s(t)]$

# Verifying the Bellman expectation equation in `CartPole-v0`

- Use the initial state `[0., 0.01, 0.15, 0.]`, i.e. the pole is tilted to the right at initialization (the third component, `0.15`, is the one the policy below reads as the pole direction).
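- Use the epsilon pole direction policy with epsilon 0.9: with probability 0.9 a uniformly random action is taken; otherwise action 1 is taken if the third state component is positive, and action 0 if it is not.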
- Use gamma 0.8.
- Verify the Bellman expectation equation for the initial state `[0., 0.01, 0.15, 0.]` by relating it to the value functions of the next possible states.

In [1]:
import random

import gym
import numpy as np


class InitMod(gym.Wrapper):
    """Wrapper class to change the initial state in CartPole-v0.

    Every ``reset()`` first resets the wrapped environment and then overwrites
    the ``state`` attribute of the unwrapped environment with a fixed state.

    Args:
        env: Environment to wrap. Its unwrapped environment is assigned a
            ``state`` attribute on every reset.
        initial_state: Array-like state to install after each reset.
    """
    def __init__(self, env, initial_state):
        super().__init__(env)
        # Keep a private copy so later changes to the caller's array cannot
        # silently change the configured start state.
        self.initial_state = np.array(initial_state)

    def reset(self, **kwargs):
        """Reset the wrapped env and force its state to ``initial_state``.

        Args:
            **kwargs: Forwarded unchanged to the wrapped environment's ``reset``.

        Returns:
            A fresh array holding the initial state, used as the first observation.
        """
        # The observation produced by the wrapped reset is discarded on purpose:
        # the state it describes is replaced on the next line.
        self.env.reset(**kwargs)
        # Give the env its own copy so neither the env state nor the returned
        # observation aliases ``self.initial_state``.
        self.unwrapped.state = self.initial_state.copy()
        return self.initial_state.copy()
    

def get_action_random(observation):
    """Draw an action for the random policy.

    ``observation`` is accepted but not read. One call to ``random.random()``
    decides the result: 0 when the draw is below 0.5, otherwise 1.
    """
    coin_flip = random.random()
    return 0 if coin_flip < 0.5 else 1
    
    
def get_action_epsilon_pole_direction_policy(observation, epsilon=0.9):
    """Sampling function for the epsilon pole direction policy.

    With probability ``epsilon`` the action is delegated to
    ``get_action_random``; otherwise it is chosen from the sign of
    ``observation[2]``.

    Args:
        observation: Indexable observation; only ``observation[2]`` is read.
        epsilon: Probability of taking the random branch. Defaults to 0.9,
            the value that used to be hard-coded here.

    Returns:
        The result of ``get_action_random(observation)`` on the random branch;
        otherwise 1 if ``observation[2] > 0`` and 0 if not.

    Raises:
        ValueError: If ``epsilon`` is not within [0, 1].
    """
    if not 0.0 <= epsilon <= 1.0:
        raise ValueError(f"epsilon must be within [0, 1], got {epsilon!r}")
    # One uniform draw decides between the random and the pole-direction branch.
    if random.random() < epsilon:
        return get_action_random(observation)
    return 1 if observation[2] > 0 else 0


class Value():
    """Helper for computing the expected value of states from sampled episodes.

    For every state key it keeps the number of recorded visits and the running
    average of the discounted returns observed from that state. ``update()``
    folds the data of one more episode into these averages.

    Args:
        gamma: Discount factor applied to future rewards.
        visit_number: Optional dict mapping state key -> visit count to start
            from. A new empty dict is created when omitted.
        value_average: Optional dict mapping state key -> average return to
            start from. A new empty dict is created when omitted.
    """
    def __init__(self, gamma, visit_number=None, value_average=None):
        self.gamma = gamma
        # ``None`` sentinels instead of ``{}`` defaults: a literal dict default is
        # created once and would be shared by every instance built with defaults.
        self.visit_number = {} if visit_number is None else visit_number
        self.value_average = {} if value_average is None else value_average

    def update(self, episode_history):
        """Update the per-state averages with the returns of one episode.

        Args:
            episode_history: Sequence of dicts in time order, each with an
                ``"observation"`` entry (converted to a tuple and used as the
                state key) and a ``"reward"`` entry.
        """
        backward_reward_sum = 0
        # Walk the episode backwards so the discounted return of each step can be
        # built from the return of the step after it.
        for step in reversed(episode_history):
            backward_reward_sum = self.gamma * backward_reward_sum + step["reward"]
            key = tuple(step["observation"])
            visit_number = self.visit_number.get(key, 0)
            if visit_number == 0:
                # First sample for this key: there is no previous average to blend with.
                self.value_average[key] = backward_reward_sum
            else:
                self.value_average[key] = (visit_number * self.value_average[key] + backward_reward_sum) / (visit_number + 1)
            self.visit_number[key] = visit_number + 1

            
# create the wrapped env where the pole is tilted to the right in the initial state
pole_right_init_cartpole_env = InitMod(env=gym.make("CartPole-v0"), initial_state=np.array([0, 0.01, 0.15, 0]))


# seed Python's `random` module, which the policy sampling functions draw from,
# so the sampled actions are repeatable when the notebook is re-run
SEED = 0
random.seed(SEED)

# compute expected value of states by going through 100000 episodes
num_episodes = 100000
gamma = 0.8

value_info = Value(gamma=gamma)
for num_episode in range(num_episodes):
    # roll out one full episode, recording (observation, reward) for every step taken
    episode_history = []
    observation = pole_right_init_cartpole_env.reset()
    done = False
    while not done:
        action = get_action_epsilon_pole_direction_policy(observation)
        next_observation, reward, done, _ = pole_right_init_cartpole_env.step(action)
        # the reward is stored with the observation the action was taken from
        episode_history.append({"observation": observation, "reward": reward})
        observation = next_observation
    # fold the finished episode's discounted returns into the running averages
    value_info.update(episode_history)
# NOTE(review): later cells call reset()/step() on this env again after this close();
# the recorded outputs suggest that works for CartPole, but confirm it is intended.
pole_right_init_cartpole_env.close()

state = (0., 0.01, 0.15, 0.)
print(f"The value of the state {state}, given the epsilon pole direction policy with epsilon 0.9 is {value_info.value_average[state]}")

The value of the state (0.0, 0.01, 0.15, 0.0), given the epsilon pole direction policy with epsilon 0.9 is 4.580812449061228


In [2]:
# Start again from the fixed initial state and take action 0 once.
# Under the policy above, action 0 has probability 0.9 * 0.5 = 0.45 in this state.
initial_observation = pole_right_init_cartpole_env.reset()
step_result_action_0 = pole_right_init_cartpole_env.step(0)
next_observation_action_0, reward, done, _ = step_result_action_0
print(next_observation_action_0)

[ 2.00000000e-04 -1.86919275e-01  1.50000000e-01  3.35996937e-01]


In [3]:
# Start again from the fixed initial state and take action 1 once.
# Under the policy above, action 1 has probability 0.9 * 0.5 + 0.1 = 0.55 in this state,
# because the non-random branch picks action 1 when the third state component is positive.
initial_observation = pole_right_init_cartpole_env.reset()
step_result_action_1 = pole_right_init_cartpole_env.step(1)
next_observation_action_1, reward, done, _ = step_result_action_1
print(next_observation_action_1)

[ 2.00000000e-04  2.02687997e-01  1.50000000e-01 -2.41851667e-01]


# The Bellman expectation equation for value functions

$V_{\pi}(s(t)) = \mathbf{E}[r(t) + \gamma V_{\pi}(s(t+1)) | s(t)]$

In [6]:
# Right-hand side of the Bellman expectation equation for the initial state:
# immediate reward 1 plus the discounted, probability-weighted value of the two successor states.
value_after_action_0 = value_info.value_average[tuple(next_observation_action_0)]
value_after_action_1 = value_info.value_average[tuple(next_observation_action_1)]
expected_next_value = value_after_action_0 * 0.45 + value_after_action_1 * 0.55
1 + gamma * expected_next_value

4.580674388322798