# Last lesson

- A wrapper for `CartPole-v0` that lets us choose the initial state.

In [1]:
import gym
import numpy as np

In [2]:
class InitMod(gym.Wrapper):
    """Gym wrapper that makes every episode start from a fixed, caller-chosen state.

    After the wrapped environment has been reset, the ``state`` attribute of the
    base (unwrapped) environment is overwritten with ``initial_state``.

    Follows the classic Gym API in which ``reset()`` returns only the
    observation. NOTE(review): newer Gym/Gymnasium releases return
    ``(observation, info)`` -- confirm the installed version before upgrading.

    Args:
        env: The environment to wrap. Its unwrapped environment must expose a
            writable ``state`` attribute.
        initial_state: Array-like state every episode should start from.
    """

    def __init__(self, env, initial_state):
        super().__init__(env)
        # Keep a private copy so that later mutation of the caller's array
        # cannot silently change the start state of future episodes.
        self.initial_state = np.array(initial_state)

    def reset(self, **kwargs):
        """Reset the wrapped environment, then force the chosen initial state.

        Args:
            **kwargs: Forwarded unchanged to the wrapped environment's ``reset``.

        Returns:
            A fresh copy of the initial state, used as the first observation.
        """
        # Run the wrapped environment's own reset logic first; the observation
        # it produces is intentionally discarded.
        self.env.reset(**kwargs)
        # Hand the base environment its own copy so the stored initial state
        # is never aliased with the environment's working state.
        self.unwrapped.state = np.array(self.initial_state)
        # Return another copy so callers cannot mutate the environment's state
        # through the observation they receive.
        return np.array(self.unwrapped.state)

# Initialization choice: pole to the right 

1. Cart position = 0
2. Cart velocity = 0.01
3. Pole angle = 0.15
4. The pole tip velocity = 0

In [6]:
import numpy as np
# State layout follows the list above: cart position, cart velocity, pole angle, pole tip velocity.
pole_right_initial_state = np.array([0, 0.01, 0.15, 0])
pole_right_init_cartpole_env = InitMod(
    env=gym.make("CartPole-v0"),
    initial_state=pole_right_initial_state,
)

In [22]:
# Reset the wrapped environment and print the first observation so it can be
# compared against the initial state chosen above.
observation = pole_right_init_cartpole_env.reset()
print(observation)
# Render the environment; as the last expression in the cell, the return value
# of render() is what the notebook shows as the cell output.
pole_right_init_cartpole_env.render()

[0.   0.01 0.15 0.  ]


True

In [23]:
# Close the environment once the visual check is done.
# NOTE(review): presumably this tears down the window opened by render() -- confirm.
pole_right_init_cartpole_env.close()

# Does changing the initial state have an impact on the total rewards?
- For the default initialization, we get an average total reward of around 22.5 when following the random policy.

In [24]:
def get_average_total_rewards_per_episode(env, policy_sampling_function, num_episodes, render_first_episode=True):
    """Run a policy for several episodes and print the average total reward per episode.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``, ``render()`` and
            ``close()``. ``step`` must return a 4-tuple
            ``(observation, reward, done, info)``.
        policy_sampling_function: Callable that maps an observation to an action.
        num_episodes: Number of episodes to run; must be positive.
        render_first_episode: When true (the default), ``env.render()`` is called
            before every step of the first episode. Pass ``False`` to run
            without rendering.

    Raises:
        ValueError: If ``num_episodes`` is not positive (an average over zero
            episodes is undefined).
    """
    if num_episodes <= 0:
        raise ValueError(f"num_episodes must be a positive integer, got {num_episodes}")

    total_rewards = 0
    try:
        for num_episode in range(num_episodes):
            # Only the very first episode is ever rendered.
            should_render = render_first_episode and num_episode == 0
            observation = env.reset()
            done = False
            while not done:
                if should_render:
                    env.render()
                action = policy_sampling_function(observation)
                observation, reward, done, _ = env.step(action)
                total_rewards += reward
    finally:
        # Always close the environment, even when the policy or the
        # environment raises part-way through an episode.
        env.close()
    print(f"Average total rewards per episode for {num_episodes} episodes is {total_rewards / num_episodes}")

# Sampling function for the random policy

In [25]:
# Sampling function for the random policy
import random 

def get_action_random_policy(observation):
    """Sample an action from the uniform random policy.

    The observation is accepted only so that the function has the same
    signature as other policy sampling functions; it is never read.

    Returns:
        0 or 1, each with probability 0.5.
    """
    coin_flip = random.random()
    return 0 if coin_flip < 0.5 else 1

In [26]:
# Random policy, starting with the pole angled to the right.
get_average_total_rewards_per_episode(
    env=pole_right_init_cartpole_env,
    policy_sampling_function=get_action_random_policy,
    num_episodes=1000,
)

Average total rewards per episode for 1000 episodes is 16.746


# Initialization choice: non zero pole velocity

1. Cart position = 0
2. Cart velocity = 0.01
3. Pole angle = 0.
4. The pole tip velocity = 2.0

In [27]:
import numpy as np
# State layout follows the list above: cart position, cart velocity, pole angle, pole tip velocity.
non_zero_pole_velocity_initial_state = np.array([0, 0.01, 0., 2.0])
non_zero_pole_velocity_init_cartpole_env = InitMod(
    env=gym.make("CartPole-v0"),
    initial_state=non_zero_pole_velocity_initial_state,
)

# Average reward for the "non zero pole velocity" initial state

In [28]:
# Random policy, starting with a non-zero pole tip velocity.
get_average_total_rewards_per_episode(
    env=non_zero_pole_velocity_init_cartpole_env,
    policy_sampling_function=get_action_random_policy,
    num_episodes=1000,
)

Average total rewards per episode for 1000 episodes is 6.102


# The expected total rewards per episode, while following a policy, depends on the starting state!

# Value of a state, given a policy: The expected total rewards per episode you get starting from the state if you follow the given policy

| State | State description | Policy | Value function |
| --- | --- | --- | --- |
| `[0, 0.01, 0.15, 0]` | Pole angled to the right | random | 16 |
| `[0., 0.01, 0., 2.0]` | Non zero pole tip velocity | random | 6 |

# Sampling function for the epsilon pole direction policy with $\epsilon=0.9$

In [29]:
import random

def get_action_epsilon_pole_direction_policy(observation, epsilon=0.9):
    """Sample an action from the epsilon pole direction policy.

    With probability ``epsilon`` the action is drawn from the random policy;
    otherwise the action is chosen from the sign of the pole angle.

    Args:
        observation: Indexable observation whose element 2 is the pole angle
            (the third entry of the state, as listed in this notebook).
        epsilon: Probability of acting randomly, in ``[0, 1]``. Defaults to
            0.9, the value used throughout this lesson.

    Returns:
        1 if the pole angle is positive, 0 otherwise -- unless the random
        branch is taken, in which case the random policy's action is returned.
        NOTE(review): presumably action 1 pushes the cart to the right and
        action 0 to the left -- confirm against the CartPole documentation.

    Raises:
        ValueError: If ``epsilon`` is not within ``[0, 1]``.
    """
    if not 0.0 <= epsilon <= 1.0:
        raise ValueError(f"epsilon must be between 0 and 1, got {epsilon}")
    if random.random() < epsilon:
        return get_action_random_policy(observation)
    # Greedy branch: act according to the sign of the pole angle.
    return 1 if observation[2] > 0 else 0

# Average reward for "pole right" init and epsilon pole direction policy

In [30]:
# Epsilon pole direction policy, starting with the pole angled to the right.
get_average_total_rewards_per_episode(
    env=pole_right_init_cartpole_env,
    policy_sampling_function=get_action_epsilon_pole_direction_policy,
    num_episodes=1000,
)

Average total rewards per episode for 1000 episodes is 19.728


# Average reward for "non zero pole velocity" init and epsilon pole direction policy

In [32]:
# Epsilon pole direction policy, starting with a non-zero pole tip velocity.
get_average_total_rewards_per_episode(
    env=non_zero_pole_velocity_init_cartpole_env,
    policy_sampling_function=get_action_epsilon_pole_direction_policy,
    num_episodes=1000,
)

Average total rewards per episode for 1000 episodes is 6.346


| State | State description | Policy | Value function |
| --- | --- | --- | --- |
| `[0, 0.01, 0.15, 0]` | Pole angled to the right | random | 16 |
| `[0, 0.01, 0.15, 0]` | Pole angled to the right | epsilon pole direction | 19 |
| `[0., 0.01, 0., 2.0]` | Non zero pole tip velocity | random | 6 |
| `[0., 0.01, 0., 2.0]` | Non zero pole tip velocity | epsilon pole direction | 6 |

# Three important points about the value function

- Because of the *memoryless property of an MDP*, the value of a state depends *only on the state and the policy*. It doesn't matter if we encounter the state in the first time step (initialization) or we encounter it in the 50th step. The expected total reward that you get afterwards while following the policy remains the same.

- It is clear from the examples that some states have more value than others, given a policy. From the agent's point of view, the value function is an indication of the "goodness" of a state.

- The value function is an expected value. Since the policy may have probabilistic components, and MDPs may have non-trivial state transition probabilities, the total rewards that you get in different episodes starting from a state will be different. These values (from just one episode) are called *value function samples*. When you repeat over many episodes, the average of the samples starts converging towards a certain number. This is the *expected value* and is the value function (value of a state).

In [33]:
# Repeat the same evaluation with growing episode counts to watch the sample
# average settle down.
for episode_count in (1, 10, 100, 1000, 10000):
    get_average_total_rewards_per_episode(
        env=pole_right_init_cartpole_env,
        policy_sampling_function=get_action_random_policy,
        num_episodes=episode_count,
    )

Average total rewards per episode for 1 episodes is 13.0
Average total rewards per episode for 10 episodes is 15.8
Average total rewards per episode for 100 episodes is 15.63
Average total rewards per episode for 1000 episodes is 15.892
Average total rewards per episode for 10000 episodes is 16.4143


# Calculating value samples of all states encountered in an episode

In [35]:
# Record one full episode under the random policy: for every time step, store
# the observation the action was chosen from and the reward that step earned.
episode_history = []

observation = pole_right_init_cartpole_env.reset()
done = False
while not done:
    action = get_action_random_policy(observation)
    next_observation, reward, done, _ = pole_right_init_cartpole_env.step(action)
    # Pair the reward with the observation seen *before* the step was taken.
    episode_history.append({"observation": observation, "reward": reward})
    observation = next_observation
pole_right_init_cartpole_env.close()

In [36]:
# Inspect the recorded steps, in the order they were visited.
episode_history

[{'observation': array([0.  , 0.01, 0.15, 0.  ]), 'reward': 1.0},
 {'observation': array([ 2.00000000e-04,  2.02687997e-01,  1.50000000e-01, -2.41851667e-01]),
  'reward': 1.0},
 {'observation': array([0.00425376, 0.00577724, 0.14516297, 0.09413264]),
  'reward': 1.0},
 {'observation': array([ 0.0043693 , -0.19109477,  0.14704562,  0.42886288]),
  'reward': 1.0},
 {'observation': array([0.00054741, 0.0016721 , 0.15562288, 0.1859088 ]),
  'reward': 1.0},
 {'observation': array([ 0.00058085,  0.19426474,  0.15934105, -0.05392031]),
  'reward': 1.0},
 {'observation': array([ 0.00446615,  0.3867858 ,  0.15826265, -0.29239534]),
  'reward': 1.0},
 {'observation': array([0.01220186, 0.18980299, 0.15241474, 0.04572142]),
  'reward': 1.0},
 {'observation': array([ 0.01599792, -0.00713863,  0.15332917,  0.38234589]),
  'reward': 1.0},
 {'observation': array([0.01585515, 0.18551163, 0.16097609, 0.14166309]),
  'reward': 1.0},
 {'observation': array([ 0.01956538, -0.0115058 ,  0.16380935,  0.4804

In [37]:
# Number of time steps recorded in the episode (one entry was appended per step).
len(episode_history)

17

In [40]:
# Walk the episode from its last step to its first, keeping a running sum of
# rewards: at each step that sum is the total reward collected from that state
# until the end of the episode, i.e. one value sample for the state.
value_samples_random_policy = {}
backward_reward_sum = 0
for step in episode_history[::-1]:
    backward_reward_sum = backward_reward_sum + step["reward"]
    # Observations are converted to tuples so they can be used as dictionary keys.
    state_key = tuple(step["observation"])
    value_samples_random_policy[state_key] = backward_reward_sum

In [41]:
# Show each state next to its value sample (states appear last-visited first,
# the order in which they were inserted).
for state, value_sample in value_samples_random_policy.items():
    print(state, value_sample)

(0.03692988041286534, 0.36257794775102353, 0.20910282630949142, 0.2507381259917327) 1.0
(0.033514847600599786, 0.17075164061327763, 0.19961349397149078, 0.4744666169000328) 2.0
(0.02615343217990403, 0.3680707710347878, 0.19708049310200385, 0.12665004347434686) 3.0
(0.02263170024120346, 0.17608659693502837, 0.1900021189135659, 0.3539187094218975) 4.0
(0.022954683494889327, -0.016149162684293283, 0.17829089540086557, 0.5855611756350163) 5.0
(0.019335265948893687, 0.18097087729978212, 0.17341917356648776, 0.24358609171889067) 6.0
(0.01956538185397864, -0.011505795254247803, 0.16380934789594478, 0.48049128352714854) 7.0
(0.015855149340997812, 0.18551162564904142, 0.1609760861940366, 0.14166308509540823) 8.0
(0.01599792199288572, -0.0071386325943954, 0.15332916839442762, 0.3823458899804495) 9.0
(0.01220186214899767, 0.1898029921944025, 0.15241473990575496, 0.04572142443363275) 10.0
(0.004466146217074472, 0.38678579659615986, 0.1582626467659552, -0.292395343010012) 11.0
(0.000580851436168383