# Last lesson

- A wrapper for `CartPole-v0` that lets us choose the initial state.

In [1]:
import gym
import numpy as np

In [2]:
class InitMod(gym.Wrapper):
    """Wrapper that makes every episode start from a caller-chosen state.

    Args:
        env: The environment to wrap. Its unwrapped instance must expose a
            writable ``state`` attribute, which ``reset`` overwrites.
        initial_state: Array-like state installed at the start of each episode.
            A private copy is stored, so later changes to the caller's object
            do not affect future resets.
    """

    def __init__(self, env, initial_state):
        super().__init__(env)
        # np.array copies by default: the wrapper owns its own snapshot.
        self.initial_state = np.array(initial_state)

    def reset(self, **kwargs):
        """Reset the wrapped env, then force its state to ``initial_state``.

        Any keyword arguments are forwarded to the wrapped env's ``reset``.

        Returns:
            A fresh array equal to ``initial_state``. It is a distinct object
            from both the stored initial state and the env's internal state,
            so mutating the returned observation cannot corrupt either.
        """
        # The wrapped reset is still called (presumably it clears per-episode
        # bookkeeping inside the env); the state it picks is discarded.
        self.env.reset(**kwargs)
        self.unwrapped.state = np.array(self.initial_state)
        return np.array(self.initial_state)

# Initialization choice: pole to the right 

1. Cart position = 0
2. Cart velocity = 0.01
3. Pole angle = 0.15
4. The pole tip velocity = 0

In [6]:
import numpy as np
# State layout follows the list above:
# [cart position, cart velocity, pole angle, pole tip velocity]
pole_right_init_cartpole_env = InitMod(
    env=gym.make("CartPole-v0"),
    initial_state=np.array([0, 0.01, 0.15, 0]),
)

In [22]:
# Sanity check: reset() should hand back the initial state configured above.
observation = pole_right_init_cartpole_env.reset()
print(observation)
# Draw the starting configuration; as the last expression of the cell, the
# value returned by render() is what the notebook displays.
pole_right_init_cartpole_env.render()

[0.   0.01 0.15 0.  ]


True

In [23]:
# Close the environment now that the starting configuration has been rendered.
# NOTE(review): this same env object is reused by later cells — confirm it can
# be reset and rendered again after close().
pole_right_init_cartpole_env.close()

# Does changing the initial state have an impact on the total rewards?
- For the default initialization, we get an average total reward of around 22.5 when following the random policy.

In [24]:
def get_average_total_rewards_per_episode(env, policy_sampling_function, num_episodes):
    """Estimate the average total reward per episode of a policy in ``env``.

    Runs ``num_episodes`` complete episodes, choosing each action with
    ``policy_sampling_function``, and averages the summed rewards. Every step
    of the first episode is rendered. The environment is closed once the run
    ends, including when the policy or the environment raises.

    Args:
        env: Object exposing ``reset()``, ``step(action)``, ``render()`` and
            ``close()``; ``step`` must return a 4-tuple
            ``(observation, reward, done, info)``.
        policy_sampling_function: Callable mapping an observation to an action.
        num_episodes: Number of episodes to run; must be at least 1.

    Returns:
        The average total reward per episode. The same value is also printed.

    Raises:
        ValueError: If ``num_episodes`` is smaller than 1.
    """
    # Fail early with a clear message instead of dividing by zero at the end.
    if num_episodes < 1:
        raise ValueError(f"num_episodes must be at least 1, got {num_episodes}")

    total_rewards = 0
    try:
        for num_episode in range(num_episodes):
            observation = env.reset()
            done = False
            while not done:
                # Only the first episode is rendered, to keep long runs fast.
                if num_episode == 0:
                    env.render()
                action = policy_sampling_function(observation)
                observation, reward, done, _ = env.step(action)
                total_rewards += reward
    finally:
        # Always release the environment, even if an episode failed midway.
        env.close()

    average_total_rewards = total_rewards / num_episodes
    print(f"Average total rewards per episode for {num_episodes} episodes is {average_total_rewards}")
    return average_total_rewards

# Sampling function for the random policy

In [25]:
# Sampling function for the random policy
import random 

def get_action_random_policy(observation):
    """Sample an action uniformly from {0, 1}; ``observation`` is ignored."""
    return 0 if random.random() < 0.5 else 1

In [26]:
# Random policy, starting from the "pole to the right" state.
get_average_total_rewards_per_episode(
    env=pole_right_init_cartpole_env,
    policy_sampling_function=get_action_random_policy,
    num_episodes=1000,
)

Average total rewards per episode for 1000 episodes is 16.746


# Initialization choice: non zero pole velocity

1. Cart position = 0
2. Cart velocity = 0.01
3. Pole angle = 0.
4. The pole tip velocity = 2.0

In [27]:
import numpy as np
# State layout follows the list above:
# [cart position, cart velocity, pole angle, pole tip velocity]
non_zero_pole_velocity_init_cartpole_env = InitMod(
    env=gym.make("CartPole-v0"),
    initial_state=np.array([0, 0.01, 0., 2.0]),
)

# Average reward for the "non zero pole velocity" initial state

In [28]:
# Random policy, starting from the "non zero pole velocity" state.
get_average_total_rewards_per_episode(
    env=non_zero_pole_velocity_init_cartpole_env,
    policy_sampling_function=get_action_random_policy,
    num_episodes=1000,
)

Average total rewards per episode for 1000 episodes is 6.102


# The expected total rewards per episode, while following a policy, depends on the starting state!

# Value of a state, given a policy: the expected total reward per episode you get when you start from that state and follow the given policy

| State | State description | Policy | Value function |
| --- | --- | --- | --- |
| `[0, 0.01, 0.15, 0]` | Pole angled to the right | random | ≈ 16.7 |
| `[0., 0.01, 0., 2.0]` | Non zero pole tip velocity | random | ≈ 6.1 |

# Sampling function for the epsilon pole direction policy with $\epsilon=0.9$

In [29]:
import random

def get_action_epsilon_pole_direction_policy(observation, epsilon=0.9):
    """Sample an action from the epsilon pole direction policy.

    With probability ``epsilon`` the action comes from the random policy;
    otherwise it is chosen from the sign of the pole angle.

    Args:
        observation: Indexable state whose element 2 is the pole angle.
        epsilon: Probability of acting randomly, in ``[0, 1]``. Defaults to
            0.9, the value used throughout this notebook.

    Returns:
        1 when the pole angle is strictly positive (presumably "push right"
        in CartPole — confirm against the env docs), 0 otherwise, or a random
        action in the exploratory case.

    Raises:
        ValueError: If ``epsilon`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= epsilon <= 1.0:
        raise ValueError(f"epsilon must be in [0, 1], got {epsilon}")
    # Exploratory branch: ignore the observation and act at random.
    if random.random() < epsilon:
        return get_action_random_policy(observation)
    # Greedy branch: act according to the side the pole is leaning towards.
    return 1 if observation[2] > 0 else 0

# Average reward for "pole right" init and epsilon pole direction policy

In [30]:
# Epsilon pole direction policy, starting from the "pole to the right" state.
get_average_total_rewards_per_episode(
    env=pole_right_init_cartpole_env,
    policy_sampling_function=get_action_epsilon_pole_direction_policy,
    num_episodes=1000,
)

Average total rewards per episode for 1000 episodes is 19.728


# Average reward for "non zero pole velocity" init and epsilon pole direction policy

In [32]:
# Epsilon pole direction policy, starting from the "non zero pole velocity" state.
get_average_total_rewards_per_episode(
    env=non_zero_pole_velocity_init_cartpole_env,
    policy_sampling_function=get_action_epsilon_pole_direction_policy,
    num_episodes=1000,
)

Average total rewards per episode for 1000 episodes is 6.346


| State | State description | Policy | Value function |
| --- | --- | --- | --- |
| `[0, 0.01, 0.15, 0]` | Pole angled to the right | random | ≈ 16.7 |
| `[0, 0.01, 0.15, 0]` | Pole angled to the right | epsilon pole direction | ≈ 19.7 |
| `[0., 0.01, 0., 2.0]` | Non zero pole tip velocity | random | ≈ 6.1 |
| `[0., 0.01, 0., 2.0]` | Non zero pole tip velocity | epsilon pole direction | ≈ 6.3 |