# Last lesson 

- The agent exerts a lot of control over how an episode will unfold by *choosing* the action.

# A particular way of choosing actions, given an environment state, is called a POLICY

In [1]:
def policy(observation):
    """
    Template showing the shape of a policy function.

    This is a placeholder with no implementation: it always returns None.
    A real policy maps an environment state to a probability for each action.

    :param observation: A gym observation of an environment state
    :return: Intended to be the probability of taking different actions in
        that environment state, e.g. {0: 0.8, 1: 0.2}; this stub returns None
    """
    return None

In [2]:
def random_policy(observation):
    """
    Policy that gives both actions the same probability in every state.

    :param observation: A gym observation of an environment state (ignored)
    :return: The action probabilities {0: 0.5, 1: 0.5}
    """
    equal_share = 0.5
    return {action: equal_share for action in (0, 1)}

# `env.step()` demands a concrete action, sampled from the probability distribution given by the policy. So we need a sampling function for every policy.

In [None]:
import gym 
# Play a single CartPole episode with randomly sampled actions, rendering each step.
env = gym.make("CartPole-v0")
try:
    observation = env.reset()
    done = False
    while not done:
        env.render()
        # `env.step()` demands a concrete action, not a probability distribution.
        observation, reward, done, _ = env.step(env.action_space.sample())
finally:
    # Always close the environment, even if reset/render/step raises,
    # so it is not left open after a failed cell run.
    env.close()

In [3]:
# Sampling function for the random policy
import random 

def get_action_random_policy(observation):
    """
    Sample a concrete action from the random policy.

    Draws one number from `random.random()` per call and picks action 0 when
    the draw is below 0.5, otherwise action 1.

    :param observation: A gym observation of an environment state (ignored)
    :return: The sampled action, 0 or 1
    """
    draw = random.random()
    return 0 if draw < 0.5 else 1

# A function that runs a given number of episodes while the agent follows a particular policy, and returns the average total reward per episode

In [4]:
import gym 

def get_average_total_rewards_per_episode(policy_sampling_function, num_episodes):
    """
    Run CartPole-v0 episodes under a policy and average the total reward.

    Each episode is played until the environment reports `done`. Only the
    first episode is rendered.

    :param policy_sampling_function: Callable that takes an observation and
        returns the concrete action to pass to `env.step()`
    :param num_episodes: Number of episodes to run; must be at least 1
    :return: The sum of all rewards over all episodes divided by num_episodes
    :raises ValueError: If num_episodes is less than 1
    """
    if num_episodes < 1:
        # Without at least one episode the average is undefined; fail early
        # (before creating an environment) instead of dividing by zero or
        # returning a meaningless value for a negative count.
        raise ValueError(
            "num_episodes must be at least 1, got {!r}".format(num_episodes)
        )

    env = gym.make("CartPole-v0")
    total_rewards = 0
    try:
        for num_episode in range(num_episodes):
            observation = env.reset()
            done = False
            while not done:
                if num_episode == 0:
                    env.render()
                action = policy_sampling_function(observation)
                observation, reward, done, _ = env.step(action)
                total_rewards += reward
    finally:
        # Close the environment even when the policy or the environment raises.
        env.close()
    return total_rewards / num_episodes

In [5]:
# Average total reward per episode over 1000 episodes of the random policy.
get_average_total_rewards_per_episode(
    policy_sampling_function=get_action_random_policy,
    num_episodes=1000,
)

22.177

# "Pole direction policy"

In [6]:
def pole_direction_policy(observation):
    """
    Deterministic policy driven by the sign of observation[2].

    Puts all probability on action 1 when observation[2] is positive and on
    action 0 otherwise (including when it is exactly zero).
    NOTE(review): observation[2] is presumably the pole angle in CartPole's
    observation vector; confirm against the Gym environment documentation.

    :param observation: A gym observation of an environment state
    :return: Action probabilities, either {0: 0, 1: 1} or {0: 1, 1: 0}
    """
    choose_one = 1 if observation[2] > 0 else 0
    return {0: 1 - choose_one, 1: choose_one}

In [7]:
# Sampling function
def get_action_pole_direction_policy(observation):
    """
    Return the concrete action chosen by the pole direction policy.

    The policy is deterministic, so no random draw is needed: the action is 1
    when observation[2] is positive and 0 otherwise.

    :param observation: A gym observation of an environment state
    :return: The action to take, 0 or 1
    """
    is_positive = observation[2] > 0
    return 1 if is_positive else 0

In [8]:
# Average total reward per episode over 1000 episodes of the pole direction policy.
get_average_total_rewards_per_episode(
    policy_sampling_function=get_action_pole_direction_policy,
    num_episodes=1000,
)

42.335

# Different policies lead to different total rewards per episode

- "Pole direction policy" ~ 40
- "random policy" ~ 20

# Which policy would give me the most total rewards per episode?

## Central goal of the agent in any RL problem: find (or learn) the *policy* that maximizes total rewards per episode