In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import itertools
from typing import List, Tuple

import gym
import numpy as np
from matplotlib import pyplot as plt

from agents import *
from wrappers import *
from trainers import *

# Kept after the star imports (as in the original order) so they cannot shadow it.
from tqdm.notebook import tqdm

## Blackjack

Play some episodes using a random policy

In [None]:
env = gym.make("Blackjack-v1")
discretizer = BlackjackDiscretizer()
discrete_env = DiscreteWrapper(env, discretizer)

# Uniform random behavioral policy: one [0.5, 0.5] row per discrete state.
behavioral_policy = np.full((discretizer.n_states, 2), 0.5)

agent = BlackjackOffPolicyMCAgent(behavioral_policy)

# Baseline: average return of the behavioral policy over `test_episodes` games.
test_episodes = 1_000
rewards = 0.
for _ in tqdm(range(test_episodes)):
    state = discrete_env.reset()
    is_done = False
    while not is_done:
        action = agent.behavioral_act(state)
        state, reward, is_done, _ = discrete_env.step(action)
        rewards += reward
rewards /= test_episodes
print(f"Average reward = {rewards}")

Use the random policy as the behavioral policy and learn a better one with Monte Carlo off-policy control

In [None]:
env = gym.make("Blackjack-v1")
discretizer = BlackjackDiscretizer()
# ActionLogger exposes the `.log` that is handed to the trainer after each episode.
discrete_env = ActionLogger(DiscreteWrapper(env, discretizer))

# Uniform random behavioral policy: one [0.5, 0.5] row per discrete state.
behavioral_policy = np.full((discretizer.n_states, 2), 0.5)

agent = BlackjackOffPolicyMCAgent(behavioral_policy)
trainer = MCControlTrainer(gamma=1.0)

train_episodes = 10_000
eval_every = 1_000      # evaluate the target policy every `eval_every` training episodes
test_episodes = 1_000   # evaluation games per checkpoint
episode_rewards = []    # return of every behavioral (training) episode
changed_states = set()
for episode in tqdm(range(train_episodes)):
    # Generate one episode with the behavioral policy.
    state = discrete_env.reset()
    episode_reward = 0.
    is_done = False
    while not is_done:
        action = agent.behavioral_act(state)
        state, reward, is_done, _ = discrete_env.step(action)
        episode_reward += reward
    # trainer.update returns a set (presumably the states whose target action
    # changed -- confirm in trainers.py); accumulate it between checkpoints.
    changed_states |= trainer.update(agent, discrete_env.log)
    episode_rewards.append(episode_reward)

    # Checkpoint after every `eval_every` *completed* episodes, so the reported
    # count is exact and the final policy is evaluated too.
    if (episode + 1) % eval_every == 0:
        print(f"States changed: {len(changed_states)}")
        changed_states = set()
        rewards = 0.
        # NOTE(review): evaluation runs through the same ActionLogger; this assumes
        # its log is reset on `reset()` so these games do not leak into the next
        # `trainer.update` -- confirm.
        for _ in range(test_episodes):
            state = discrete_env.reset()
            is_done = False
            while not is_done:
                action = agent.act(state)
                state, reward, is_done, _ = discrete_env.step(action)
                rewards += reward
        rewards /= test_episodes
        print(f"After {episode + 1} episodes average reward = {rewards}")

It works, let's try to apply the same approach to a more complicated game.

## Mountain Car

In [None]:
# Create the raw (continuous) environment; `env` and `state` are read by later cells.
env = gym.make("MountainCar-v0")
state = env.reset()

# Grab the initial frame as an RGB array and show it inline.
frame = env.render(mode="rgb_array")
plt.imshow(frame)

[Environment page](https://gym.openai.com/envs/MountainCar-v0/)

[Environment description](https://github.com/openai/gym/wiki/MountainCar-v0)

[Environment implementation](https://github.com/openai/gym/blob/master/gym/envs/classic_control/mountain_car.py)

First number: position in the range from -1.2 to 0.6

Second number: velocity in the range from -0.07 to 0.07

Actions:
* 0 - accelerate left
* 1 - don't accelerate
* 2 - accelerate right

In [None]:
# Initial observation returned by env.reset() in the cell above.
state

In [None]:
# Show the concrete class of the object gym.make returned.
type(env)

In [None]:
def play_once(env: gym.Env, agent: BaseAgent, render: bool=False, verbose: bool=False) -> Tuple[float, List[tuple]]:
    """Play a single episode of `env`, picking every action with `agent.act`.

    Args:
        env: environment providing `reset()`, `render()` and `step(action)`;
            `step` must return a `(state, reward, is_done, info)` 4-tuple.
        agent: policy queried as `agent.act(state)` before every step.
        render: if True, call `env.render()` before every step.
        verbose: if True, print the total reward and the number of steps played.

    Returns:
        `(episode_reward, log)`: the sum of all step rewards, and a list with one
        `(reward, prev_state, action)` record per step, where `prev_state` is the
        state the action was chosen in.
    """
    state = env.reset()
    episode_reward = 0.
    log = []
    for step in itertools.count():
        if render:
            env.render()
        # Remember the state the action is taken in; `state` is overwritten below.
        prev_state = state
        action = agent.act(state)
        state, reward, is_done, _ = env.step(action)
        log.append((reward, prev_state, action))
        episode_reward += reward
        if is_done:
            break
    if verbose:
        # `step` is zero-based, hence the +1 for the number of steps played.
        print('get {} rewards in {} steps'.format(
                episode_reward, step + 1))
    return episode_reward, log

We'll use a good external policy.

In [None]:
# Play one rendered episode on the raw `env` with the external policy;
# only the total reward is kept, the per-step log is discarded.
agent = OrininalSmartAgent()
reward, _ = play_once(env, agent, render=True)
reward

This policy is deterministic, so it does not guarantee coverage of the target policy. Let's add some random noise to it and make sure it can still finish the game.

In [None]:
# Wrap MountainCar so that states are discrete indices
# (presumably 20 bins per state dimension -- confirm in MountainCarDiscretizer).
env = gym.make("MountainCar-v0")
discretizer = MountainCarDiscretizer(20, 20)
discrete_env = DiscreteWrapper(env, discretizer)

# Tabulate the external policy over the discrete states with eps=0.,
# then wrap the resulting table in an agent that works on discrete states.
discrete_policy = get_discrete_policy(discretizer, OrininalSmartAgent(), eps=0.)
agent = DisceteSmartAgent(discretizer.n_states, discrete_policy)

# One rendered episode on the discretized environment; display its total reward.
reward, _ = play_once(discrete_env, agent, render=True)
reward

In [None]:
env = gym.make("MountainCar-v0")

discretizer = MountainCarDiscretizer(20, 20)
# Build the wrapped environment once; ActionLogger exposes the `.log` that is
# handed to the trainer after each episode.
discrete_env = ActionLogger(DiscreteWrapper(env, discretizer))

# Behavioral policy: the external policy tabulated over discrete states with eps=0.1.
behavioral_agent = OrininalSmartAgent()
behavioral_policy = get_discrete_policy(discretizer, behavioral_agent, eps=0.1)

agent = MountainCarOffPolicyMCAgent(behavioral_policy)
trainer = MCControlTrainer(gamma=1.0)

train_episodes = 10_000
eval_every = 1_000     # evaluate the target policy every `eval_every` training episodes
test_episodes = 100    # evaluation games per checkpoint
episode_rewards = []   # return of every behavioral (training) episode
changed_states = set()
for episode in tqdm(range(train_episodes)):
    # Generate one episode with the behavioral policy.
    state = discrete_env.reset()
    episode_reward = 0.
    is_done = False
    while not is_done:
        action = agent.behavioral_act(state)
        state, reward, is_done, _ = discrete_env.step(action)
        episode_reward += reward
    # trainer.update returns a set (presumably the states whose target action
    # changed -- confirm in trainers.py); accumulate it between checkpoints.
    changed_states |= trainer.update(agent, discrete_env.log)
    episode_rewards.append(episode_reward)

    # Checkpoint after every `eval_every` *completed* episodes, so the reported
    # count is exact and the final policy is evaluated too.
    if (episode + 1) % eval_every == 0:
        print(f"States changed: {len(changed_states)}")
        changed_states = set()
        rewards = 0.
        # NOTE(review): evaluation runs through the same ActionLogger; this assumes
        # its log is reset on `reset()` so these games do not leak into the next
        # `trainer.update` -- confirm.
        for _ in range(test_episodes):
            state = discrete_env.reset()
            is_done = False
            while not is_done:
                action = agent.act(state)
                state, reward, is_done, _ = discrete_env.step(action)
                rewards += reward
        rewards /= test_episodes
        print(f"After {episode + 1} episodes average reward = {rewards}")

        # Share of states where the greedy action w.r.t. the learned action values
        # coincides with the most probable behavioral action.
        greedy_actions = np.argmax(agent.action_values, axis=1)
        behavioral_actions = np.argmax(agent.behavioral_policy, axis=1)
        agreement = (greedy_actions == behavioral_actions).mean()
        print(f"Greedy action matches behavioral argmax in {agreement:.1%} of states")

No luck: most episodes don't lead to any updates of the target policy. Let's try a random behavioral policy instead.

In [None]:
env = gym.make("MountainCar-v0")

discretizer = MountainCarDiscretizer(20, 20)
# Build the wrapped environment once; ActionLogger exposes the `.log` that is
# handed to the trainer after each episode.
discrete_env = ActionLogger(DiscreteWrapper(env, discretizer))

# Uniform random behavioral policy: one [1/3, 1/3, 1/3] row per discrete state,
# stored as an array like the policies used in the other experiments.
behavioral_policy = np.full((discretizer.n_states, 3), 1. / 3)

agent = MountainCarOffPolicyMCAgent(behavioral_policy)
trainer = MCControlTrainer(gamma=1.0)

train_episodes = 10_000
eval_every = 1_000     # evaluate the target policy every `eval_every` training episodes
test_episodes = 100    # evaluation games per checkpoint
episode_rewards = []   # return of every behavioral (training) episode
changed_states = set()
for episode in tqdm(range(train_episodes)):
    # Generate one episode with the behavioral policy.
    state = discrete_env.reset()
    episode_reward = 0.
    is_done = False
    while not is_done:
        action = agent.behavioral_act(state)
        state, reward, is_done, _ = discrete_env.step(action)
        episode_reward += reward
    # trainer.update returns a set (presumably the states whose target action
    # changed -- confirm in trainers.py); accumulate it between checkpoints.
    changed_states |= trainer.update(agent, discrete_env.log)
    episode_rewards.append(episode_reward)

    # Checkpoint after every `eval_every` *completed* episodes, so the reported
    # count is exact and the final policy is evaluated too.
    if (episode + 1) % eval_every == 0:
        print(f"States changed: {len(changed_states)}")
        changed_states = set()
        rewards = 0.
        # NOTE(review): evaluation runs through the same ActionLogger; this assumes
        # its log is reset on `reset()` so these games do not leak into the next
        # `trainer.update` -- confirm.
        for _ in range(test_episodes):
            state = discrete_env.reset()
            is_done = False
            while not is_done:
                action = agent.act(state)
                state, reward, is_done, _ = discrete_env.step(action)
                rewards += reward
        rewards /= test_episodes
        print(f"After {episode + 1} episodes average reward = {rewards}")

        # NOTE(review): np.argmax returns the first maximum, so for uniform rows the
        # behavioral argmax is 0 in every state (assuming the agent stores the policy
        # unchanged). The figure below is then just the share of states whose greedy
        # action is 0, not a real agreement measure.
        greedy_actions = np.argmax(agent.action_values, axis=1)
        behavioral_actions = np.argmax(agent.behavioral_policy, axis=1)
        agreement = (greedy_actions == behavioral_actions).mean()
        print(f"Greedy action matches behavioral argmax in {agreement:.1%} of states")

In [None]:
env = gym.make("MountainCar-v0")

discretizer = MountainCarDiscretizer(20, 20)
# Build the wrapped environment once; ActionLogger exposes the `.log` that is
# handed to the trainer after each episode.
discrete_env = ActionLogger(DiscreteWrapper(env, discretizer))

# Uniform random behavioral policy: one [1/3, 1/3, 1/3] row per discrete state,
# stored as an array like the policies used in the other experiments.
behavioral_policy = np.full((discretizer.n_states, 3), 1. / 3)

agent = MountainCarOffPolicyMCAgent(behavioral_policy)
trainer = MCControlTrainer(gamma=1.0)

train_episodes = 100_000
eval_every = 10_000    # evaluate the target policy every `eval_every` training episodes
test_episodes = 100    # evaluation games per checkpoint
episode_rewards = []   # return of every behavioral (training) episode
changed_states = set()
for episode in tqdm(range(train_episodes)):
    # Generate one episode with the behavioral policy.
    state = discrete_env.reset()
    episode_reward = 0.
    is_done = False
    while not is_done:
        action = agent.behavioral_act(state)
        state, reward, is_done, _ = discrete_env.step(action)
        episode_reward += reward
    # trainer.update returns a set (presumably the states whose target action
    # changed -- confirm in trainers.py); accumulate it between checkpoints.
    changed_states |= trainer.update(agent, discrete_env.log)
    episode_rewards.append(episode_reward)

    # Checkpoint after every `eval_every` *completed* episodes, so the reported
    # count is exact and the final policy is evaluated too.
    if (episode + 1) % eval_every == 0:
        print(f"States changed: {len(changed_states)}")
        changed_states = set()
        rewards = 0.
        # NOTE(review): evaluation runs through the same ActionLogger; this assumes
        # its log is reset on `reset()` so these games do not leak into the next
        # `trainer.update` -- confirm.
        for _ in range(test_episodes):
            state = discrete_env.reset()
            is_done = False
            while not is_done:
                action = agent.act(state)
                state, reward, is_done, _ = discrete_env.step(action)
                rewards += reward
        rewards /= test_episodes
        print(f"After {episode + 1} episodes average reward = {rewards}")

        # NOTE(review): np.argmax returns the first maximum, so for uniform rows the
        # behavioral argmax is 0 in every state (assuming the agent stores the policy
        # unchanged). The figure below is then just the share of states whose greedy
        # action is 0, not a real agreement measure.
        greedy_actions = np.argmax(agent.action_values, axis=1)
        behavioral_actions = np.argmax(agent.behavioral_policy, axis=1)
        agreement = (greedy_actions == behavioral_actions).mean()
        print(f"Greedy action matches behavioral argmax in {agreement:.1%} of states")

Still no luck.