In [1]:
# NOTE(review): dependencies are installed unpinned. Cells below rely on a
# 4-tuple return from `env.step`, on `env.render(mode=...)` and on the
# "NChain-v0" environment, which presumably need an older gym release —
# TODO pin a known-good version.
!pip install cmake 'gym[atari]' scipy -q

In [2]:
import gym

In [217]:
# Build the Taxi-v3 environment and keep the object exposed by `.env`
# (presumably the unwrapped environment without gym.make's wrapper — TODO confirm).
env = gym.make("Taxi-v3").env
# Draw the board in its current state, before any explicit reset.
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[43mB[0m: |
+---------+



In [218]:
# Start a fresh episode and draw the resulting board.
env.reset()
env.render()
# Report the action and observation spaces of the environment.
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[34;1mB[0m: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


In [219]:
# Env Specifics
# The taxi is drawn yellow when it is empty and green when it carries a passenger
# RGYB are possible pickup and destination locations
# Blue letter represents the current passenger pick up location
# Purple letter is the current destination

In [220]:
# Pack the four state components into the single integer id used by the environment.
state = env.encode(3, 1, 2, 0) 
# (taxi row, taxi column, passenger index, destination index)

print("State:", state)

# Force the environment into that state so the rendered board shows it.
env.s = state
env.render()

State: 328
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



In [221]:
# Initial Reward Table
# Dict with structure:
# {action: [(probability, nextstate, reward, done)]}
# actions
    # 0: south, 1: north, 2: east, 3: west, 4: pickup, 5: dropoff
# probabilities:
    # always 1 in this env
# nextstate:
    # state we will be in after taking action specified by key
# reward:
    # movement actions have reward -1.0
    # pickup/dropoff actions have reward -10 in this state
    # At dropoff/pickup locations, dropoff and pickup would have higher rewards (obviously)
# done:
    # True once the passenger has been dropped off successfully, i.e. the episode has reached a terminal state
    


env.P[328]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [222]:
# Brute Force Solution (no RL): sample random actions until the episode ends.
env.s = 328  # set environment to illustration's state

epochs = 0
penalties, reward = 0, 0
frames = []  # one record per timestep, replayed later as an animation
done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of -10 is the penalty for a wrong pickup/dropoff attempt.
    penalties += int(reward == -10)

    # Snapshot the rendered board together with the transition behind it.
    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))

    epochs += 1


print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))


Timesteps taken: 428
Penalties incurred: 125


In [223]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=.1):
    """Replay recorded frames in place as a simple text animation.

    Args:
        frames: iterable of dicts with the keys 'frame' (rendered board
            text), 'state', 'action' and 'reward'.
        delay: seconds to pause after each frame. Defaults to 0.1, the
            value that used to be hard-coded.

    Returns:
        None. Each frame is printed after clearing the previous cell output.
    """
    for i, frame in enumerate(frames):
        # wait=True delays the clear until new output arrives, avoiding flicker.
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)
        
print_frames(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 428
State: 0
Action: 5
Reward: 20


In [22]:
# Clearly this performs poorly: the agent just moves at random
# until it stumbles into the goal state.

In [23]:
# Enter Reinforcement Learning (online learning)
# 1) Direct Evaluation --> On Policy
# 2) Temporal Difference Learning --> On Policy
# 3) Q-Learning --> Off Policy
# 4) Approximate Q-Learning --> Off Policy

In [None]:
# Motivate RL using the brute force taxi-v3 solver.

# Direct Evaluation
# Use nchain-v0
# 1) Fix some policy pi (can be initialized randomly)
# 2) Collect Samples in Episodes
# 2a) Maintain counts of total reward from each state onwards
# 2b) Maintain counts of total number of times each state was visited
# 3) Value of a state is then computed as total reward/number of visits
# 4) Run Policy extraction to find best policy

# How to implement
# Make a table of states and total reward and times visited
# Dict with structure:
# {state: [(total reward, times visited, value)]}
# update this dict as you explore


# Temporal Difference Learning, "Learn from every experience"
# 1) Set V(s) = 0 for all states initially and pick a random policy
# 2) Take action according to policy pi(s) and receive reward R(s, a, s')
# 3) Compute sample = R(s, a, s') + g*V(s'), where g is the discount factor (gamma)
# 4) Incorporate this sample into the current value of s (the state we just left) using the update rule
        # V(s) <- (1-alpha)*V(s) + alpha*sample
# 5) As you explore, shrink the learning rate alpha as a function of the number of times state s has been visited
# 6) At the end of learning, you have some table of states by values.

# How to implement
# Make a table of states, values, and times visited
# Dict with structure:
# {state: [(times visited, value)]}
# After each action, update state's value and times visited according to update rule, update alpha


# Q-Learning
# 1) The update is essentially the same as in temporal difference learning, but the table changes to accommodate Q-states (state-action pairs) instead of only state values
# 2) This lets us learn off-policy so we can split our time between exploiting the policy we have so far and exploring (epsilon greedy)
# 3) This is also done in the tutorial so we can just go off of that


# Overall ideas of this section:
# Using the taxi-v3 environment,
# 1) Introduce the problem, explain that we will be doing model-free learning (explain the differences between model-based and model-free)
# 2) Explain what a policy is and explain on policy vs off policy learning approaches
# 3) Go over 2 on policy learning approaches with Taxi-v3
# 3a) Direct Evaluation (easiest)
# 3b) Temporal Difference <- Here explain the step size and why it is important (ema, favor newer, more correct information)
# 4) Explain that we are learning values for each state here. How do we extract a policy?
# 5) Go over value-iteration and policy extraction. Have them do some exercises with this.
# 6) Go over 1 off policy learning approach with Taxi-v3
# 6a) Q-Learning
# 7) Explain exploration vs exploitation tradeoff and how epsilon greedy can be implemented.
# 
# When going through any learning algorithm
# pay special attention to the update steps and the policy extraction part
# 




In [17]:
# Direct Evaluation on nchain-v0
# NOTE(review): this rebinds `env`, which an earlier cell bound to Taxi-v3;
# any cell executed after this one sees the NChain environment instead.
env = gym.make("NChain-v0").env
# Actions
# 0 is forward (one step along the chain)
# 1 is backwards (return to the start of the chain)
# slip probability is 0.2
# reward for backwards is 2
# reward to go all the way to the end is 10


In [185]:
# dir(env)
import sys
import numpy as np
from gym import utils
from io import StringIO
from contextlib import closing

In [212]:
# Render method to show what the agent is doing
actions = []
# Motivated from code: https://github.com/openai/gym/blob/master/gym/envs/toy_text/nchain.py
def render_env(env, mode="human"):
    """Draw the chain as text with the agent's current cell highlighted.

    Args:
        env: object exposing an integer ``state`` attribute, read as the
            agent's position along the chain (0 is the 'S' cell).
        mode: ``'ansi'`` returns the drawing as a string; any other value
            writes it to ``sys.stdout``.

    Returns:
        The rendered text when ``mode == 'ansi'``, otherwise ``None``.
    """
    MAP = [
    "+-----------+",
    "|S: : : : :E|",
    "+-----------+",
    ]
    outfile = StringIO() if mode == 'ansi' else sys.stdout

    # Mutable per-character copy of the map so a single cell can be recoloured.
    out = [list(line) for line in MAP]
    agent_position = env.state

    # Cells occupy the odd columns 1, 3, ... of the middle row; the last
    # usable position follows from the row width (5 for this map).
    last_position = (len(MAP[1]) - 3) // 2

    # Only highlight positions that are on the board. The lower bound stops
    # a negative position from wrapping around to the wrong column.
    if 0 <= agent_position <= last_position:
        col = 2 * agent_position + 1
        out[1][col] = utils.colorize(out[1][col], 'yellow', highlight=True)

    outfile.write("\n".join("".join(row) for row in out) + "\n")

    # Only the in-memory buffer has a value to hand back; never close stdout.
    if mode == 'ansi':
        with closing(outfile):
            return outfile.getvalue()
            
def sample_action_space():
    """Pick one of the two chain actions, 0 or 1, at random via numpy's global RNG."""
    candidate_actions = [0, 1]
    chosen = np.random.choice(candidate_actions)
    return chosen

def step_env(env, action):
    """Advance the chain environment by one action and flag the terminal step.

    Args:
        env: environment exposing ``action_space``, ``np_random``, ``slip``,
            ``small``, ``large``, ``n`` and a mutable ``state``.
        action: falsy for 'forwards', truthy for 'backwards'.

    Returns:
        Tuple ``(state, reward, done, info)`` where ``info`` is an empty dict.
    """
    assert env.action_space.contains(action)

    # When the random draw falls below env.slip the agent slips and the
    # opposite action is carried out.
    slipped = env.np_random.rand() < env.slip
    go_backwards = (not action) if slipped else action

    # 'backwards': return to the start of the chain for the small reward.
    if go_backwards:
        env.state = 0
        return env.state, env.small, False, {}

    # 'forwards' before the last cell: advance one step, no reward.
    if env.state < env.n - 1:
        env.state += 1
        return env.state, 0, False, {}

    # 'forwards' from the last cell: step past it, collect the large reward
    # and end the episode.
    env.state += 1
    return env.state, env.large, True, {}

In [213]:
# Brute Force Solution (no RL): take random actions until the end of the
# chain is reached.
env.reset()
epochs = 0
penalties, reward = 0, 0

frames = [] # for animation

done = False

while not done:
    action = sample_action_space()
    state, reward, done, info = step_env(env, action)

    # Count the steps that paid out the small "backwards" reward. Compare
    # against env.small (the value step_env returns) rather than a literal.
    if reward == env.small:
        penalties += 1
    
    # Put each rendered frame into dict for animation
    frames.append({
        'frame': render_env(env, mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1
    
    
print("Timesteps taken: {}".format(epochs))
print("Small rewards incurred: {}".format(penalties))

Timesteps taken: 66
Small rewards incurred: 30


In [214]:
# Looking at this, it takes a long time for the agent to find 
# the +10 reward at the end of the chain. Now let's introduce RL
# through Direct Evaluation. But first, let's review the basics.

print_frames(frames)

+-----------+
|S: : : : :[43mE[0m|
+-----------+

Timestep: 66
State: 5
Action: 0
Reward: 10


In [215]:
# Online Planning vs Offline Planning
# Underlying MDP
# Classic diagram
# Agent -> action -> Environment -> state, reward -> Agent ...
# Continue sampling until state is terminal
# The full collection of samples from the start state to a terminal state is one episode

In [224]:
# Two approaches to RL
    # 1) Model Based --> Agent learns Transition probs and rewards
        # by exploring and then uses value iteration and policy 
        # extraction to find the policy. (takes into account the
        # underlying mdp)
    # 2) Model Free --> Agent just tries to directly estimate 
        # values and q values without considering the underlying mdp

# Let's take a look at model-free learning.
# Within model free learning we further differentiate between
    # 1) Passive RL -> Agent is given a policy and learns the value of states under that policy
    # 2) Active RL -> Agent iteratively updates its policy while learning, eventually converging to the optimal policy

In [None]:
# 1) Direct Evaluation