Run the following commands in a terminal (not in a notebook cell) to create a virtual environment for this tutorial.

In [None]:
# Create and activate the virtual environment.
# Run these commands in a terminal, not in Jupyter.
# Tested on python3.6 and python3.7.
# TODO: Python3.8+ requires Tensorflow >=2 migration
python3.7 -m venv tutorial-env
source tutorial-env/bin/activate

# Install requirements
pip3 install -r requirements.txt

# Register the virtualenv as a Jupyter kernel.
# Note: the kernel is registered under the name "tutorial_env" (underscore),
# while the virtualenv directory is "tutorial-env" (hyphen).
pip3 install ipykernel
python -m ipykernel install --user --name=tutorial_env

# Now, restart jupyter notebook and change the Kernel by selecting Kernel > Change kernel > tutorial_env

In [None]:
# Sanity check: print the TensorFlow version seen by the selected kernel.
import tensorflow as tf
print(tf.__version__)  # should be 1.15

For the first tutorial, we run an experiment in which we increase the size of the dataset to see how sample-efficient the methods are. The environment is the Graph environment with a horizon of 4, and the Q functions use a tabular function class.

In [None]:
# Imports
import numpy as np

from ope.envs.graph import Graph
from ope.models.basics import BasicPolicy

from ope.experiment_tools.experiment import ExperimentRunner, analysis
from ope.experiment_tools.config import Config

In [None]:
# Instantiate a runner for an experiment.
# Configurations are queued on it with runner.add(cfg) and executed with runner.run().
runner = ExperimentRunner()

In [None]:
# Queue 5 experiments on the runner. They differ only in the number of
# trajectories collected from pi_b: 8, 16, 32, 64 and 128.
for N in range(5):

    # Settings of this experiment; only "num_traj" depends on N.
    configuration = {
        # discount factor
        "gamma": 0.98,
        # horizon of the environment
        "horizon": 4,
        # \pi_b(a = 0)
        "base_policy": .8,
        # \pi_e(a = 0)
        "eval_policy": .2,
        # whether the environment has stochastic transitions
        "stochastic_env": True,
        # whether the environment has stochastic rewards
        "stochastic_rewards": False,
        # whether the environment has sparse rewards
        "sparse_rewards": False,
        # number of trajectories to collect from pi_b
        "num_traj": 8*2**N,
        # whether the environment is a POMDP
        "is_pomdp": False,
        # horizon of the POMDP, used if is_pomdp is True
        "pomdp_horizon": 2,
        # random seed
        "seed": 1000,
        # Q function model type
        "modeltype": "tabular",
        # is pi_b unknown (i.e. should it be regressed)?
        "to_regress_pi_b": False,
    }

    # Wrap the settings in a Config object so they can be read as attributes.
    cfg = Config(configuration)

    # Build the environment from the settings above.
    # To use a different environment, swap out this call.
    env = Graph(
        make_pomdp=cfg.is_pomdp,
        number_of_pomdp_states=cfg.pomdp_horizon,
        transitions_deterministic=not cfg.stochastic_env,
        max_length=cfg.horizon,
        sparse_rewards=cfg.sparse_rewards,
        stochastic_rewards=cfg.stochastic_rewards,
    )

    # Seed numpy's global random number generator for this experiment.
    np.random.seed(cfg.seed)

    # Identity state processor. Transitions are stored as
    # {(processor(x), a, r, processor(x'), done)}.
    processor = lambda x: x

    # State used for padding when an episode ends before the horizon is reached.
    # This is environment dependent.
    absorbing_state = processor(np.array([env.n_dim - 1]))

    # Policies. BasicPolicy takes the action list and the probabilities
    # [P(a=0), P(a=1), ..., P(a=n)]; P(a=0) is floored at .001.
    # To evaluate different policies, swap them in here.
    actions = [0, 1]
    p_eval = max(.001, cfg.eval_policy)
    pi_e = BasicPolicy(actions, [p_eval, 1 - p_eval])
    p_base = max(.001, cfg.base_policy)
    pi_b = BasicPolicy(actions, [p_base, 1 - p_base])

    # Attach the environment, both policies, the processor and the absorbing
    # state to the config.
    cfg.add({'env': env,
             'pi_e': pi_e,
             'pi_b': pi_b,
             'processor': processor,
             'absorbing_state': absorbing_state})

    # Choose which OPE methods to run. Currently only 'all' is available.
    cfg.add({'models': 'all'})

    # Queue this configuration on the runner.
    runner.add(cfg)

In [None]:
# Run the experiments queued on the runner.
# `results` is iterated in the analysis cell, one entry per printed experiment.
results = runner.run()

In [None]:
# Print the analysis of each experiment, numbered from 1.
# Each row in the result is (OPE estimator, V(pi_e), MSE Error from on-policy: (V(pi_e) - True)**2)
for experiment_number, result in enumerate(results, start=1):
    print('Result Experiment %s' % experiment_number)
    analysis(result)
    # separator line followed by one empty line
    print('*'*20, end='\n\n')

For the second tutorial, we run the same experiment as before, but with a different environment and a different Q function class. The environment is the Pixel-Gridworld (Pix-GW) environment with a horizon of 5, and the Q functions use a neural network (NN) function class.

In [None]:
from ope.envs.gridworld import Gridworld
from ope.models.epsilon_greedy_policy import EGreedyPolicy
from ope.models.tabular_model import TabularPolicy

In [None]:
# Make a new runner for the second tutorial.
# NOTE(review): presumably this keeps the configurations queued on the previous
# runner from being executed again by the next runner.run() -- confirm.
runner = ExperimentRunner()

In [None]:
# Queue 5 Gridworld experiments on the runner. They differ only in the number
# of trajectories collected from pi_b: 8, 16, 32, 64 and 128.
for N in range(5): # We suggest running this with a GPU or changing this to range(1)
    # Settings of this experiment; only "num_traj" depends on N.
    configuration = {
        # discount factor
        "gamma": 0.98,
        # horizon of the environment
        "horizon": 5,
        # probability of deviation from epsilon-greedy for the base policy
        "base_policy": .8,
        # probability of deviation from epsilon-greedy for the eval policy
        "eval_policy": .2,
        # whether the environment has stochastic transitions
        "stochastic_env": True,
        # whether the environment has stochastic rewards
        "stochastic_rewards": False,
        # whether the environment has sparse rewards
        "sparse_rewards": False,
        # number of trajectories to collect from pi_b
        "num_traj": 8*2**N,
        # random seed
        "seed": 1000,
        # is pi_b unknown (i.e. should it be regressed)?
        "to_regress_pi_b": False,
        # (x_t, a, r, x_{t+frameskip}): apply action "a" frameskip number of times
        "frameskip": 1,
        # (x_{t:t+frameheight}, a, r, x_{t+1:t+1+frameheight}): the state is
        # considered a concatenation of frameheight number of states
        "frameheight": 1,
        # Q function model type: convolutional NN
        "modeltype": 'conv',
        # Q function NN definition (TODO: should be moved)
        "Qmodel": 'conv1',
    }

    # Wrap the settings in a Config object so they can be read as attributes.
    cfg = Config(configuration)

    # Build the environment from the config. The stochastic_env flag scales
    # the slippage: .2 when it is True, 0 when it is False.
    env = Gridworld(slippage=.2 * cfg.stochastic_env)

    # Seed numpy's global random number generator and read the deviation
    # probabilities of the two policies.
    np.random.seed(cfg.seed)
    eval_policy, base_policy = cfg.eval_policy, cfg.base_policy

    # to_grid and from_grid are specific to Gridworld.
    # They convert between a cell index in the grid and an 'image' of the grid.
    def to_grid(x, gridsize=[8, 8]):
        """Convert a cell index into an 'image' of the grid.

        Parameters
        ----------
        x : numpy array
            Array whose first element (after flattening) is the cell index,
            counted row by row.
        gridsize : sequence of two ints
            (number of rows, number of columns) of the grid. Not modified.

        Returns
        -------
        numpy array of shape ``gridsize``
            All zeros with a single 1. at the indexed cell. If the index is
            at or past the number of cells (``gridsize[0] * gridsize[1]``),
            the image is left all zeros.
        """
        n_rows, n_cols = gridsize[0], gridsize[1]
        index = x.reshape(-1)[0]
        out = np.zeros(gridsize)
        # Derive the out-of-grid threshold from gridsize instead of assuming
        # an 8x8 grid, so non-default grid sizes are handled as well.
        if index < n_rows * n_cols:
            # Row-major layout: the row is found by dividing by the number of
            # columns, the column is the remainder.
            out[index // n_cols, index % n_cols] = 1.
        return out

    # This function takes an 'image' and returns the position in the grid
    def from_grid(x, gridsize=[8, 8]):
        """Convert an 'image' of the grid back into a cell index.

        Inputs that are not 3-dimensional are returned unchanged. For a
        3-dimensional input, the result is a one-element array holding the
        flat position of the largest entry, or ``gridsize[0] * gridsize[1]``
        when the entries sum to zero.
        """
        # Anything that is not a 3-D array is passed through untouched.
        if len(x.shape) != 3:
            return x
        # An image summing to zero maps to the index just past the last cell.
        if np.sum(x) == 0:
            return np.array([gridsize[0] * gridsize[1]])
        # np.argmax without an axis indexes into the flattened array.
        return np.array([np.argmax(x)])

    # Identity state processor. Transitions are stored as
    # {(processor(x), a, r, processor(x'), done)}.
    processor = lambda x: x

    # The environment's best policy; both e-greedy policies below are built on it.
    policy = env.best_policy()

    # State used for padding when an episode ends before the horizon is reached.
    # This is environment dependent.
    absorbing_state = processor(np.array([len(policy)]))

    # Evaluation and behavior policies. They differ only in their probability
    # of deviation; from_grid is passed as the policies' processor.
    pi_e = EGreedyPolicy(
        model=TabularPolicy(policy, absorbing=absorbing_state),
        processor=from_grid,
        prob_deviation=eval_policy,
        action_space_dim=env.n_actions,
    )
    pi_b = EGreedyPolicy(
        model=TabularPolicy(policy, absorbing=absorbing_state),
        processor=from_grid,
        prob_deviation=base_policy,
        action_space_dim=env.n_actions,
    )

    # Attach the environment, both policies, the processor, the absorbing state
    # and the int-to-image converter to the config.
    cfg.add({
        'env': env,
        'pi_e': pi_e,
        'pi_b': pi_b,
        'processor': processor,
        'absorbing_state': absorbing_state,
        # if the environment state is an int, it can be converted to an image
        # through this function
        'convert_from_int_to_img': to_grid,
    })

    # Choose which OPE methods to run.
    cfg.add({'models': 'all'})

    # Queue this configuration on the runner.
    runner.add(cfg)

In [None]:
# Run the experiments queued on the runner.
# `results` is iterated in the analysis cell, one entry per printed experiment.
results = runner.run()

In [None]:
# Print the analysis of each experiment, numbered from 1.
# Each row in the result is (OPE estimator, V(pi_e), MSE Error from on-policy: (V(pi_e) - True)**2)
for experiment_number, result in enumerate(results, start=1):
    print('Result Experiment %s' % experiment_number)
    analysis(result)
    # separator line followed by one empty line
    print('*'*20, end='\n\n')