* Based on [this overview notebook](https://github.com/google-research/recsim/blob/master/recsim/colab/RecSim_Overview.ipynb)

In [1]:
import numpy as np
import tensorflow as tf

from recsim.environments import interest_evolution
from recsim.agents import full_slate_q_agent, random_agent
from recsim.simulator import runner_lib

## Creating Agents

In [2]:
# creating agents

def create_q_agent(sess, environment, eval_mode, summary_writer=None):
  kwargs = {
      'observation_space': environment.observation_space,
      'action_space': environment.action_space,
      'summary_writer': summary_writer,
      'eval_mode': eval_mode,
  }
  return full_slate_q_agent.FullSlateQAgent(sess, **kwargs)


def create_random_agent(sess, environment, eval_mode, summary_writer=None):
    kwargs = {
      'action_space': environment.action_space,
  }
    return random_agent.RandomAgent(**kwargs)

## Training the Agents

### Configuring the Environment

In [3]:
# environment config

seed = 0
np.random.seed(seed)
env_config = {
  'num_candidates': 10,
  'slate_size': 2,
  'resample_documents': True,
  'seed': seed,
  }

### Training the Random Agent

In [4]:
# training

tmp_random_dir = '/tmp/recsim_random/'
runner = runner_lib.TrainRunner(
    base_dir=tmp_random_dir,
    create_agent_fn=create_random_agent,
    env=interest_evolution.create_environment(env_config),
    episode_log_file="",
    max_training_steps=50,
    num_iterations=10)
runner.run_experiment()

INFO:tensorflow:max_training_steps = 50, number_iterations = 10,checkpoint frequency = 1 iterations.


INFO:tensorflow:max_training_steps = 50, number_iterations = 10,checkpoint frequency = 1 iterations.


INFO:tensorflow:max_steps_per_episode = 27000


INFO:tensorflow:max_steps_per_episode = 27000


INFO:tensorflow:Beginning training...


INFO:tensorflow:Beginning training...


INFO:tensorflow:Reloaded checkpoint and will start from iteration 10


INFO:tensorflow:Reloaded checkpoint and will start from iteration 10






### Training the Q Agent

In [5]:
# training

tmp_q_dir = '/tmp/recsim_q/'
runner = runner_lib.TrainRunner(
    base_dir=tmp_q_dir,
    create_agent_fn=create_q_agent,
    env=interest_evolution.create_environment(env_config),
    episode_log_file="",
    max_training_steps=50,
    num_iterations=10)
runner.run_experiment()

INFO:tensorflow:max_training_steps = 50, number_iterations = 10,checkpoint frequency = 1 iterations.


INFO:tensorflow:max_training_steps = 50, number_iterations = 10,checkpoint frequency = 1 iterations.


INFO:tensorflow:max_steps_per_episode = 27000


INFO:tensorflow:max_steps_per_episode = 27000


INFO:tensorflow:Creating FullSlateQAgent agent with the following parameters:


INFO:tensorflow:Creating FullSlateQAgent agent with the following parameters:


INFO:tensorflow:	 gamma: 0.990000


INFO:tensorflow:	 gamma: 0.990000


INFO:tensorflow:	 update_horizon: 1.000000


INFO:tensorflow:	 update_horizon: 1.000000


INFO:tensorflow:	 min_replay_history: 20000


INFO:tensorflow:	 min_replay_history: 20000


INFO:tensorflow:	 update_period: 4


INFO:tensorflow:	 update_period: 4


INFO:tensorflow:	 target_update_period: 8000


INFO:tensorflow:	 target_update_period: 8000


INFO:tensorflow:	 epsilon_train: 0.010000


INFO:tensorflow:	 epsilon_train: 0.010000


INFO:tensorflow:	 epsilon_eval: 0.001000


INFO:tensorflow:	 epsilon_eval: 0.001000


INFO:tensorflow:	 epsilon_decay_period: 250000


INFO:tensorflow:	 epsilon_decay_period: 250000


INFO:tensorflow:	 tf_device: /cpu:*


INFO:tensorflow:	 tf_device: /cpu:*


INFO:tensorflow:	 use_staging: True


INFO:tensorflow:	 use_staging: True


INFO:tensorflow:	 optimizer: <tensorflow.python.training.rmsprop.RMSPropOptimizer object at 0x14f2d9e48>


INFO:tensorflow:	 optimizer: <tensorflow.python.training.rmsprop.RMSPropOptimizer object at 0x14f2d9e48>


INFO:tensorflow:	 max_tf_checkpoints_to_keep: 4


INFO:tensorflow:	 max_tf_checkpoints_to_keep: 4


INFO:tensorflow:Creating a OutOfGraphReplayBuffer replay memory with the following parameters:


INFO:tensorflow:Creating a OutOfGraphReplayBuffer replay memory with the following parameters:


INFO:tensorflow:	 observation_shape: (11, 20)


INFO:tensorflow:	 observation_shape: (11, 20)


INFO:tensorflow:	 observation_dtype: float32


INFO:tensorflow:	 observation_dtype: float32


INFO:tensorflow:	 terminal_dtype: <class 'numpy.uint8'>


INFO:tensorflow:	 terminal_dtype: <class 'numpy.uint8'>


INFO:tensorflow:	 stack_size: 1


INFO:tensorflow:	 stack_size: 1


INFO:tensorflow:	 replay_capacity: 1000000


INFO:tensorflow:	 replay_capacity: 1000000


INFO:tensorflow:	 batch_size: 32


INFO:tensorflow:	 batch_size: 32


INFO:tensorflow:	 update_horizon: 1


INFO:tensorflow:	 update_horizon: 1


INFO:tensorflow:	 gamma: 0.990000


INFO:tensorflow:	 gamma: 0.990000


Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Please use tf.global_variables instead.


Instructions for updating:
Please use tf.global_variables instead.


INFO:tensorflow:legacy_checkpoint_load: False


INFO:tensorflow:legacy_checkpoint_load: False


INFO:tensorflow:Beginning training...


INFO:tensorflow:Beginning training...


INFO:tensorflow:Starting iteration 0


INFO:tensorflow:Starting iteration 0


INFO:tensorflow:Starting iteration 1


INFO:tensorflow:Starting iteration 1


INFO:tensorflow:Starting iteration 2


INFO:tensorflow:Starting iteration 2


INFO:tensorflow:Starting iteration 3


INFO:tensorflow:Starting iteration 3


INFO:tensorflow:Starting iteration 4


INFO:tensorflow:Starting iteration 4


Instructions for updating:
Use standard file APIs to delete files with this prefix.


Instructions for updating:
Use standard file APIs to delete files with this prefix.


INFO:tensorflow:Starting iteration 5


INFO:tensorflow:Starting iteration 5


INFO:tensorflow:Starting iteration 6


INFO:tensorflow:Starting iteration 6


INFO:tensorflow:Starting iteration 7


INFO:tensorflow:Starting iteration 7


INFO:tensorflow:Starting iteration 8


INFO:tensorflow:Starting iteration 8


INFO:tensorflow:Starting iteration 9


INFO:tensorflow:Starting iteration 9


### Evaluating the Random Agent

In [6]:
# evaluating

runner = runner_lib.EvalRunner(
      base_dir=tmp_random_dir,
      create_agent_fn=create_random_agent,
      env=interest_evolution.create_environment(env_config),
      max_eval_episodes=5,
      test_mode=True)
runner.run_experiment()

INFO:tensorflow:max_eval_episodes = 5


INFO:tensorflow:max_eval_episodes = 5


INFO:tensorflow:max_steps_per_episode = 27000


INFO:tensorflow:max_steps_per_episode = 27000


INFO:tensorflow:Beginning evaluation...


INFO:tensorflow:Beginning evaluation...


INFO:tensorflow:eval_file: /tmp/recsim_random/eval_5/returns_818


INFO:tensorflow:eval_file: /tmp/recsim_random/eval_5/returns_818


### Evaluating the Q Agent

In [7]:
# evaluating

runner = runner_lib.EvalRunner(
      base_dir=tmp_q_dir,
      create_agent_fn=create_q_agent,
      env=interest_evolution.create_environment(env_config),
      max_eval_episodes=5,
      test_mode=True)
runner.run_experiment()

INFO:tensorflow:max_eval_episodes = 5


INFO:tensorflow:max_eval_episodes = 5


INFO:tensorflow:max_steps_per_episode = 27000


INFO:tensorflow:max_steps_per_episode = 27000


INFO:tensorflow:Creating FullSlateQAgent agent with the following parameters:


INFO:tensorflow:Creating FullSlateQAgent agent with the following parameters:


INFO:tensorflow:	 gamma: 0.990000


INFO:tensorflow:	 gamma: 0.990000


INFO:tensorflow:	 update_horizon: 1.000000


INFO:tensorflow:	 update_horizon: 1.000000


INFO:tensorflow:	 min_replay_history: 20000


INFO:tensorflow:	 min_replay_history: 20000


INFO:tensorflow:	 update_period: 4


INFO:tensorflow:	 update_period: 4


INFO:tensorflow:	 target_update_period: 8000


INFO:tensorflow:	 target_update_period: 8000


INFO:tensorflow:	 epsilon_train: 0.010000


INFO:tensorflow:	 epsilon_train: 0.010000


INFO:tensorflow:	 epsilon_eval: 0.001000


INFO:tensorflow:	 epsilon_eval: 0.001000


INFO:tensorflow:	 epsilon_decay_period: 250000


INFO:tensorflow:	 epsilon_decay_period: 250000


INFO:tensorflow:	 tf_device: /cpu:*


INFO:tensorflow:	 tf_device: /cpu:*


INFO:tensorflow:	 use_staging: True


INFO:tensorflow:	 use_staging: True


INFO:tensorflow:	 optimizer: <tensorflow.python.training.rmsprop.RMSPropOptimizer object at 0x18e53e668>


INFO:tensorflow:	 optimizer: <tensorflow.python.training.rmsprop.RMSPropOptimizer object at 0x18e53e668>


INFO:tensorflow:	 max_tf_checkpoints_to_keep: 4


INFO:tensorflow:	 max_tf_checkpoints_to_keep: 4


INFO:tensorflow:Creating a OutOfGraphReplayBuffer replay memory with the following parameters:


INFO:tensorflow:Creating a OutOfGraphReplayBuffer replay memory with the following parameters:


INFO:tensorflow:	 observation_shape: (11, 20)


INFO:tensorflow:	 observation_shape: (11, 20)


INFO:tensorflow:	 observation_dtype: float32


INFO:tensorflow:	 observation_dtype: float32


INFO:tensorflow:	 terminal_dtype: <class 'numpy.uint8'>


INFO:tensorflow:	 terminal_dtype: <class 'numpy.uint8'>


INFO:tensorflow:	 stack_size: 1


INFO:tensorflow:	 stack_size: 1


INFO:tensorflow:	 replay_capacity: 1000000


INFO:tensorflow:	 replay_capacity: 1000000


INFO:tensorflow:	 batch_size: 32


INFO:tensorflow:	 batch_size: 32


INFO:tensorflow:	 update_horizon: 1


INFO:tensorflow:	 update_horizon: 1


INFO:tensorflow:	 gamma: 0.990000


INFO:tensorflow:	 gamma: 0.990000


INFO:tensorflow:legacy_checkpoint_load: False


INFO:tensorflow:legacy_checkpoint_load: False


INFO:tensorflow:Beginning evaluation...


INFO:tensorflow:Beginning evaluation...


INFO:tensorflow:Restoring parameters from /tmp/recsim_q/train/checkpoints/tf_ckpt-9


INFO:tensorflow:Restoring parameters from /tmp/recsim_q/train/checkpoints/tf_ckpt-9


INFO:tensorflow:eval_file: /tmp/recsim_q/eval_5/returns_799


INFO:tensorflow:eval_file: /tmp/recsim_q/eval_5/returns_799


## Viewing Results

In [8]:
%load_ext tensorboard

### Random Agent Results

In [9]:
%tensorboard --logdir=/tmp/recsim_random/

### Q Agent Results

In [10]:
%tensorboard --logdir=/tmp/recsim_q/