In [None]:
# RL4RS Tutorial v1.0 2022.07.13

In [None]:
# Step 1: Installing
# !git clone https://github.com/fuxiAIlab/RL4RS
# !export PYTHONPATH=$PYTHONPATH:`pwd`/rl4rs
# !conda env create -f environment.yml
# !conda activate rl4rs

In [None]:
# Step 2: Dataset Introduction

![](https://i.bmp.ovh/imgs/2022/07/14/c5e51bb495e704ec.png)

In [1]:
# Show the first two lines of the raw dataset CSV: the '@'-separated header row
# followed by the first record.
! head -2 /project/wangkai/rl4rs_benchmark_materials/raw_data/rl4rs_dataset_a_sl.csv

timestamp@session_id@sequence_id@exposed_items@user_feedback@user_seqfeature@user_protrait@item_feature@behavior_policy_id
2992008@1@1@3,5,29,72,53,52,164,211,172@1,1,1,1,1,1,1,1,1@24,7,1,127,74,47,212,199,6,15,10,127,126,76,220,196,172,196,15,39,31,132,80,61,200,219,1,5,14,101,40,52,235,238,164,1,14,20,77,80,40,239,233,164,164,33,31,14,139,83,83,125,184,240,160,8,1,25,51,76,43,235,211,164,14,4,9,126,116,43,164,213,183,6,14,4,43,126,57,183,188,164,10,39,25,102,109,111,160,160,157,183,10,5,32,43,103,50,242,172,171,171,35,9,5,86,48,88,218,236,215@64054,50887,66367,44932,59460,20543,83978,50138,74820,58670,3.146,13.82,1.268,4.848,7.523,9.176,9.875,10.66,9.355,9.774,10.29,17.51,7.418,6.973,0,2.683,0,17.45,2.368,17.24,12.07,0,0,0,0,12.02,10.32,11.78,1.009,30.16,0,0@-0.2137,2.0783,-0.3633,-0.142,1.372,-0.6294,-1.8973,-1.7015,-0.8995,0.4999,-1.0659,-1.1485,1.4111,1.5868,0.2704,1.5112,-0.4576,1.2751,1.0225,1.1452,-1.4702,-1.388,0.6215,1.5247,1.6815,0.6815,1.3391,-0.7501,1.0955,-1.1044,0.6773,

In [3]:
# Basic statistics of the raw dataset. Each record is one line of '@'-separated
# fields (see the header row); list-valued fields are ','-separated.
with open('/project/wangkai/rl4rs_benchmark_materials/raw_data/rl4rs_dataset_a_sl.csv', 'r') as f:
    # Skip the header row. Filtering out empty lines (instead of slicing off
    # the last element) keeps the final record even when the file does not end
    # with a newline.
    data = [line for line in f.read().split('\n')[1:] if line]

if not data:
    raise ValueError('no records found after the header row')

# Walk the records once and split each line a single time, rather than
# re-splitting every line for every statistic.
session_ids = set()
policy_ids = set()
history_len_total = 0
portrait_dim_total = 0
item_feature_dim_total = 0
for line in data:
    fields = line.split('@')
    session_ids.add(fields[1])                              # session_id
    history_len_total += len(fields[5].split(','))          # user_seqfeature
    portrait_dim_total += len(fields[6].split(','))         # user_protrait
    item_feature_dim_total += len(fields[7].split(','))     # item_feature
    policy_ids.add(fields[-1])                              # behavior_policy_id

n_records = len(data)
print('number of records: ', n_records)
# Field 1 is `session_id` in the header row, so this counts distinct sessions.
print('number of users: ', len(session_ids))
# Field 3 is `exposed_items`; the page size is read from the first record only.
print('number of items per page: ', len(data[0].split('@')[3].split(',')))
print('avg. lengths of user history: ', history_len_total / n_records)
print('avg. dimensions of user_protrait: ', portrait_dim_total / n_records)
print('avg. dimensions of item_feature: ', item_feature_dim_total / n_records)
print('number of behavior_policy_id: ', len(policy_ids))

number of records:  937949
number of users:  937949
number of items per page:  9
avg. lengths of user history:  36.35307676643399
avg. dimensions of user_protrait:  42.0
avg. dimensions of item_feature:  352.0
number of behavior_policy_id:  1


In [None]:
# Step 2 (continued): Data Preprocessing
# See https://github.com/fuxiAIlab/RL4RS/blob/main/reproductions/run_split.sh

In [None]:
# Step 3: Simulation Environment (Local)

In [2]:
import os, sys
import gym
import numpy as np
from copy import deepcopy
import tensorflow as tf
from rl4rs.utils.datautil import FeatureUtil
from rl4rs.env.slate import SlateRecEnv, SlateState
from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState

In [26]:
# Settings handed to SlateRecEnv. The meaning of each entry is defined inside
# rl4rs; this cell only fixes the values used by the tutorial.
config = {
    # Numeric sizes and switches.
    "epoch": 10000,
    "maxlen": 64,
    "batch_size": 64,
    "action_size": 284,
    "class_num": 2,
    "dense_feature_num": 432,
    "category_feature_num": 21,
    "category_hash_size": 100000,
    "seq_num": 2,
    "emb_size": 128,
    "is_eval": False,
    "hidden_units": 128,
    "max_steps": 9,
    "action_emb_size": 32,
    # Locations of the benchmark materials on the local machine.
    "sample_file": '/project/wangkai/rl4rs_benchmark_materials/simulator/rl4rs_dataset_a_shuf.csv',
    "model_file": "/project/wangkai/rl4rs_benchmark_materials/simulator/finetuned/simulator_a_dien/model",
    "iteminfo_file": '/project/wangkai/rl4rs_benchmark_materials/raw_data/item_info.csv',
    # Environment selection.
    "support_rllib_mask": True,
    'env': "SlateRecEnv-v0",
}

# Build the simulator first, then wrap it in the registered gym environment
# through the `recsim` keyword.
sim = SlateRecEnv(config, state_cls=SlateState)
env = gym.make('SlateRecEnv-v0', recsim=sim)

INFO:tensorflow:Restoring parameters from /project/wangkai/rl4rs_benchmark_materials/simulator/finetuned/simulator_a_dien/model


INFO:tensorflow:Restoring parameters from /project/wangkai/rl4rs_benchmark_materials/simulator/finetuned/simulator_a_dien/model


In [28]:
# Run one episode in which the action at every step is whatever the
# environment exposes as `env.offline_action`, and print the first element of
# each batched result.
obs = env.reset()
print('batchsize of batched environment: ', len(obs))

n_steps = config["max_steps"]
for i in range(n_steps):
    action = env.offline_action
    step_result = env.step(action)
    next_obs, reward, done, info = step_result
    print(
        'step: ', i,
        ' action', action[0],
        ' reward: ', reward[0],
        ' offline reward: ', env.offline_reward[0],
        ' done: ', done[0],
    )

# Inspect the layout of a single observation taken from the last step.
last_first_obs = next_obs[0]
print('observation type: ', type(last_first_obs))
print('size of obs.action_mask: ', len(last_first_obs['action_mask']))
print('size of obs.obs: ', len(last_first_obs['obs']))

batchsize of batched environment:  64
step:  0  action 31  reward:  0  offline reward:  0  done:  0
step:  1  action 28  reward:  0  offline reward:  0  done:  0
step:  2  action 20  reward:  0  offline reward:  0  done:  0
step:  3  action 87  reward:  0  offline reward:  0  done:  0
step:  4  action 73  reward:  0  offline reward:  0  done:  0
step:  5  action 146  reward:  0  offline reward:  0  done:  0
step:  6  action 235  reward:  0  offline reward:  0  done:  0
step:  7  action 233  reward:  0  offline reward:  0  done:  0
step:  8  action 166  reward:  130.2745725877583  offline reward:  118.5  done:  1
observation type:  <class 'dict'>
size of obs.action_mask:  284
size of obs.obs:  256


In [None]:
# Env with continuous action space (without item mask)

In [49]:
# Continuous-action variant of the environment: same settings as `config`,
# with 'support_conti_env' switched on and the RLlib action mask switched off.
config_conti = deepcopy(config)
config_conti['support_conti_env'] = True
config_conti['support_rllib_mask'] = False
sim = SlateRecEnv(config_conti, state_cls=SlateState)
env = gym.make('SlateRecEnv-v0', recsim=sim)
obs = env.reset()

# One constant action vector (all ones) per environment in the batch.
# The sizes are read from the config: the original relied on a bare
# `batch_size` name that is not defined anywhere in this notebook (NameError
# on a fresh kernel) and on a literal 32 duplicating 'action_emb_size'.
action_vec = np.full(
    (config_conti["batch_size"], config_conti["action_emb_size"]), 1
)
print('size of action embedding ', np.array(env.samples.action_emb).shape)

# Step with the config the environment was actually built from.
for i in range(config_conti["max_steps"]):
    next_obs, reward, done, info = env.step(action_vec)
    # Look up the item id nearest to the action vector, for display only.
    # NOTE(review): presumably the same mapping the env applies internally — confirm.
    action = SlateState.get_nearest_neighbor(action_vec, env.samples.action_emb)
    print('step: ', i, ' action', action[0], ' reward: ', reward[0], ' done: ', done[0])

INFO:tensorflow:Restoring parameters from /project/wangkai/rl4rs_benchmark_materials/simulator/finetuned/simulator_a_dien/model


INFO:tensorflow:Restoring parameters from /project/wangkai/rl4rs_benchmark_materials/simulator/finetuned/simulator_a_dien/model


size of action embedding  (284, 32)
step:  0  action 53  reward:  0  done:  0
step:  1  action 53  reward:  0  done:  0
step:  2  action 53  reward:  0  done:  0
step:  3  action 53  reward:  0  done:  0
step:  4  action 53  reward:  0  done:  0
step:  5  action 53  reward:  0  done:  0
step:  6  action 53  reward:  0  done:  0
step:  7  action 53  reward:  0  done:  0
step:  8  action 53  reward:  47.939698447287086  done:  1


In [None]:
# Env with continuous action space (with item mask)
# See https://github.com/fuxiAIlab/RL4RS/blob/main/script/modelfree_train.py

In [None]:
# Take raw features as observation

In [55]:
from copy import deepcopy

# Raw-feature variant: start from `config`, keep the action mask, disable the
# continuous-action mode and ask for raw state as the observation.
config_rawstate = deepcopy(config)
config_rawstate.update({
    'support_rllib_mask': True,
    'support_conti_env': False,
    'rawstate_as_obs': True,
})

sim = SlateRecEnv(config_rawstate, state_cls=SlateState)
env = gym.make('SlateRecEnv-v0', recsim=sim)
obs = env.reset()

# Report the structure of the first observation in the batch.
first_obs = obs[0]
print('observation type: ', type(first_obs))
print('size of obs.action_mask: ', len(first_obs['action_mask']))
print('size of obs.category_feature: ', len(first_obs['category_feature']))
print('size of obs.dense_feature: ', len(first_obs['dense_feature']))
print('size of obs.sequence_feature: ', first_obs['sequence_feature'].shape)

INFO:tensorflow:Restoring parameters from /project/wangkai/rl4rs_benchmark_materials/simulator/finetuned/simulator_a_dien/model


INFO:tensorflow:Restoring parameters from /project/wangkai/rl4rs_benchmark_materials/simulator/finetuned/simulator_a_dien/model


observation type:  <class 'dict'>
size of obs.action_mask:  284
size of obs.category_feature:  21
size of obs.dense_feature:  432
size of obs.sequence_feature:  (2,64)


In [38]:
# Step 3: Simulation Environment (Remote)
# start http-based Env, then run RLlib library
# nohup python -u rl4rs/server/gymHttpServer.py &
# bash run_modelfree_rl.sh DQN/PPO/DDPG/PG/PG_conti/etc.

In [None]:
# Step 4: Model-free Training (RLLib)

In [1]:
import os
import numpy as np
import gym
import ray
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env
from rl4rs.utils.rllib_print import pretty_print
from rl4rs.nets.rllib.rllib_rawstate_model import getTFModelWithRawState
from rl4rs.nets.rllib.rllib_mask_model import getMaskActionsModel, \
    getMaskActionsModelWithRawState
from rl4rs.utils.rllib_vector_env import MyVectorEnvWrapper
from script.modelfree_trainer import get_rl_model
from rl4rs.policy.behavior_model import behavior_model
from script.offline_evaluation import ope_eval
from rl4rs.utils.fileutil import find_newest_files
import http.client
import sys
# Override the class-level protocol version on HTTPConnection so that every
# connection created afterwards in this process uses HTTP/1.0.
# NOTE(review): presumably required by the HTTP-based env server — confirm.
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'

# Algorithm name handed to get_rl_model further down in this cell.
algo = 'PPO'

# Start Ray for this process; the matching ray.shutdown() is at the end of the cell.
ray.init()

# Settings for the training run. The entries match the local-simulator
# settings used earlier, except for a short 'epoch' and the two extra entries
# 'remote_base' and 'trial_name'.
config = {
    # Numeric sizes and switches.
    "epoch": 2,
    "maxlen": 64,
    "batch_size": 64,
    "action_size": 284,
    "class_num": 2,
    "dense_feature_num": 432,
    "category_feature_num": 21,
    "category_hash_size": 100000,
    "seq_num": 2,
    "emb_size": 128,
    "is_eval": False,
    "hidden_units": 128,
    "max_steps": 9,
    "action_emb_size": 32,
    # Locations of the benchmark materials on the local machine.
    "sample_file": '/project/wangkai/rl4rs_benchmark_materials/simulator/rl4rs_dataset_a_shuf.csv',
    "model_file": "/project/wangkai/rl4rs_benchmark_materials/simulator/finetuned/simulator_a_dien/model",
    "iteminfo_file": '/project/wangkai/rl4rs_benchmark_materials/raw_data/item_info.csv',
    # NOTE(review): presumably the address of the running gymHttpServer — confirm.
    'remote_base': 'http://127.0.0.1:5000',
    'trial_name': 'all',
    # Environment selection.
    "support_rllib_mask": True,
    'env': "SlateRecEnv-v0",
}

print(config)

# Register the three custom models with RLlib's ModelCatalog under the names
# "mask_model", "mask_model_rawstate" and "model_rawstate".
n_actions = config['action_size']

mask_model = getMaskActionsModel(true_obs_shape=(256,), action_size=n_actions)
mask_model_rawstate = getMaskActionsModelWithRawState(config=config, action_size=n_actions)
model_rawstate = getTFModelWithRawState(config=config)

ModelCatalog.register_custom_model("mask_model", mask_model)
ModelCatalog.register_custom_model("mask_model_rawstate", mask_model_rawstate)
ModelCatalog.register_custom_model("model_rawstate", model_rawstate)


def _make_rllib_env(_):
    """Create the HTTP-backed gym env and wrap it as a vector env of `batch_size`."""
    http_env = gym.make('HttpEnv-v0', env_id=config['env'], config=config)
    return MyVectorEnvWrapper(http_env, config['batch_size'])


# Make the wrapped environment available to RLlib as 'rllibEnv-v0'.
register_env('rllibEnv-v0', _make_rllib_env)

# Algorithm-specific overrides. They are merged on top of the base settings
# below, so any key present in both takes its value from here.
cfg = {
    "num_workers": 2,
    "use_critic": True,
    "use_gae": True,
    "lambda": 1.0,
    "kl_coeff": 0.2,
    "sgd_minibatch_size": 256,
    "shuffle_sequences": True,
    "num_sgd_iter": 1,
    "lr": 0.0001,
    "vf_loss_coeff": 0.5,
    "clip_param": 0.3,
    "vf_clip_param": 500.0,
    "kl_target": 0.01,
}

# Base trainer settings shared regardless of the algorithm.
rllib_config = {
    "env": "rllibEnv-v0",
    "gamma": 1,
    "explore": True,
    "exploration_config": {
        "type": "SoftQ",
        # "temperature": 1.0,
    },
    # `config` has no 'gpu' entry in this notebook, so this resolves to 1.
    "num_gpus": 1 if config.get('gpu', True) else 0,
    # NOTE: replaced by cfg["num_workers"] when the overrides are merged in below.
    "num_workers": 0,
    "framework": 'tf',
    "rollout_fragment_length": config['max_steps'],
    "batch_mode": "complete_episodes",
    "train_batch_size": min(config["batch_size"] * config['max_steps'], 1024),
    "evaluation_interval": 1,
    "evaluation_num_episodes": 2048 * 4,
    "evaluation_config": {
        "explore": False
    },
    "log_level": "INFO",
}
# Apply the overrides: shared keys keep their position but take cfg's value,
# new keys are appended.
rllib_config.update(cfg)

print('rllib_config', rllib_config)
trainer = get_rl_model(algo, rllib_config)

# restore_file = ''
# trainer.restore(restore_file)

# Train for config["epoch"] iterations and print the metrics of each reported
# iteration.
#
# ray.shutdown() lives in `finally` so that an exception raised by
# trainer.train() cannot leave Ray running; otherwise re-running this cell
# would call ray.init() on an already-initialised Ray instance.
print_every = 1  # report every `print_every`-th iteration; the first is always reported
try:
    for i in range(config["epoch"]):
        result = trainer.train()
        # The original test was `(i + 1) % 1 == 0 or i == 0`, which is always
        # true; with print_every = 1 the output is unchanged.
        if (i + 1) % print_every == 0 or i == 0:
            print(pretty_print(result))
finally:
    ray.shutdown()


2022-07-14 17:34:45,283	ERROR services.py:1254 -- Failed to start the dashboard: Failed to start the dashboard. The last 10 lines of /tmp/ray/session_2022-07-14_17-34-22_052438_47614/logs/dashboard.log:
2022-07-14 17:34:41,447	INFO dashboard.py:92 -- Setup static dir for dashboard: /project/miniconda3/envs/rl4rs/lib/python3.6/site-packages/ray/new_dashboard/client/build
2022-07-14 17:34:41,451	INFO head.py:82 -- Connect to GCS at b'192.168.98.96:43889'
2022-07-14 17:34:41,453	INFO utils.py:202 -- Get all modules by type: DashboardHeadModule



{'epoch': 2, 'maxlen': 64, 'batch_size': 64, 'action_size': 284, 'class_num': 2, 'dense_feature_num': 432, 'category_feature_num': 21, 'category_hash_size': 100000, 'seq_num': 2, 'emb_size': 128, 'is_eval': False, 'hidden_units': 128, 'max_steps': 9, 'action_emb_size': 32, 'sample_file': '/project/wangkai/rl4rs_benchmark_materials/simulator/rl4rs_dataset_a_shuf.csv', 'model_file': '/project/wangkai/rl4rs_benchmark_materials/simulator/finetuned/simulator_a_dien/model', 'iteminfo_file': '/project/wangkai/rl4rs_benchmark_materials/raw_data/item_info.csv', 'remote_base': 'http://127.0.0.1:5000', 'trial_name': 'all', 'support_rllib_mask': True, 'env': 'SlateRecEnv-v0'}
rllib_config {'env': 'rllibEnv-v0', 'gamma': 1, 'explore': True, 'exploration_config': {'type': 'SoftQ'}, 'num_gpus': 1, 'num_workers': 2, 'framework': 'tf', 'rollout_fragment_length': 9, 'batch_mode': 'complete_episodes', 'train_batch_size': 576, 'evaluation_interval': 1, 'evaluation_num_episodes': 8192, 'evaluation_config':

2022-07-14 17:34:48,364	INFO trainer.py:706 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=47823)[0m Instructions for updating:
[2m[36m(pid=47823)[0m If using Keras pass *_constraint arguments to layers.
[2m[36m(pid=47832)[0m Instructions for updating:
[2m[36m(pid=47832)[0m If using Keras pass *_constraint arguments to layers.
[2m[36m(pid=47823)[0m 2022-07-14 17:35:04,153	INFO dynamic_tf_policy.py:472 -- Testing `compute_actions` w/ dummy batch.
[2m[36m(pid=47832)[0m 2022-07-14 17:35:04,217	INFO dynamic_tf_policy.py:472 -- Testing `compute_actions` w/ dummy batch.
[2m[36m(pid=47832)[0m 2022-07-14 17:35:04,217	INFO tf_run_builder.py:87 -- Executing TF run without tracing. To dump TF timeline traces to disk, set the TF_TIMELINE_DIR environment variable.
[2m[36m(pid=47823)[0m 2022-07-14 17:35:04,221	INFO dynamic_tf_policy.py:481 -- Adding extra-action-fetch `action_prob` to view-reqs.
[2m[36m(pid=47823)[0m 2022-07-

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


2022-07-14 17:35:06,254	INFO dynamic_tf_policy.py:472 -- Testing `compute_actions` w/ dummy batch.
2022-07-14 17:35:06,257	INFO tf_run_builder.py:87 -- Executing TF run without tracing. To dump TF timeline traces to disk, set the TF_TIMELINE_DIR environment variable.
2022-07-14 17:35:06,467	INFO dynamic_tf_policy.py:481 -- Adding extra-action-fetch `action_prob` to view-reqs.
2022-07-14 17:35:06,470	INFO dynamic_tf_policy.py:481 -- Adding extra-action-fetch `action_logp` to view-reqs.
2022-07-14 17:35:06,472	INFO dynamic_tf_policy.py:481 -- Adding extra-action-fetch `action_dist_inputs` to view-reqs.
2022-07-14 17:35:06,474	INFO dynamic_tf_policy.py:481 -- Adding extra-action-fetch `vf_preds` to view-reqs.
2022-07-14 17:35:06,475	INFO dynamic_tf_policy.py:488 -- Testing `postprocess_trajectory` w/ dummy batch.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


2022-07-14 17:35:07,609	INFO rollout_worker.py:1344 -- Built policy map: {'default_policy': <ray.rllib.policy.tf_policy_template.PPOTFPolicy object at 0x7f521435d748>}
2022-07-14 17:35:07,610	INFO rollout_worker.py:1345 -- Built preprocessor map: {'default_policy': <ray.rllib.models.preprocessors.DictFlatteningPreprocessor object at 0x7f521435d320>}
2022-07-14 17:35:07,611	INFO rollout_worker.py:602 -- Built filter map: {'default_policy': <ray.rllib.utils.filter.NoFilter object at 0x7f51f44d3e80>}
2022-07-14 17:35:13,350	INFO dynamic_tf_policy.py:472 -- Testing `compute_actions` w/ dummy batch.
2022-07-14 17:35:13,405	INFO dynamic_tf_policy.py:481 -- Adding extra-action-fetch `action_prob` to view-reqs.
2022-07-14 17:35:13,407	INFO dynamic_tf_policy.py:481 -- Adding extra-action-fetch `action_logp` to view-reqs.
2022-07-14 17:35:13,409	INFO dynamic_tf_policy.py:481 -- Adding extra-action-fetch `action_dist_inputs` to view-reqs.
2022-07-14 17:35:13,410	INFO dynamic_tf_policy.py:481 -- A

trainer_default_config {'num_workers': 2, 'num_envs_per_worker': 1, 'create_env_on_driver': False, 'rollout_fragment_length': 200, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action': False, 'lstm_use_prev_reward': False, '_time_major': False, 'use_attention': False, 'attention_num_transformer_units': 1, 'attention_dim': 64, 'attention_num_heads': 1, 'attention_head_dim': 32, 'attention_memory_inference': 50, 'attention_memory_training': 50, 'attention_position_wise_mlp_dim': 32, 'attention_init_gru_gate_bias': 2.0, 'attention_use_n_prev_actions': 0, 'attention_use_n_prev_rewards': 0, 

[2m[36m(pid=47832)[0m 2022-07-14 17:35:14,667	INFO sampler.py:593 -- Raw obs from env: { 0: { 'agent0': { 'action_mask': np.ndarray((284,), dtype=int64, min=0.0, max=1.0, mean=0.137),
[2m[36m(pid=47832)[0m                    'obs': np.ndarray((256,), dtype=float64, min=-1.0, max=14.742, mean=0.205)}},
[2m[36m(pid=47832)[0m   1: { 'agent0': { 'action_mask': np.ndarray((284,), dtype=int64, min=0.0, max=1.0, mean=0.137),
[2m[36m(pid=47832)[0m                    'obs': np.ndarray((256,), dtype=float64, min=-1.0, max=6.099, mean=-0.391)}},
[2m[36m(pid=47832)[0m   2: { 'agent0': { 'action_mask': np.ndarray((284,), dtype=int64, min=0.0, max=1.0, mean=0.137),
[2m[36m(pid=47832)[0m                    'obs': np.ndarray((256,), dtype=float64, min=-1.0, max=10.2, mean=0.211)}},
[2m[36m(pid=47832)[0m   3: { 'agent0': { 'action_mask': np.ndarray((284,), dtype=int64, min=0.0, max=1.0, mean=0.137),
[2m[36m(pid=47832)[0m                    'obs': np.ndarray((256,), dtype=float64

[2m[36m(pid=47832)[0m 2022-07-14 17:35:17,243	INFO simple_list_collector.py:661 -- Trajectory fragment after postprocess_trajectory():
[2m[36m(pid=47832)[0m 
[2m[36m(pid=47832)[0m { 'agent0': { 'action_dist_inputs': np.ndarray((9, 284), dtype=float32, min=-0.017, max=0.021, mean=0.0),
[2m[36m(pid=47832)[0m               'action_logp': np.ndarray((9,), dtype=float32, min=-5.656, max=-5.641, mean=-5.648),
[2m[36m(pid=47832)[0m               'actions': np.ndarray((9,), dtype=int64, min=51.0, max=263.0, mean=165.667),
[2m[36m(pid=47832)[0m               'advantages': np.ndarray((9,), dtype=float32, min=-0.002, max=0.006, mean=0.001),
[2m[36m(pid=47832)[0m               'agent_index': np.ndarray((9,), dtype=int64, min=0.0, max=0.0, mean=0.0),
[2m[36m(pid=47832)[0m               'dones': np.ndarray((9,), dtype=bool, min=0.0, max=1.0, mean=0.111),
[2m[36m(pid=47832)[0m               'eps_id': np.ndarray((9,), dtype=int64, min=219111070.0, max=219111070.0, mean=21911