In [59]:
from tf_agents.bandits.environments.movielens_py_environment import MovieLensPyEnvironment
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.bandits.metrics.tf_metrics import RegretMetric
from tf_agents.bandits.agents.lin_ucb_agent import LinearUCBAgent
from tf_agents.bandits.agents.linear_thompson_sampling_agent import LinearThompsonSamplingAgent
from tf_agents.bandits.agents.neural_epsilon_greedy_agent import NeuralEpsilonGreedyAgent
from tf_agents.bandits.environments.environment_utilities import compute_optimal_action_with_movielens_environment, compute_optimal_reward_with_movielens_environment
from tf_agents.networks.q_network import QNetwork
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

import matplotlib.pyplot as plt
import tensorflow as tf
from functools import partial

import pandas as pd
import numpy as np


In [52]:
data_path = './u.data'

In [67]:
env = TFPyEnvironment(MovieLensPyEnvironment(
    data_path,
    20,
    32,
    20,
    csv_delimiter='\t'
))


In [68]:
ucb = LinearUCBAgent(
    action_spec=env.action_spec(),
    time_step_spec=env.time_step_spec(),
    dtype=tf.float32,
    accepts_per_arm_features=False
)

ts = LinearThompsonSamplingAgent(
    action_spec=env.action_spec(),
    time_step_spec=env.time_step_spec(),
    dtype=tf.float32,
    accepts_per_arm_features=False
)

In [69]:
network = QNetwork(
    input_tensor_spec=env.time_step_spec().observation,
    action_spec=env.action_spec(),
    fc_layer_params=(50,50,50)
)

neg = NeuralEpsilonGreedyAgent(
    action_spec=env.action_spec(),
    time_step_spec=env.time_step_spec(),
    reward_network=network,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.005),
    epsilon=0.05,
    emit_policy_info="predicted_rewards_mean",
    info_fields_to_inherit_from_greedy=["predicted_rewards_mean"]
)

In [73]:
optimal_reward_fn = partial(compute_optimal_reward_with_movielens_environment, env)
optimal_action_fn = partial(compute_optimal_action_with_movielens_environment, env)
env.computer_optimal_reward = optimal_reward_fn
metric = RegretMetric(optimal_reward_fn)

In [74]:
def train(agent):
    match agent:
        case "ucb":
            agent = ucb
        case "ts":
            agent = ts
        case "neg":
            agent = neg

    replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.policy.trajectory_spec,
        batch_size=32,
        max_length=2
    )

    observers = [replay_buffer.add_batch, metric]

    driver = DynamicStepDriver(
        env=env,
        policy=agent.collect_policy,
        num_steps=2 * 32,
        observers=observers
    )

    regret_values = []

    for i in range(2):
        driver.run()
        loss = agent.train(replay_buffer.gather_all())
        replay_buffer.clear()
        regret_values.append(metric.result())

    return regret_values, agent

In [75]:
regret_ucb, agent_ucb = train('ucb')
plt.plot(regret_ucb)
plt.xlabel('Number of iterations')
plt.ylabel('Average regret')

AttributeError: 'SymbolicTensor' object has no attribute 'compute_optimal_reward'
  In call to configurable 'compute_optimal_reward_with_movielens_environment' (<function compute_optimal_reward_with_movielens_environment at 0x7fa34efd3920>)