In [None]:
import os

# Move the working directory two levels up so the relative paths used below
# (e.g. "data/4_dataset/") resolve from there.
# NOTE(review): this is relative to the current directory, so re-running the
# cell without restarting the kernel moves up two more levels.
os.chdir(os.path.join(os.pardir, os.pardir))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from tf_agents.bandits.agents.lin_ucb_agent import LinearUCBAgent
from tf_agents.bandits.agents.linear_thompson_sampling_agent import (
    LinearThompsonSamplingAgent,
)
from tf_agents.bandits.environments.classification_environment import (
    ClassificationBanditEnvironment,
)
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.bandits.metrics.tf_metrics import RegretMetric

from src.practise.utils import prep_reward_binary, predict

## Data prep

In [None]:
# Number of rows handled per environment step; also used as the replay-buffer
# batch size, the training sample batch size and the divisor for NUM_EPOCHS.
BATCHSIZE = 1_000

# Folder holding the input CSV files. The trailing slash is required because
# file names are appended to it directly.
DS_FOLDER = "data/4_dataset/"

In [None]:
# Load the per-user context features and the ratings from the dataset folder.
context = pd.read_csv(f"{DS_FOLDER}user_context.csv")
ratings = pd.read_csv(f"{DS_FOLDER}reward_simple.csv")

# Left join keeps every context row; rows without a matching rating get 0.
# The fill is written as a column assignment instead of an in-place call on
# `train_df.rating`: the in-place form operates on an intermediate object and
# is not guaranteed to update the frame (it warns on recent pandas and has no
# effect under Copy-on-Write).
train_df = context.merge(ratings, how="left", on="user_id")
train_df["rating"] = train_df["rating"].fillna(0)

# Dataset of (features, label) pairs: every column except the user id and the
# rating is a feature, the rating cast to int32 is the label.
train_ds = tf.data.Dataset.from_tensor_slices(
    (
        train_df.drop(columns=["user_id", "rating"]),
        train_df["rating"].astype("int32"),
    )
)

# Number of full batches in the data; a trailing partial batch is not counted.
NUM_EPOCHS = train_df.shape[0] // BATCHSIZE

In [None]:
# Reward specification handed to the environment below: the two "true"
# outcomes are given 1 and the two "false" outcomes are given 0.
rew_dist = prep_reward_binary(
    # "true" outcomes
    rew_true_pos=1,
    rew_true_neg=1,
    # "false" outcomes
    rew_false_pos=0,
    rew_false_neg=0,
)

## Elements of TF-Agents

In [None]:
# TO BE FILLED!
# Exercise cell - prepare the two TF-Agents building blocks used below:
#   - a ClassificationBanditEnvironment over the training dataset
#   - a linear bandit agent (LinearUCBAgent or LinearThompsonSamplingAgent)
env = ClassificationBanditEnvironment(
    dataset=train_ds,
    reward_distribution=rew_dist,
    batch_size=BATCHSIZE,
)

# The agent is built from the environment's own specs. Policy info is emitted
# because the Predict section reads `predicted_rewards_mean` from it.
agent = LinearUCBAgent(
    time_step_spec=env.time_step_spec(),
    action_spec=env.action_spec(),
    alpha=1,
    tikhonov_weight=1,
    use_eigendecomp=True,
    emit_policy_info=("predicted_rewards_mean", "predicted_rewards_optimistic"),
)

In [None]:
# Test your environment
# ts = env.reset()
# for i in range(5):
#     act = agent.policy.action(ts)
#     env.step(act.action)

In [None]:
# TO BE FILLED!
# Exercise cell - prepare:
#   - RegretMetric
def optimal_reward(x):
    """Return the environment's optimal reward for the current step.

    Args:
        x: Ignored. It is accepted only because the metric calls this function
            with one positional argument (presumably the observation - TODO
            confirm against the RegretMetric documentation).

    Returns:
        Whatever `env.compute_optimal_reward()` returns.
    """
    return env.compute_optimal_reward()


# Metric observer; the training loop reads it via `regret.result()`.
regret = RegretMetric(optimal_reward, name="regret")

In [None]:
# TO BE FILLED!
# Exercise cell - set up experience collection:
#   - TFUniformReplayBuffer: stores what the driver collects
#   - DynamicStepDriver: runs the collect policy in the environment and
#     passes the results to every observer listed below

# The training loop clears this buffer before every driver run.
replay_buffer = TFUniformReplayBuffer(
    max_length=5,
    batch_size=BATCHSIZE,
    data_spec=agent.policy.trajectory_spec,
)

# Everything the driver collects goes to the buffer and to the regret metric.
replay_observer = [replay_buffer.add_batch, regret]

driver = DynamicStepDriver(
    observers=replay_observer,
    policy=agent.collect_policy,
    env=env,
)

## Training loop

In [None]:
# One regret reading is stored per iteration, taken right after collection.
regret_values = []
for i in range(NUM_EPOCHS):
    # "\r" rewrites the same output line instead of printing a new one
    print(f"\rEpoch: {i+1}/{NUM_EPOCHS}", end="")

    # Collect: start from an empty buffer, let the driver run once, then read
    # the regret metric that was updated as an observer.
    replay_buffer.clear()
    _ = driver.run()
    regret_values.append(regret.result())

    # Train: walk the buffer contents exactly once and train on each element.
    experience = replay_buffer.as_dataset(
        num_steps=1,
        sample_batch_size=BATCHSIZE,
        single_deterministic_pass=True,
    )
    for sample in experience:
        # element 0 is what the agent trains on (presumably the trajectory,
        # with buffer info alongside - TODO confirm)
        _ = agent.train(sample[0])

In [None]:
# inspired by https://www.tensorflow.org/agents/tutorials/per_arm_bandits_tutorial#defining_the_regret_metric
# Regret per iteration, with the mean over all iterations as an orange line.
fig, ax = plt.subplots()
ax.plot(regret_values)
ax.hlines(tf.math.reduce_mean(regret_values), xmin=0, xmax=NUM_EPOCHS, color="orange")
ax.set_title("Regret of Linear Agent")
ax.set_xlabel("Number of Iterations")
_ = ax.set_ylabel("Average Regret")

## Predict

In [None]:
# Score the training data in batches. The batch size comes from the shared
# BATCHSIZE constant (instead of a separate literal) so it cannot drift from
# the batch size the environment and agent were built with.
scored_ds = train_ds.batch(BATCHSIZE).map(lambda x, y: predict(x, agent, env))
preds = list(scored_ds)

# Stitch the per-batch outputs back into flat arrays.
acts = tf.concat([p.action for p in preds], axis=0).numpy()
# Column 1 of the predicted mean rewards (presumably the estimate for
# action 1 - TODO confirm against the action spec).
rews = tf.concat([p.info.predicted_rewards_mean[:, 1] for p in preds], axis=0).numpy()

# Attach the chosen action and the predicted reward to the original rows.
scored_df = train_df.assign(pred=acts, rew=rews)

In [None]:
# accuracy: share of rows where the chosen action equals the rating
scored_df["rating"].eq(scored_df["pred"]).mean()

In [None]:
# confusion matrix: row counts with ratings as rows and predictions as columns
(
    scored_df.groupby(["rating", "pred"])
    .user_id.count()
    .unstack("pred")
)

In [None]:
# Summary statistics of the rating, the chosen action and the predicted reward.
scored_df.loc[:, ["rating", "pred", "rew"]].describe()

In [None]:
def minmax(x):
    """Rescale `x` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        x: Array-like object providing `.min()` and `.max()` and supporting
            element-wise arithmetic (e.g. a pandas Series or NumPy array).

    Returns:
        `(x - x.min()) / (x.max() - x.min())`. If all values are equal the
        range is zero, and zeros are returned instead of dividing by zero.
    """
    shifted = x - x.min()
    span = x.max() - x.min()
    if span == 0:
        # constant input: every value sits at the minimum
        return shifted
    return shifted / span

# Order the rows by predicted reward and give them a running position `ix`.
plot_df = scored_df.sort_values("rew").assign(ix=range(scored_df.shape[0]))

# Vertical jitter spreads the ratings so overlapping points stay visible.
# A fixed seed makes the figure identical on every run.
jitter = 0.1 * np.random.default_rng(0).standard_normal(plot_df.shape[0])

plt.scatter(plot_df.ix, plot_df.rating + jitter, label="rating (jittered)")
# The predicted reward is min-max scaled before being drawn on the same axes.
plt.plot(plot_df.ix, minmax(plot_df.rew), color="red", label="rew (min-max scaled)")
plt.xlabel("Rows ordered by rew")
plt.ylabel("rating / scaled rew")
plt.legend()
plt.show()