In [None]:
import os

# Move the working directory two levels up (presumably the project root, since
# later cells import `src.*` and read `data/...` via relative paths).
# The guard makes the cell idempotent: re-running it in the same kernel must not
# climb two further levels each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
    os.chdir("../..")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from tf_agents.bandits.agents.lin_ucb_agent import LinearUCBAgent
from tf_agents.bandits.environments.classification_environment import (
    ClassificationBanditEnvironment,
)
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer

from tf_agents.utils.common import create_variable
from tf_agents.metrics import tf_metric
from tf_agents.policies.utils import get_num_actions_from_tensor_spec
from tf_agents.specs import BoundedTensorSpec
from tf_agents.trajectories.trajectory import Trajectory

from src.practise.utils import prep_reward_binary, predict

## Data prep

In [None]:
BATCHSIZE = 1000
DS_FOLDER = "data/4_dataset/"

# Both inputs live in the same dataset folder; build both paths from DS_FOLDER.
context = pd.read_csv(f"{DS_FOLDER}user_context.csv")
ratings = pd.read_csv(f"{DS_FOLDER}reward_simple.csv")

# Left join keeps every row of `context`; rows without a match in `ratings`
# get NaN in `rating`, which is replaced by 0.
train_df = context.merge(ratings, how="left", on="user_id")
# Explicit column assignment instead of `train_df.rating.fillna(..., inplace=True)`:
# the chained in-place form is deprecated and may operate on a temporary
# under pandas Copy-on-Write, leaving the NaNs in place.
train_df["rating"] = train_df["rating"].fillna(0)

# Dataset of (features, label) pairs: every column except the id and the
# label is a feature, and `rating` cast to int32 is the label.
train_ds = tf.data.Dataset.from_tensor_slices(
    (
        train_df.drop(["user_id", "rating"], axis=1),
        train_df.rating.astype("int32"),
    )
)

# Number of full batches contained in the training frame.
NUM_EPOCHS = train_df.shape[0] // BATCHSIZE

In [None]:
# Reward settings handed to the bandit environment below: both "true"
# outcomes are set to 1 and both "false" outcomes to 0.
# NOTE(review): the exact structure returned by `prep_reward_binary` is
# defined in src.practise.utils - confirm there.
rew_dist = prep_reward_binary(
    rew_true_pos=1,
    rew_true_neg=1,
    rew_false_pos=0,
    rew_false_neg=0,
)

## Elements of TF-Agents

In [None]:
# Environment built from the training dataset, the reward settings and the
# batch size.
env = ClassificationBanditEnvironment(train_ds, rew_dist, BATCHSIZE)

# LinUCB agent wired to the environment's specs. `emit_policy_info` requests
# the predicted reward fields that the prediction cell reads later on.
agent = LinearUCBAgent(
    time_step_spec=env.time_step_spec(),
    action_spec=env.action_spec(),
    alpha=1,
    tikhonov_weight=1,
    use_eigendecomp=True,
    emit_policy_info=("predicted_rewards_mean", "predicted_rewards_optimistic"),
)

In [None]:
# Custom metric implemented in this cell:
#   - ActionCountMetric


class ActionCountMetric(tf_metric.TFStepMetric):
    """Action count metric implementation for TF agents

    Attributes:
        action_count: int
            how many possible actions there are
        cnt: tf.Variable(shape=(action_count,), dtype=tf.int32)
            how many times each action occurred in the trajectory most
            recently passed to `call`
    """

    def __init__(
        self,
        action_spec: BoundedTensorSpec,
        name: str = "ActionCountMetric",
    ):
        """
        Args:
            action_spec: discrete! BoundedTensorSpec
                discrete bounded TensorSpec specifying possible actions,
                it is used to extract the number of actions

            name: optional, name of the metric
        """
        # Zero-argument super() runs TFStepMetric.__init__ itself. The previous
        # `super(tf_metric.TFStepMetric, self)` started the lookup *after*
        # TFStepMetric, so the base-class initialisation was skipped.
        super().__init__(name=name)
        self.action_count = get_num_actions_from_tensor_spec(action_spec)

        self.cnt = create_variable(
            initial_value=0,
            dtype=tf.int32,
            # one-element tuple: one counter slot per action
            shape=(self.action_count,),
            name="action count",
        )

    def reset(self):
        """
        Reset the metric.

        Reset sets every action counter back to zero.
        """
        self.cnt.assign(tf.zeros_like(self.cnt))

    @tf.function
    def call(self, t: Trajectory) -> Trajectory:
        """Stores the action counts of the provided trajectory.

        Each call expects a `trajectory` based on a new batch of data.
        The counters are overwritten (`assign`), not added to, so `result()`
        always describes the most recent trajectory only.

        Args:
            t: tf_agents.trajectories.trajectory.Trajectory
                trajectory to use for metric calculation
        Returns:
            The same trajectory that was passed as an argument
        """
        # minlength pads the histogram so actions that were never taken still
        # get a (zero) slot.
        cnts = tf.math.bincount(t.action, minlength=self.action_count)
        self.cnt.assign(cnts)
        return t

    def result(self) -> tf.Variable:
        """Returns the current action counts.

        Returns:
            the `cnt` variable (shape=(action_count,), dtype=tf.int32) holding
            the counts stored by the latest `call`
        """
        return self.cnt

In [None]:
# Metric that records which actions the collect policy takes.
action_count = ActionCountMetric(env.action_spec())

# Buffer that stores the trajectories produced by the driver.
replay_buffer = TFUniformReplayBuffer(
    max_length=5,
    batch_size=BATCHSIZE,
    data_spec=agent.policy.trajectory_spec,
)

# Every trajectory is both written to the buffer and passed to the metric.
replay_observer = [replay_buffer.add_batch, action_count]

driver = DynamicStepDriver(
    observers=replay_observer,
    policy=agent.collect_policy,
    env=env,
)

## Training loop

In [None]:
# Per-iteration action counts, one array per driver run.
action_values = []
for i in range(NUM_EPOCHS):
    print(f"\rEpoch: {i+1}/{NUM_EPOCHS}", end="")

    # Interaction phase: start from an empty buffer, let the driver run and
    # record the action counts reported by the metric.
    replay_buffer.clear()
    driver.run()
    action_values.append(action_count.result().numpy())

    # Training phase: read the buffer contents once and train on them.
    experience = replay_buffer.as_dataset(
        num_steps=1,
        sample_batch_size=BATCHSIZE,
        single_deterministic_pass=True,
    )
    # Each dataset element is a pair; only its first item is used for training.
    for trajectories, _buffer_info in experience:
        agent.train(trajectories)

In [None]:
# Count of action index 1 in every recorded iteration.
positive_count = [counts[1] for counts in action_values]
plt.plot(positive_count)
# np.mean returns a float average. tf.math.reduce_mean keeps the integer dtype
# of the counts and therefore truncates the mean to a whole number.
plt.hlines(np.mean(positive_count), xmin=0, xmax=NUM_EPOCHS, color="orange")
plt.title("Number of positive actions played by the Agent")
plt.xlabel("Number of Iterations")
_ = plt.ylabel("Average number of positive actions")

## Predict

In [None]:
# Score the training data batch by batch. The batch size is taken from
# BATCHSIZE (the value the environment was created with) instead of a second
# hard-coded 1000, so the two cannot drift apart.
# NOTE(review): `predict` comes from src.practise.utils; the code below relies
# on its result exposing `.action` and `.info.predicted_rewards_mean`.
scored_ds = train_ds.batch(BATCHSIZE).map(lambda x, y: predict(x, agent, env))
preds = list(scored_ds)

# Chosen action per row, and the predicted mean reward of action index 1.
acts = tf.concat([i.action for i in preds], axis=0).numpy()
rews = tf.concat([i.info.predicted_rewards_mean[:, 1] for i in preds], axis=0).numpy()

scored_df = train_df.assign(pred=acts, rew=rews)

In [None]:
# accuracy: share of rows whose predicted action equals the rating
scored_df["rating"].eq(scored_df["pred"]).mean()

In [None]:
# confusion matrix: rows are ratings, columns are predicted actions,
# cells hold the number of rows for that combination
scored_df.groupby(["rating", "pred"])["user_id"].count().unstack("pred")

In [None]:
# Summary statistics of the label, the predicted action and the predicted reward.
scored_df.loc[:, ["rating", "pred", "rew"]].describe()

In [None]:
def minmax(x):
    """Rescale `x` linearly so that its minimum maps to 0 and its maximum to 1.

    Args:
        x: object supporting `.min()`, `.max()` and element-wise arithmetic
            (used below with a pandas Series)
    Returns:
        `(x - x.min()) / (x.max() - x.min())`
    """
    lowest = x.min()
    span = x.max() - lowest
    return (x - lowest) / span

# Rows ordered by predicted reward; `ix` is the rank used as the x-axis.
plot_df = scored_df.sort_values("rew").assign(ix=range(scored_df.shape[0]))

# Vertical jitter separates overlapping rating points. A seeded generator keeps
# the figure identical across re-runs (the unseeded np.random.randn did not).
jitter = 0.1 * np.random.default_rng(0).standard_normal(plot_df.shape[0])

plt.scatter(plot_df.ix, plot_df.rating + jitter, label="rating (jittered)")
plt.plot(
    plot_df.ix,
    minmax(plot_df.rew),
    color="red",
    label="predicted reward (min-max scaled)",
)
plt.xlabel("Rows ranked by predicted reward")
plt.legend()
plt.show()