In [None]:
import os

# Move to the project root, two levels above the directory the kernel was
# started in. The launch directory is remembered on first execution so that
# re-running this cell does not climb two more levels every time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, "..", ".."))

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from typing import Optional

from tf_agents.bandits.agents.lin_ucb_agent import LinearUCBAgent
from tf_agents.bandits.environments.bandit_tf_environment import BanditTFEnvironment
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.policies.utils import get_num_actions_from_tensor_spec
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.specs import BoundedTensorSpec
from tf_agents.specs.tensor_spec import TensorSpec
import tf_agents.trajectories.time_step as ts
from tf_agents.typing import types
from tf_agents.utils import common, eager_utils

from src.practise.utils import predict
from src.practise.solution_metrics import RMSEMetric

## Data prep

In [None]:
BATCHSIZE = 1000
DS_FOLDER = "data/4_dataset/"

# Both input tables are read from DS_FOLDER so that changing the folder moves
# every input at once.
context = pd.read_csv(f"{DS_FOLDER}user_context.csv")
ratings = pd.read_csv(f"{DS_FOLDER}reward_per_book.csv")
# Every ratings column after the first one is treated as a playable action
# (the first column is presumably `user_id` - confirm against the CSV).
actions = ratings.columns[1:]

# One row per context row, with that user's per-action ratings attached.
train_df = context.merge(ratings, how="left", on="user_id")

# Unbatched (features, per-action rewards) pairs. Features are the merged
# columns that do not come from `ratings`; rewards are cast to int32.
train_ds = tf.data.Dataset.from_tensor_slices(
    (
        train_df.drop(ratings.columns, axis=1),
        train_df[actions].astype("int32"),
    )
)

# Number of complete batches contained in the training frame.
NUM_EPOCHS = train_df.shape[0] // BATCHSIZE

In [None]:
# Scalar int32 action: a position in `actions`, bounded to the valid indices.
highest_action_index = len(actions) - 1
action_spec = BoundedTensorSpec(
    shape=(),
    dtype=tf.int32,
    minimum=0,
    maximum=highest_action_index,
    name="action",
)

## Elements of TF-Agents

In [None]:
# SimpleEnvironment - implementation of the BanditTFEnvironment protocol that
# replays a (features, per-action rewards) dataset batch by batch.


class SimpleEnvironment(BanditTFEnvironment):
    """SimpleEnvironment for book recommendations.

    Every step serves one batch of feature rows as the observation and rewards
    each played action with the value stored for that action in the label row
    that belongs to the observation the action was chosen for.

    Attributes:
        labels: tf.Variable(shape=(batch_size, num_actions), dtype=tf.int32)
            per-action rewards of the most recently served observation batch
        previous_labels: tf.Variable(shape=(batch_size, num_actions), dtype=tf.int32)
            per-action rewards of the batch served before the current one
        num_actions: tf.constant(dtype=tf.int32)
            number of playable actions
    """

    def __init__(
        self,
        dataset: tf.data.Dataset,
        batch_size: int,
        action_spec: tf.TensorSpec,
    ):
        """
        Args:
            dataset: tf.data.Dataset
                unbatched dataset with two elements - (features, labels),
                where labels holds one reward per action
            batch_size: int
                dataset batch size to be used
            action_spec: discrete! BoundedTensorSpec
                discrete bounded TensorSpec specifying possible actions,
                it is used to extract number of actions and dtype of actions

        Raises:
            ValueError: if the statically known label width of `dataset` does
                not match the number of actions described by `action_spec`.
        """
        num_actions = int(get_num_actions_from_tensor_spec(action_spec))
        feature_spec, label_spec = dataset.element_spec

        # Fail early when the dataset cannot provide a reward for every action.
        label_width = label_spec.shape[-1] if label_spec.shape.rank else None
        if label_width is not None and label_width != num_actions:
            raise ValueError(
                f"dataset provides {label_width} rewards per row, "
                f"but action_spec describes {num_actions} actions"
            )

        # The observation mirrors a single (unbatched) feature row; the base
        # class adds the batch dimension on its own.
        observation_spec = TensorSpec(
            shape=feature_spec.shape, dtype=feature_spec.dtype, name="observation"
        )

        super().__init__(
            time_step_spec=ts.time_step_spec(observation_spec),
            action_spec=action_spec,
            batch_size=batch_size,
            name="SimpleEnvironment",
        )

        self.num_actions = tf.constant(num_actions, dtype=tf.int32)
        # Smallest valid action value; subtracted to turn an action into a
        # column position of the label matrix.
        self._first_action = int(action_spec.minimum)

        label_shape = [batch_size, num_actions]
        self.labels = tf.Variable(
            tf.zeros(label_shape, dtype=tf.int32), trainable=False, name="labels"
        )
        self.previous_labels = tf.Variable(
            tf.zeros(label_shape, dtype=tf.int32),
            trainable=False,
            name="previous_labels",
        )

        # repeat() keeps the iterator alive for any number of steps and
        # drop_remainder=True guarantees every batch has exactly batch_size rows.
        self._batches = iter(dataset.repeat().batch(batch_size, drop_remainder=True))

    def _observe(self) -> types.NestedArray:
        """Collects another batch of features and labels.
        Updates current and previous labels.

        Returns:
            tf.Tensor(shape=(batch_size, num_features), dtype=self.time_step_spec().observation.dtype)
                context
        """
        features, labels = self._batches.get_next()
        self.previous_labels.assign(self.labels)
        self.labels.assign(tf.cast(labels, self.labels.dtype))
        return features

    def _apply_action(self, action: types.NestedArray) -> types.Float:
        """Calculates rewards for current batch of actions.

        The rewards are read from `labels`, i.e. from the label rows fetched
        together with the most recent observation batch.

        Args:
            action: tf.Tensor holding one action per batch row,
                dtype=self.action_spec().dtype

        Returns:
            tf.Tensor(shape=(batch_size,), dtype=self.time_step_spec().reward.dtype)
            Rewards for each played action.
        """
        # Flatten to one action per row and shift it to a zero-based column.
        played = tf.reshape(action, [self.batch_size]) - self._first_action
        # batch_dims=1 picks, for every row, the reward stored in its own column.
        rewards = tf.gather(self.labels, played, batch_dims=1)
        return tf.cast(rewards, self.time_step_spec().reward.dtype)

In [None]:
# Environment that serves the training set in batches of BATCHSIZE.
env = SimpleEnvironment(
    dataset=train_ds,
    batch_size=BATCHSIZE,
    action_spec=action_spec,
)

# Policy-info fields requested from the agent's policies.
policy_info_fields = (
    "predicted_rewards_mean",
    "predicted_rewards_optimistic",
)

# LinUCB agent built on the specs published by the environment.
agent = LinearUCBAgent(
    time_step_spec=env.time_step_spec(),
    action_spec=env.action_spec(),
    alpha=1,
    tikhonov_weight=1,
    use_eigendecomp=True,
    emit_policy_info=policy_info_fields,
)

In [None]:
# Buffer for the trajectories produced by the driver.
replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.policy.trajectory_spec, batch_size=BATCHSIZE, max_length=5
)

# Metric object that is registered as a driver observer below.
rmse = RMSEMetric(env.action_spec())

# Observers the driver is given: the buffer's add_batch and the RMSE metric.
replay_observer = []
replay_observer.append(replay_buffer.add_batch)
replay_observer.append(rmse)

# Driver that runs the agent's collect policy against the environment.
driver = DynamicStepDriver(
    env=env, policy=agent.collect_policy, observers=replay_observer
)

## Training loop

In [None]:
# RMSE metric result recorded after every driver run.
rmse_values = []
for epoch_number in range(1, NUM_EPOCHS + 1):
    print(f"\rEpoch: {epoch_number}/{NUM_EPOCHS}", end="")

    # Start from an empty buffer, let the driver interact with the
    # environment, then record the metric.
    replay_buffer.clear()
    driver.run()
    rmse_values.append(rmse.result().numpy())

    # Read the buffered experience back once and train the agent on it.
    experience = replay_buffer.as_dataset(
        sample_batch_size=BATCHSIZE,
        num_steps=1,
        single_deterministic_pass=True,
    )
    for sample in experience:
        # Only the first element of each dataset item is passed to the agent.
        agent.train(sample[0])

In [None]:
# One panel per action. The grid is derived from the number of actions rather
# than fixed at 2x3, so no action is dropped and no panel indexes past the
# recorded values. Assumes every entry of `rmse_values` holds one value per
# action, as the per-action titles imply - TODO confirm against RMSEMetric.
n_panels = len(actions)
n_cols = 3
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 5 * n_rows), squeeze=False)
fig.suptitle("RMSE value by action")
for i, ax in enumerate(axes.flatten()):
    if i >= n_panels:
        # Spare cell of the grid: nothing to draw.
        ax.set_visible(False)
        continue
    vals = [r[i] for r in rmse_values]
    ax.plot(vals)
    # Red reference line at the mean RMSE over all epochs.
    ax.hlines(tf.math.reduce_mean(vals), xmin=0, xmax=NUM_EPOCHS, color="red")
    ax.set_title(actions[i])
    ax.set_xlabel("epoch")
    ax.set_ylabel("RMSE")

## Predict

In [None]:
# Score the training set with the batch size the environment was built with
# (previously a hard-coded 1000 that only matched BATCHSIZE by coincidence).
# The labels element of each batch is not needed for prediction.
scored_ds = train_ds.batch(BATCHSIZE).map(
    lambda features, _labels: predict(features, agent, env)
)
preds = list(scored_ds)

# Chosen action per row, concatenated over all batches.
acts = tf.concat([step.action for step in preds], axis=0).numpy()

scored_df = train_df.assign(
    pred=acts,
)

# Relabel the action columns 0..n-1 so that idxmax yields the position of the
# highest-rated action, comparable with `pred`. The copy keeps the relabelling
# from touching a frame derived from `scored_df`.
score_rat = scored_df[actions.values].copy()
score_rat.columns = range(score_rat.shape[1])

scored_df = scored_df.assign(rating=score_rat.idxmax(axis=1).values)

In [None]:
# accuracy: share of rows where the predicted action equals the rated one
scored_df["rating"].eq(scored_df["pred"]).mean()

In [None]:
# confusion matrix
confmat = (
    scored_df.groupby(["rating", "pred"], as_index=False)
    .user_id.count()
    .pivot_table(index="rating", columns="pred", values="user_id", fill_value=0)
)
confmat

In [None]:
# close rates: per predicted action, the share of its predictions that hit the
# rated action (diagonal cell divided by the column total).
# The pivot only has rows/columns for values that actually occur, so it is
# first aligned to a square matrix over every label seen on either axis;
# otherwise the positional diagonal would pair unrelated labels or fail to
# broadcast against the column sums.
all_labels = confmat.index.union(confmat.columns)
square_confmat = confmat.reindex(
    index=all_labels, columns=all_labels, fill_value=0
).rename_axis(index="rating", columns="pred")
square_confmat.values.diagonal() / square_confmat.sum()