In [1]:
import numpy as np

from pymab.policies.ucb import UCBPolicy
from pymab.policies.bayesian_ucb import BayesianUCBPolicy
from pymab.game import Game


Please upgrade to the latest version of PyMC https://www.pymc.io/projects/docs/en/stable/installation.html

Also notice that PyMC3 has been renamed to PyMC.


In [2]:
# Define Q-values, which are the true values of the bandits
Q_values = [-0.1, 0.8, 0.3, 0.4, -0.9, 0.2, 0.25, 0.6, 0.5, -0.3]
n_bandits = 10

reward_distribution = 'bernoulli'

ucb_policy_0 = UCBPolicy(n_bandits=n_bandits,
                      c=0, reward_distribution=reward_distribution)

ucb_policy_1 = UCBPolicy(n_bandits=n_bandits,
                      c=1, reward_distribution=reward_distribution)

ucb_policy_2 = UCBPolicy(n_bandits=n_bandits,
                      c=2, reward_distribution=reward_distribution)

bayesian_ucb = BayesianUCBPolicy(n_bandits=n_bandits, reward_distribution=reward_distribution)

In [3]:
# Setup the game
game = Game(n_episodes=2000, 
            n_steps=1000, 
            Q_values=Q_values,
            policies=[ucb_policy_0,
                    ucb_policy_1,
                    ucb_policy_2,
                    bayesian_ucb
                ], 
            n_bandits=n_bandits,
            )

In [4]:
# Run the game
game.game_loop()

ValueError: p < 0, p > 1 or p is NaN

In [None]:
for policy in game.policies:
    policy.plot_distribution()

In [None]:
# Plot the results
game.plot_average_reward_by_step()

In [None]:
game.plot_average_reward_by_step_smoothed()

* **Greedy policy without optimistic initialization** behaves similarly to **UCB with c = 0**. This is because, if c = 0, there is no encouragement for exploration, meaning that after initialization, it will always choose the action with the highest estimated reward.

In [None]:
game.plot_rate_optimal_actions_by_step()