# Installing Libraries (Python version >= 3.8)

In [None]:
import sys

# Record and display the version of the interpreter running this notebook.
version = sys.version_info
print(version)

# Stop early unless this is Python 3 with a minor version of at least 8.
assert version[0] == 3 and version[1] >= 8

In [None]:
# Install the pinned versions of numpy, matplotlib and tqdm that this notebook imports.
# NOTE(review): `python` is resolved by the shell and may not be the interpreter
# running this kernel — confirm, or consider IPython's `%pip install` instead.
!python -m pip install numpy==1.23.5 matplotlib==3.7.4 tqdm==4.62.3

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm


class MAB:
    """Epsilon-greedy agent bundled with a Gaussian multi-armed bandit."""

    def __init__(self, n_arms=10, epsilon=0.1, mean=0.0):
        """Store the bandit configuration.

        Only the configuration is kept here; the per-run state
        (true rewards, estimates, counts) is created by ``reset``, which
        must be called before ``policy``, ``step`` or ``update``.

        Args:
            n_arms: Number of arms to pull.
            epsilon: Probability of taking a random (exploratory) action.
            mean: Offset added to the standard-normal true rewards.
        """
        self.epsilon = epsilon
        self.mean = mean
        self.n_arms = n_arms

    def reset(self):
        """Start a fresh run: redraw true rewards and clear what was learned."""
        # One true reward per arm: a standard-normal sample shifted by ``mean``.
        self.q_true = self.mean + np.random.randn(self.n_arms)

        # The best arm is fixed for the whole run once the true rewards exist.
        self.optimal_action = self.q_true.argmax()

        # Nothing has been pulled yet, so estimates and pull counts start at zero.
        self.q_estimation = np.zeros(self.n_arms)
        self.action_count = np.zeros(self.n_arms)

    def policy(self):
        """Choose the next arm with the epsilon-greedy rule.

        Returns:
            Index of the arm to pull next.
        """
        exploring = np.random.rand() < self.epsilon
        if not exploring:
            # Exploit: the arm with the highest estimate (first one on ties).
            return np.argmax(self.q_estimation)

        # Explore: every arm is equally likely.
        arm_indices = np.arange(self.n_arms)
        return np.random.choice(arm_indices)

    def step(self, action):
        """Pull an arm and observe a noisy reward.

        Args:
            action: Index of the arm to pull.

        Returns:
            The arm's true reward plus standard-normal noise.
        """
        noise = np.random.randn()
        return noise + self.q_true[action]

    def update(self, action, reward):
        """Fold an observed reward into the running estimate of an arm.

        Args:
            action: Index of the arm that was pulled.
            reward: Reward received for that pull.

        Returns:
            The ``reward`` argument, unchanged.
        """
        self.action_count[action] += 1

        # Incremental sample average: move the estimate toward the new reward
        # by 1/n of the gap, where n is the number of pulls of this arm so far.
        step_size = 1.0 / self.action_count[action]
        gap = reward - self.q_estimation[action]
        self.q_estimation[action] += step_size * gap
        return reward

In [None]:
def simulate(runs, steps, mab_instances):
    """Run every bandit for a number of independent runs and average the results.

    Args:
        runs: Number of independent runs per bandit; each run begins with
            a call to the bandit's ``reset()``.
        steps: Number of policy/step/update iterations in each run.
        mab_instances: Sequence of bandit objects providing ``reset``,
            ``policy``, ``step``, ``update`` and ``optimal_action``.

    Returns:
        A tuple ``(optimal_action_counts, rewards)``. Both are arrays of shape
        ``(len(mab_instances), steps)`` holding, per step, the mean over runs
        of the optimal-action indicator and of the received reward.
    """
    result_shape = (len(mab_instances), runs, steps)
    optimal_hits = np.zeros(result_shape)
    reward_log = np.zeros(result_shape)

    for bandit_idx, bandit in enumerate(mab_instances):
        for run_idx in tqdm(range(runs)):
            bandit.reset()

            for step_idx in range(steps):
                chosen = bandit.policy()
                payoff = bandit.step(action=chosen)
                bandit.update(action=chosen, reward=payoff)

                cell = (bandit_idx, run_idx, step_idx)
                reward_log[cell] = payoff
                # 1 when the chosen arm is the run's best arm, otherwise 0.
                optimal_hits[cell] = 1 if chosen == bandit.optimal_action else 0

    # Collapse the run axis so each bandit is left with one curve over steps.
    return optimal_hits.mean(axis=1), reward_log.mean(axis=1)

In [None]:
def plot_reward_distribution():
    """Save a violin plot of sample reward distributions to ``./test_bed.png``.

    Draws 200 standard-normal samples for each of 10 actions, shifts each
    action's samples by its own standard-normal offset, and plots one violin
    per action.
    """
    # Draw the per-sample noise first and the per-action offsets second.
    samples = np.random.randn(200, 10)
    action_offsets = np.random.randn(10)

    plt.violinplot(dataset=samples + action_offsets)
    plt.xlabel("Action")
    plt.ylabel("Reward distribution")

    plt.savefig('./test_bed.png')
    plt.close()

def run_mab(runs=2000, steps=3000):
    """Compare epsilon-greedy bandits and save the curves to ``./e_greedy.png``.

    One ``MAB`` is created per epsilon in ``[0, 0.1, 0.01, 0.5, 1.0]`` and all
    of them are passed to ``simulate``. The figure has two stacked panels:
    average reward per step, and percentage of runs in which the optimal
    action was chosen at each step.

    Args:
        runs: Number of independent runs per epsilon value.
        steps: Number of steps in each run.
    """
    epsilons = [0, 0.1, 0.01, 0.5, 1.0]
    mab_instances = [MAB(epsilon=epsilon) for epsilon in epsilons]
    optimal_action_counts, rewards = simulate(runs, steps, mab_instances)

    plt.figure(figsize=(10, 20))

    plt.subplot(2, 1, 1)
    # Use a distinct loop name so the ``rewards`` array is not rebound
    # while it is being iterated.
    for eps, reward_curve in zip(epsilons, rewards):
        plt.plot(reward_curve, label=f'epsilon = {eps:.2f}')
    plt.xlabel('steps')
    plt.ylabel('average reward')
    plt.legend()

    plt.subplot(2, 1, 2)
    for eps, optimal_fraction in zip(epsilons, optimal_action_counts):
        # simulate() averages 0/1 indicators, so the values are fractions in
        # [0, 1]; scale by 100 so the data matches the '%' axis label.
        plt.plot(optimal_fraction * 100, label=f'epsilon = {eps:.2f}')
    plt.xlabel('steps')
    plt.ylabel('% optimal action')
    plt.legend()

    plt.savefig('./e_greedy.png')
    plt.close()

In [None]:
# Draw the violin plot of sample reward distributions and save it to ./test_bed.png.
plot_reward_distribution()

In [None]:
# Run the epsilon-greedy experiment with its defaults (2000 runs of 3000 steps
# for each of five epsilon values) and save the curves to ./e_greedy.png.
run_mab()