# Installing Libraries (Python version >= 3.8)

In [None]:
import sys
# Show the running interpreter version and stop unless it is Python 3 with minor >= 8.
version = sys.version_info
print(version)
assert version[0] == 3 and version[1] >= 8

In [None]:
!python -m pip install numpy==1.23.5 matplotlib==3.7.4 tqdm==4.62.3

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm


class MABEnv:
    """Multi-armed bandit environment with normally distributed rewards."""

    def __init__(self, n_arms=10, mean=0.0):
        """Initialize Multi-armed Bandits (MAB) Environment class.

        Call ``reset()`` before ``step()``: the true action values are only
        drawn there.

        Args:
            n_arms: Number of arms to pull.
            mean: Mean value for normal distributions.
        """
        self.n_arms = n_arms
        self.mean = mean

        # Both are filled in by reset(); they stay None until then
        self.q_true = None
        self.optimal_action = None

    def reset(self):
        """Draw a new set of true action values and record the best action."""
        # True reward drawn from the normal distribution with mean for each action
        self.q_true = np.random.randn(self.n_arms) + self.mean

        # Action index with best true reward
        self.optimal_action = np.argmax(self.q_true)

    def step(self, action):
        """Take action and return reward.

        Args:
            action: Index of the action (arm) to pull.

        Returns:
            The true value of the chosen action plus standard normal noise.
        """
        # Stochastic reward drawn from the normal distribution
        return np.random.randn() + self.q_true[action]

    @staticmethod
    def reward_distribution_example():
        """Save a violin plot of example reward distributions for 10 actions.

        Declared as a static method because it takes no instance; without the
        decorator, calling it on an instance would fail with a TypeError.
        The figure is written to ``./reward_distribution_example.png``.
        """
        # 200 noisy samples per action, each action shifted by its own random mean
        plt.violinplot(dataset=np.random.randn(200, 10) + np.random.randn(10))
        plt.xlabel("Action")
        plt.ylabel("Reward distribution")
        plt.savefig('./reward_distribution_example.png')
        plt.close()

In [None]:
class EpsilonGreedyAgent:
    """Bandit agent that explores with probability ε and is greedy otherwise."""

    def __init__(self, env, epsilon=0.1):
        """Bind the agent to an environment and set its exploration rate.

        Args:
            env: Multi-armed Bandits (MAB) environment; its ``n_arms``
                attribute gives the number of available actions.
            epsilon: Probability of choosing a uniformly random action.
        """
        self.env = env
        self.epsilon = epsilon

    def reset(self):
        """Clear the learned value estimates and the selection counters."""
        n_arms = self.env.n_arms

        # Running sample-average reward for each action
        self.q_estimation = np.zeros(n_arms)

        # Number of times each action has been chosen
        self.action_count = np.zeros(n_arms)

    def policy(self):
        """Choose the next action.

        Returns:
            A random action index with probability ``epsilon``; otherwise the
            index of the highest current estimate.
        """
        explore = np.random.rand() < self.epsilon
        if not explore:
            return np.argmax(self.q_estimation)
        return np.random.choice(np.arange(self.env.n_arms))

    def update(self, action, reward):
        """Fold one observed reward into the estimate of the taken action.

        Args:
            action: Index of the action that was taken.
            reward: Reward from environment.
        """
        self.action_count[action] += 1

        # Incremental sample average: move the estimate by 1/n of the error
        step_size = 1.0 / self.action_count[action]
        error = reward - self.q_estimation[action]
        self.q_estimation[action] += step_size * error

In [None]:
class Simulator:
    """Run several bandit agents and plot their averaged learning curves."""

    def __init__(self, agents):
        """Store the agents to compare.

        Args:
            agents: Sequence of agents. Each one is expected to provide
                ``env``, ``epsilon``, ``reset()``, ``policy()`` and
                ``update(action, reward)``.
        """
        self.agents = agents

    def run(self, runs=2000, steps=3000):
        """Simulate every agent and save the result plots.

        Writes a two-panel figure (average reward and rate of optimal action
        per step) to ``./simulation_result.png``.

        Args:
            runs: Number of independent runs averaged for each agent.
            steps: Number of actions taken in each run.
        """
        optimal_action_counts, rewards = self._run(self.agents, runs, steps)

        # Take the labels from the agents themselves so the plot does not
        # depend on a module-level `epsilons` list matching `self.agents`.
        labels = ['epsilon = %.02f' % (agent.epsilon) for agent in self.agents]

        plt.figure(figsize=(10, 20))

        plt.subplot(2, 1, 1)
        for label, agent_rewards in zip(labels, rewards):
            plt.plot(agent_rewards, label=label)
        plt.xlabel('steps')
        plt.ylabel('average reward')
        plt.legend()

        plt.subplot(2, 1, 2)
        for label, agent_counts in zip(labels, optimal_action_counts):
            plt.plot(agent_counts, label=label)
        plt.xlabel('steps')
        plt.ylabel('% optimal action')
        plt.legend()

        plt.savefig('./simulation_result.png')
        plt.close()

    def _run(self, agents, runs, steps):
        """Collect per-step averages over all runs for each agent.

        Args:
            agents: Agents to simulate, one after another.
            runs: Number of independent runs per agent.
            steps: Number of actions per run.

        Returns:
            Tuple ``(optimal_action_counts, rewards)``. Both are arrays of
            shape ``(len(agents), steps)``: the fraction of runs in which the
            optimal action was chosen at each step, and the mean reward at
            each step.
        """
        # Accumulate per-step sums over runs instead of storing every run;
        # only the mean over runs is ever used.
        optimal_action_counts = np.zeros((len(agents), steps))
        rewards = np.zeros(optimal_action_counts.shape)

        for i, agent in enumerate(agents):
            for _ in tqdm(range(runs)):
                # Each run starts from a fresh bandit problem and a fresh agent
                agent.env.reset()
                agent.reset()

                for s in range(steps):
                    action = agent.policy()
                    reward = agent.env.step(action=action)
                    agent.update(action=action, reward=reward)

                    rewards[i, s] += reward
                    if action == agent.env.optimal_action:
                        optimal_action_counts[i, s] += 1

        # Turn the sums into means over runs
        optimal_action_counts = optimal_action_counts / runs
        rewards = rewards / runs

        return optimal_action_counts, rewards

In [None]:
# Compare ε-greedy agents with different exploration rates on a shared 10-armed bandit.
epsilons = [0, 0.1, 0.01, 0.5, 1.0]
env = MABEnv(n_arms=10, mean=0.0)

# One agent per exploration rate, all bound to the same environment object
agents = [EpsilonGreedyAgent(env=env, epsilon=eps) for eps in epsilons]

simulator = Simulator(agents=agents)
simulator.run(runs=2000, steps=3000)