# Multi-armed Bandit Problem

In [4]:
import numpy as np
import random
import matplotlib.pyplot as plt

In [48]:
class MultiArmedBandit:
    """Stationary k-armed bandit whose arms pay unit-variance Gaussian rewards.

    The true mean of every arm is drawn once, at construction time, from a
    standard normal distribution and stays fixed afterwards.
    """

    def __init__(self, number_of_arms: int) -> None:
        """Draw the hidden arm means and remember which arm is the best one."""
        self.number_of_arms = number_of_arms
        # Hidden expected reward of each arm, sampled from N(0, 1).
        self.means = np.random.normal(loc=0, scale=1, size=number_of_arms)
        best_index = np.argmax(self.means)
        self.optimal_arm = best_index
        self.optimal_mean = self.means[best_index]

    def play(self, arm: int) -> float:
        """Pull `arm` and return one reward sampled from N(means[arm], 1)."""
        return np.random.normal(loc=self.means[arm], scale=1)
    


In [None]:
def plot_bandit(bandit: MultiArmedBandit):
    """Show a violin plot of 1000 simulated rewards for every arm of `bandit`.

    Arms are sampled in index order through `bandit.play`, so the call
    consumes random numbers from the bandit's reward generator.
    """
    arm_count = bandit.number_of_arms
    # One list of 1000 sampled rewards per arm, in arm order.
    samples_per_arm = [
        [bandit.play(arm_index) for _ in range(1000)]
        for arm_index in range(arm_count)
    ]

    plt.figure(figsize=(10, 6))
    plt.violinplot(samples_per_arm, showmeans=True, showmedians=True)
    plt.xlabel("Arm", fontsize=12)
    plt.ylabel("Reward Distribution", fontsize=12)
    plt.title("Reward Distributions of Multi-Armed Bandit Arms", fontsize=14)
    # violinplot places the violins at x = 1..k; the labels use the 0-based arm index.
    tick_positions = range(1, arm_count + 1)
    tick_labels = [f'Arm {arm_index}' for arm_index in range(arm_count)]
    plt.xticks(tick_positions, tick_labels, fontsize=10)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# Create the 10-armed Gaussian bandit used by the following cells, visualise
# its reward distributions, and report the index and true mean of its best arm.
bandit = MultiArmedBandit(10)
plot_bandit(bandit)
print(f"Optimal Arm: {bandit.optimal_arm}")
print(f"Optimal Mean: {bandit.optimal_mean:.2f}")

In [50]:
# random agent for a baseline
class RandomAgent:
    """Baseline agent that picks an arm uniformly at random and never learns."""

    def __init__(self, bandit: MultiArmedBandit) -> None:
        """Store the bandit and the number of arms available to choose from."""
        self.bandit = bandit
        self.number_of_arms = bandit.number_of_arms

    def choose_action(self) -> int:
        """Return an arm index drawn uniformly from ``[0, number_of_arms)``."""
        # Uses numpy's global RNG so a single np.random.seed call makes the
        # whole experiment reproducible; int() yields a plain Python int.
        return int(np.random.randint(self.number_of_arms))

    def update(self, arm: int, reward: float) -> None:
        """Ignore the feedback: a random agent keeps no statistics."""
        return None
    

In [51]:
def run_and_collect_data(agent, bandit, num_steps):
    """Let `agent` interact with `bandit` for `num_steps` pulls.

    On every step the agent chooses an arm, the bandit is played, and the
    agent is updated with the observed reward.

    Returns:
        A tuple ``(rewards, best_action_counts, regrets)`` of three lists with
        one entry per step: the observed reward, whether the chosen arm equals
        ``bandit.optimal_arm``, and the gap between ``bandit.optimal_mean`` and
        the true mean of the chosen arm.
    """
    trace = []
    for _ in range(num_steps):
        arm = agent.choose_action()
        payoff = bandit.play(arm)
        agent.update(arm, payoff)
        picked_best = arm == bandit.optimal_arm
        gap = bandit.optimal_mean - bandit.means[arm]
        trace.append((payoff, picked_best, gap))

    # Split the per-step records into the three parallel lists callers expect.
    rewards = [step[0] for step in trace]
    best_action_counts = [step[1] for step in trace]
    regrets = [step[2] for step in trace]
    return rewards, best_action_counts, regrets


In [52]:
def run_and_plot_for_agents(agents, agent_names, bandit, num_steps):
    """Run every agent on `bandit` for `num_steps` steps and plot the results.

    Three stacked panels are drawn, with one curve per agent in each:
    the running average reward, the running fraction of steps on which the
    optimal arm was chosen, and the cumulative regret.
    """
    # One (rewards, best_action_flags, regrets) triple per agent, in order.
    histories = [run_and_collect_data(agent, bandit, num_steps) for agent in agents]

    def running_mean(values):
        # Mean of the first t entries, for t = 1..len(values).
        return np.cumsum(values) / (np.arange(len(values)) + 1)

    average_rewards = [running_mean(history[0]) for history in histories]
    average_best_action_counts = [running_mean(history[1]) for history in histories]
    cumulative_regrets = [np.cumsum(history[2]) for history in histories]

    _, axes = plt.subplots(3, 1, figsize=(10, 12))

    curves = zip(
        agents,
        agent_names,
        average_rewards,
        average_best_action_counts,
        cumulative_regrets,
    )
    for _, name, reward_curve, best_curve, regret_curve in curves:
        axes[0].plot(reward_curve, label=f'{name}')
        axes[1].plot(best_curve, label=f'{name}')
        axes[2].plot(regret_curve, label=f'{name}')

    # (y-axis label, title) for each panel, top to bottom.
    panel_text = (
        ('Average Reward', 'Average Reward vs Steps'),
        ('Best Action Count', 'Best Action Count vs Steps'),
        ('Regret', 'Regret vs Steps'),
    )
    for axis, (y_label, title) in zip(axes, panel_text):
        axis.set_xlabel('Steps', fontsize=12)
        axis.set_ylabel(y_label, fontsize=12)
        axis.set_title(title, fontsize=14)
        axis.grid(linestyle='--', alpha=0.7)
        axis.legend()

    plt.tight_layout()
    plt.show()


In [None]:
# Baseline experiment: run the random agent on the bandit created above for
# 10000 steps and plot its average reward, best-action rate and regret.
random_agent = RandomAgent(bandit)
agents = [random_agent]
agent_names = ['Random Agent']
run_and_plot_for_agents(agents, agent_names, bandit, 10000)

In [32]:
# An agent that explores every arm N times and then exploits the best arm
class ExploreThenExploit:
    """Explore-then-commit agent.

    Every arm is pulled ``N`` times in round-robin order. After that the agent
    commits to the arm with the highest sample-mean reward and plays it on
    every remaining step.
    """

    def __init__(self, bandit: MultiArmedBandit, N: int) -> None:
        """Set up per-arm statistics.

        Args:
            bandit: Object exposing ``number_of_arms``.
            N: Number of exploratory pulls per arm; must be non-negative.

        Raises:
            ValueError: If ``N`` is negative.
        """
        if N < 0:
            raise ValueError(f"N must be non-negative, got {N}")
        self.bandit = bandit
        self.number_of_arms = bandit.number_of_arms
        self.N = N
        # Number of pulls and running mean reward for each arm.
        self.counts = np.zeros(self.number_of_arms, dtype=int)
        self.value_estimates = np.zeros(self.number_of_arms)
        # Filled in once exploration is over; None while still exploring.
        self.committed_arm = None

    def choose_action(self) -> int:
        """Return the next arm to pull."""
        if self.committed_arm is not None:
            return self.committed_arm
        # The least-pulled arm (lowest index on ties) gives round-robin order.
        least_pulled = int(np.argmin(self.counts))
        if self.counts[least_pulled] < self.N:
            return least_pulled
        # Exploration budget spent: lock in the empirically best arm.
        self.committed_arm = int(np.argmax(self.value_estimates))
        return self.committed_arm

    def update(self, arm: int, reward: float) -> None:
        """Fold `reward` into the running mean of `arm`."""
        self.counts[arm] += 1
        self.value_estimates[arm] += (reward - self.value_estimates[arm]) / self.counts[arm]



In [None]:
# TODO: run ExploreThenExploit agent for different values of N and comment on the results

In [46]:
class EpsilonGreedy:
    """Epsilon-greedy agent with sample-average value estimates.

    With probability ``epsilon`` a uniformly random arm is pulled; otherwise
    the arm with the highest estimated value is pulled, with ties broken at
    random.
    """

    def __init__(self, bandit: MultiArmedBandit, epsilon: float=0.1) -> None:
        """Set up per-arm statistics.

        Args:
            bandit: Object exposing ``number_of_arms``.
            epsilon: Exploration probability in ``[0, 1]``.

        Raises:
            ValueError: If ``epsilon`` lies outside ``[0, 1]``.
        """
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError(f"epsilon must be in [0, 1], got {epsilon}")
        self.bandit = bandit
        self.number_of_arms = bandit.number_of_arms
        self.epsilon = epsilon
        # Number of pulls and running mean reward for each arm.
        self.counts = np.zeros(self.number_of_arms, dtype=int)
        self.value_estimates = np.zeros(self.number_of_arms)

    def choose_action(self) -> int:
        """Return a random arm with probability epsilon, else a greedy arm."""
        if np.random.random() < self.epsilon:
            return int(np.random.randint(self.number_of_arms))
        # Random tie-breaking keeps the agent from always favouring arm 0
        # while all estimates are still equal.
        best_arms = np.flatnonzero(self.value_estimates == self.value_estimates.max())
        return int(np.random.choice(best_arms))

    def update(self, arm: int, reward: float) -> None:
        """Fold `reward` into the running mean of `arm`."""
        self.counts[arm] += 1
        self.value_estimates[arm] += (reward - self.value_estimates[arm]) / self.counts[arm]

In [None]:
# TODO: Run the epsilon-greedy agent for different values of epsilon and comment on the results

In [65]:
class UCB:
    """Upper-confidence-bound (UCB1-style) agent.

    Each arm is pulled once first. Afterwards the agent pulls the arm that
    maximises ``Q(a) + c * sqrt(ln(t) / N(a))``, where ``Q`` is the sample-mean
    reward, ``N`` the pull count of the arm and ``t`` the total number of
    pulls made so far.
    """

    def __init__(self, bandit: MultiArmedBandit, c: float=2) -> None:
        """Set up per-arm statistics.

        Args:
            bandit: Object exposing ``number_of_arms``.
            c: Non-negative weight of the exploration bonus.

        Raises:
            ValueError: If ``c`` is negative.
        """
        if c < 0:
            raise ValueError(f"c must be non-negative, got {c}")
        self.bandit = bandit
        self.number_of_arms = bandit.number_of_arms
        self.c = c
        # Number of pulls and running mean reward for each arm.
        self.counts = np.zeros(self.number_of_arms, dtype=int)
        self.value_estimates = np.zeros(self.number_of_arms)
        self.total_pulls = 0

    def choose_action(self) -> int:
        """Return the arm with the highest upper confidence bound."""
        # An arm with zero pulls has an unbounded bonus, so try those first;
        # this also avoids dividing by zero below.
        untried = np.flatnonzero(self.counts == 0)
        if untried.size:
            return int(untried[0])
        bonus = self.c * np.sqrt(np.log(self.total_pulls) / self.counts)
        return int(np.argmax(self.value_estimates + bonus))

    def update(self, arm: int, reward: float) -> None:
        """Fold `reward` into the running mean of `arm` and advance the clock."""
        self.counts[arm] += 1
        self.total_pulls += 1
        self.value_estimates[arm] += (reward - self.value_estimates[arm]) / self.counts[arm]

In [None]:
# TODO: Run the UCB agent for different values of c and comment on the results


In [None]:
# TODO: run the epsilon greedy and UCB agents; compare the results

In [None]:
# Bandit whose arms pay Bernoulli (0/1) rewards.
class MultiArmedBanditBernoulli:
    """Stationary k-armed bandit with Bernoulli rewards.

    The success probability of every arm is drawn once, at construction time,
    uniformly from ``[0, 1)`` and stays fixed afterwards.
    """

    def __init__(self, number_of_arms: int) -> None:
        """Draw the hidden success probabilities and record the best arm."""
        self.number_of_arms = number_of_arms
        # For a Bernoulli arm the mean reward equals its success probability.
        self.means = np.random.uniform(0, 1, number_of_arms)
        self.optimal_arm = np.argmax(self.means)
        self.optimal_mean = self.means[self.optimal_arm]

    def play(self, arm: int) -> float:
        """Pull `arm`: return 1.0 with probability ``means[arm]``, else 0.0."""
        # np.random.random() is uniform on [0, 1), so the comparison is true
        # with probability exactly means[arm].
        return float(np.random.random() < self.means[arm])
    
# Create a 10-armed Bernoulli bandit and print the per-arm means, which the
# constructor draws uniformly from [0, 1).
bandit_bernoulli = MultiArmedBanditBernoulli(10)
print(bandit_bernoulli.means)


In [None]:
# Thompson Sampling agent for bandits with rewards in [0, 1] (e.g. Bernoulli).

class ThompsonSampling:
    """Thompson Sampling with an independent Beta posterior per arm.

    Every arm starts from a uniform ``Beta(1, 1)`` prior over its success
    probability. The update rule assumes rewards lie in ``[0, 1]``; with 0/1
    rewards it is the exact Beta-Bernoulli posterior update.
    """

    def __init__(self, bandit: MultiArmedBandit) -> None:
        """Initialise the Beta(1, 1) prior of every arm.

        Args:
            bandit: Object exposing ``number_of_arms``.
        """
        self.bandit = bandit
        self.number_of_arms = bandit.number_of_arms
        # Beta parameters: alpha = 1 + accumulated reward, beta = 1 + accumulated (1 - reward).
        self.alpha = np.ones(self.number_of_arms)
        self.beta = np.ones(self.number_of_arms)

    def choose_action(self) -> int:
        """Sample one value from every arm's posterior and pull the largest."""
        sampled_means = np.random.beta(self.alpha, self.beta)
        return int(np.argmax(sampled_means))

    def update(self, action: int, reward: float) -> None:
        """Update the posterior of `action` with a reward in ``[0, 1]``."""
        self.alpha[action] += reward
        self.beta[action] += 1 - reward

    def plot_beta_distribution(self, arms= None):
        '''Plot the Beta posterior density of the given arms (all arms by default).'''
        from math import lgamma

        if arms is None:
            arms = range(self.number_of_arms)
        # Evaluate on the open interval (0, 1): log(0) at the end points would
        # otherwise produce NaN/inf values and runtime warnings.
        x = np.linspace(0, 1, 1002)[1:-1]
        for i in arms:
            a, b = self.alpha[i], self.beta[i]
            # log of the Beta function B(a, b); subtracting it normalises the
            # curve so that different arms are comparable on one density axis.
            log_norm = lgamma(a) + lgamma(b) - lgamma(a + b)
            y = np.exp((a - 1) * np.log(x) + (b - 1) * np.log1p(-x) - log_norm)
            plt.plot(x, y, label=f'Arm {i}')
        plt.xlabel('p')
        plt.ylabel('Density')
        plt.title('Beta Distribution for Each Arm')
        plt.legend()
        plt.show()

In [None]:
# TODO: run the Thompson Sampling agent and compare the results with other agents. Plot the beta distributions at the end of the learning process.