### Bayesian Multi-Armed Bandit Mood Modeling

In [16]:
import numpy as np

class BayesianBandit:
    """Simulate a multi-armed bandit with Gaussian arms, pull costs and a NO-OP action.

    Each arm pays a reward drawn from a normal distribution. Pulling an arm
    costs ``pull_cost``; doing nothing (NO-OP) costs ``no_op_cost``. Every
    ``change_interval`` steps the true arm means are shifted by random
    integers. The agent acts greedily on the sample mean of all rewards it
    has observed for each arm, and the simulation ends when the running
    total reward is no longer positive.

    Args:
        means: Initial true mean reward of each arm.
        stds: Standard deviation of each arm's reward, indexed like ``means``.
        pull_cost: Amount subtracted from the total on every arm pull.
        no_op_cost: Amount subtracted from the total on every NO-OP.
        initial_reward: Starting value of the running total reward.
        change_interval: The arm means are shifted on every step whose
            number is a multiple of this value.
        initial_exploration: Number of reward samples drawn per arm at
            construction time to seed the estimates (these samples do not
            affect the running total).
    """

    def __init__(self, means, stds, pull_cost, no_op_cost, initial_reward, change_interval, initial_exploration=1):
        self.means = np.array(means)
        self.stds = np.array(stds)
        self.pull_cost = pull_cost
        self.no_op_cost = no_op_cost
        self.initial_reward = initial_reward
        self.n_arms = len(means)
        self.total_reward = initial_reward
        self.change_interval = change_interval
        self.step_count = 0
        # Number of reward samples recorded per arm (seed samples + pulls).
        self.counts = np.full(self.n_arms, initial_exploration)
        # Per-arm history of observed rewards, seeded with the initial samples.
        self.rewards = [
            np.random.normal(self.means[arm], self.stds[arm], initial_exploration).tolist()
            for arm in range(self.n_arms)
        ]
        # History of (negative) rewards recorded each time NO-OP is chosen.
        self.no_op_rewards = []

    def pull(self, arm):
        """Pull ``arm``: sample its reward, record it, and charge the pull cost.

        Returns:
            The sampled reward (before the pull cost is subtracted).
        """
        reward = np.random.normal(self.means[arm], self.stds[arm])
        self.rewards[arm].append(reward)
        # Keep the per-arm sample count in step with the reward history.
        self.counts[arm] += 1
        self.total_reward += reward - self.pull_cost
        return reward

    def change_rewards(self):
        """Shift every arm's true mean by a random integer and print the result."""
        # np.random.randint uses a half-open interval, so shifts lie in [-20, 20).
        self.means += np.random.randint(-20, 20, size=self.n_arms)
        print(f"Reward distributions changed: {self.means}")

    def choose_arm(self):
        """Advance one step and pick the next action.

        Returns:
            The index of the arm to pull as an ``int``, or ``-1`` for NO-OP.
        """
        self.step_count += 1
        if self.step_count % self.change_interval == 0:
            self.change_rewards()

        # Force one pull of each arm, in order, during the first n_arms steps.
        # This check comes before any estimate is computed so that empty
        # reward histories (initial_exploration=0) are never averaged.
        if self.step_count <= self.n_arms:
            return self.step_count - 1

        estimated_means = np.array([np.mean(history) for history in self.rewards])
        no_op_mean = np.mean(self.no_op_rewards) if self.no_op_rewards else -self.no_op_cost

        # NO-OP wins only if it beats the best arm's estimated net payoff.
        if no_op_mean > np.max(estimated_means - self.pull_cost):
            return -1
        return int(np.argmax(estimated_means))

    def run(self, max_steps=None):
        """Play until the total reward is no longer positive.

        Args:
            max_steps: Optional cap on the number of steps taken by this
                call. ``None`` (the default) keeps playing until the total
                reward drops to zero or below, which may take arbitrarily
                long if the arms stay profitable.
        """
        steps_taken = 0
        while self.total_reward > 0 and (max_steps is None or steps_taken < max_steps):
            arm = self.choose_arm()
            steps_taken += 1
            if arm == -1:
                self.total_reward -= self.no_op_cost
                self.no_op_rewards.append(-self.no_op_cost)
                print(f"NO-OP chosen. Total reward: {self.total_reward}")
            else:
                reward = self.pull(arm)
                print(f"Arm {arm} pulled. Reward: {reward:.2f}, Total reward: {self.total_reward}")

            if self.total_reward <= 0:
                print("Game over.")
                break

# Example run: four arms with different reward means and spreads. Each pull
# costs 5, a NO-OP costs 1, the agent starts with 100 reward, and the arm
# means are shifted every 10 steps.
bandit = BayesianBandit(
    means=[5, 0, -10, 10],
    stds=[10, 25, 20, 15],
    pull_cost=5,
    no_op_cost=1,
    initial_reward=100,
    change_interval=10,
)
bandit.run()


Arm 0 pulled. Reward: 8.27, Total reward: 103.27223386365867
Arm 1 pulled. Reward: 12.66, Total reward: 110.93661640547197
Arm 2 pulled. Reward: -12.55, Total reward: 93.38452051984447
Arm 3 pulled. Reward: 15.12, Total reward: 103.50838260907521
Arm 0 pulled. Reward: 26.19, Total reward: 124.69724473137512
Arm 0 pulled. Reward: 20.47, Total reward: 140.16261228609022
Arm 0 pulled. Reward: -4.50, Total reward: 130.66742183430165
Arm 0 pulled. Reward: -0.97, Total reward: 124.69844851867413
Arm 0 pulled. Reward: 3.89, Total reward: 123.58640852057144
Reward distributions changed: [ 2  4  0 -8]
Arm 1 pulled. Reward: -39.73, Total reward: 78.8596726662811
Arm 3 pulled. Reward: -21.26, Total reward: 52.601811123264106
Arm 0 pulled. Reward: 3.54, Total reward: 51.139062182719385
Arm 0 pulled. Reward: 2.58, Total reward: 48.721149367179386
Arm 0 pulled. Reward: -9.43, Total reward: 34.291446388873155
Arm 0 pulled. Reward: 5.58, Total reward: 34.86931999235599
Arm 0 pulled. Reward: 7.61, Tota