In [112]:
import polars as pl
import numpy as np

In [228]:
class KArmedBandit:
    def __init__(self, n_of_arms, mean, variance, epsilon, seed):
        self._n_of_arms = n_of_arms
        self._mean = mean
        self._variance = variance
        self._epsilon = epsilon
        self._seed = seed
        np.random.seed(self._seed)
        self.reset()

    def reset(self):
        self._q_fs = [0 for _ in range(self._n_of_arms)]
        self._q_icr = [0 for _ in range(self._n_of_arms)]
        self._reward_num = [0 for _ in range(self._n_of_arms)]

    def select_action(self):
        if np.random.uniform() > self._epsilon:
            # print("Greedy logic")
            action = np.argmax(self._q_fs)
        else:
            # print("Random epsilon")
            action = np.random.choice(self._n_of_arms)
            # print(f"Selected action: {action}")
        return action

    def generate_reward(self, action):
        value = np.random.normal(self._mean[action], self._variance[action])
        print(f"Got value = {value}")
        return value
    
    def update_internal_state(self, action, reward):
        # update Q, reward num
        # print("Updating value")
        print(f"Previous value fs = {self._q_fs[action]}")
        print(f"Previous value iqr = {self._q_icr[action]}")
        print(f"Previous reward num = {self._reward_num[action]}")

        self._reward_num[action] += 1

        self._q_fs[action] = ((self._q_fs[action] * (self._reward_num[action] - 1)) + reward) / self._reward_num[action]
        
        self._q_icr[action] = self._q_icr[action] + (reward - self._q_icr[action]) / (self._reward_num[action])
        
        print(f"new_q_fs = {self._q_fs[action]}")
        print(f"new_q_iqr = {self._q_icr[action]}")
        




    def run(self, n_of_iter):
        
        for _i in range(n_of_iter):
            if _i%50 == 0:
                print(f"Iteration n {_i}")
                self.summarize()

            action = self.select_action()

            reward = self.generate_reward(action=action)

            self.update_internal_state(action=action, reward=reward)
    
    def summarize(self):
        print("Summarize bandit")
        print("Q function and reward num:")
        print(self._q_fs)
        print(self._q_icr)
        print(self._reward_num)

In [229]:
mean = [5, 1, 5]
variance = [3, 1, 4]

bandit = KArmedBandit(n_of_arms=3, mean=mean, variance=variance, epsilon=0.1, seed=42)



In [230]:
bandit.reset()

In [231]:
bandit.run(10)

Iteration n 0
Summarize bandit
Q function and reward num:
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
Got value = 1.6643596458592387
Previous value fs = 0
Previous value iqr = 0
Previous reward num = 0
new_q_fs = 1.6643596458592387
new_q_iqr = 1.6643596458592387
Got value = 5.95670655406815
Previous value fs = 1.6643596458592387
Previous value iqr = 1.6643596458592387
Previous reward num = 1
new_q_fs = 3.8105330999636946
new_q_iqr = 3.810533099963694
Got value = 9.737638446522174
Previous value fs = 3.8105330999636946
Previous value iqr = 3.810533099963694
Previous reward num = 2
new_q_fs = 5.786234882149855
new_q_iqr = 5.786234882149854
Got value = 1.767434729152909
Previous value fs = 0
Previous value iqr = 0
Previous reward num = 0
new_q_fs = 1.767434729152909
new_q_iqr = 1.767434729152909
Got value = 5.066665479835949
Previous value fs = 5.786234882149855
Previous value iqr = 5.786234882149854
Previous reward num = 3
new_q_fs = 5.606342531571379
new_q_iqr = 5.606342531571378
Got value = 3.716621

In [161]:
bandit.summarize()

Summarize bandit
Q function and reward num:
[5.228398277095096, 1.767434729152909, 5.8064123336197015]
[5, 1, 4]


In [87]:
np.mean(np.random.normal(5, 4, 100000))

5.006778998441803

In [88]:
np.mean(np.random.normal(5, 3, 100000))

4.992991926653388

In [89]:
np.mean(np.random.normal(5, 1, 100000))

5.000591190535414