In [5]:
import polars as pl
import numpy as np

In [102]:
class KArmedBandit:
    """Epsilon-greedy agent playing a stationary k-armed Gaussian bandit.

    The object holds both the environment (one normal reward distribution per
    arm) and the agent state (sample-average value estimates and pull counts).

    Args:
        n_of_arms: Number of arms; must be at least 1.
        mean: Sequence with the reward mean of each arm (at least
            ``n_of_arms`` entries).
        variance: Sequence with the reward spread of each arm (at least
            ``n_of_arms`` entries). Despite the name, each entry is handed to
            the normal sampler as its ``scale`` argument, i.e. it is used as a
            standard deviation.
        epsilon: Exploration threshold. A uniform draw in [0, 1) is compared
            with it; the greedy arm is chosen when the draw is greater.
        seed: Seed for this bandit's private random number generator.

    Raises:
        ValueError: If ``n_of_arms`` is smaller than 1, or ``mean`` /
            ``variance`` have fewer than ``n_of_arms`` entries.
    """

    def __init__(self, n_of_arms, mean, variance, epsilon, seed):
        if n_of_arms < 1:
            raise ValueError(f"n_of_arms must be at least 1, got {n_of_arms}")
        if len(mean) < n_of_arms or len(variance) < n_of_arms:
            raise ValueError(
                "mean and variance need one entry per arm: "
                f"n_of_arms={n_of_arms}, len(mean)={len(mean)}, "
                f"len(variance)={len(variance)}"
            )

        self._n_of_arms = n_of_arms
        self._mean = mean
        self._variance = variance
        self._epsilon = epsilon
        self._seed = seed
        # A private generator keeps this bandit reproducible no matter what
        # else draws from (or reseeds) the global np.random state. RandomState
        # is used because it produces the same stream as the legacy
        # np.random.seed(seed) + np.random.* calls did.
        self._rng = np.random.RandomState(self._seed)
        self.reset()

    def reset(self):
        """Zero the value estimates and pull counts of every arm.

        The random number generator is left untouched, so a run after
        ``reset`` continues the random stream instead of replaying it.
        """
        self._q = [0 for _ in range(self._n_of_arms)]
        self._reward_num = [0 for _ in range(self._n_of_arms)]

    def select_action(self):
        """Pick an arm index with the epsilon-greedy rule.

        Returns:
            int: The arm with the highest current estimate when the uniform
            draw exceeds ``epsilon`` (ties resolve to the lowest index, as
            ``np.argmax`` does), otherwise an arm drawn uniformly at random.
        """
        if self._rng.uniform() > self._epsilon:
            return int(np.argmax(self._q))
        return int(self._rng.choice(self._n_of_arms))

    def generate_reward(self, action):
        """Sample a reward for ``action`` from that arm's normal distribution.

        Args:
            action: Index of the pulled arm.

        Returns:
            float: A draw with ``loc=mean[action]`` and
            ``scale=variance[action]``.
        """
        return self._rng.normal(self._mean[action], self._variance[action])

    def update_internal_state(self, action, reward):
        """Fold ``reward`` into the running average of ``action``.

        Args:
            action: Index of the arm that was pulled.
            reward: Reward observed for that pull.
        """
        pulls = self._reward_num[action]
        # Sample average: previous total (estimate * pulls) plus the new
        # reward, divided by the new number of pulls.
        self._q[action] = (self._q[action] * pulls + reward) / (pulls + 1)
        self._reward_num[action] = pulls + 1

    def run(self, n_of_iter, log_every=10):
        """Play ``n_of_iter`` rounds of select -> reward -> update.

        Args:
            n_of_iter: Number of rounds to play.
            log_every: Print the iteration number and a summary before every
                ``log_every``-th round (starting with round 0). Pass ``0`` or
                ``None`` to run silently. Defaults to 10.
        """
        for step in range(n_of_iter):
            if log_every and step % log_every == 0:
                print(f"Iteration n {step}")
                self.summarize()

            action = self.select_action()
            reward = self.generate_reward(action=action)
            self.update_internal_state(action=action, reward=reward)

    def summarize(self):
        """Print the current value estimates followed by the pull counts."""
        print("Summarize bandit")
        print("Q function and reward num:")
        print(self._q)
        print(self._reward_num)

In [106]:
# Reward distribution of each arm: centre and spread, one entry per arm.
# Note: the `variance` entries are forwarded to the normal sampler as its
# scale (standard deviation) argument.
mean = [5, 1, 5]
variance = [3, 1, 4]

# Epsilon-greedy bandit with one arm per configured mean.
bandit = KArmedBandit(
    n_of_arms=len(mean),
    mean=mean,
    variance=variance,
    epsilon=0.1,
    seed=42,
)



In [110]:
# Zero the value estimates and pull counts so the next run starts from scratch.
bandit.reset()

In [111]:
# Play 100 epsilon-greedy rounds; a summary is printed every 10th iteration.
bandit.run(100)

Iteration n 0
Summarize bandit
Q function and reward num:
[0, 0, 0]
[0, 0, 0]
Iteration n 10
Summarize bandit
Q function and reward num:
[4.7059676653825475, 0, 3.6215579165083893]
[8, 0, 2]
Iteration n 20
Summarize bandit
Q function and reward num:
[4.324555978351353, 0, 3.6215579165083893]
[18, 0, 2]
Iteration n 30
Summarize bandit
Q function and reward num:
[4.675798079053424, 0.9263888842080936, 3.6215579165083893]
[27, 1, 2]
Iteration n 40
Summarize bandit
Q function and reward num:
[4.6808625956341094, 0.9263888842080936, 3.6215579165083893]
[37, 1, 2]
Iteration n 50
Summarize bandit
Q function and reward num:
[4.889415642934037, 0.9263888842080936, 3.6215579165083893]
[47, 1, 2]
Iteration n 60
Summarize bandit
Q function and reward num:
[4.905991914212984, 0.9263888842080936, 3.6215579165083893]
[57, 1, 2]
Iteration n 70
Summarize bandit
Q function and reward num:
[4.823454108624989, 0.9263888842080936, 3.6215579165083893]
[67, 1, 2]
Iteration n 80
Summarize bandit
Q function an

In [94]:
# Print the current value estimates and per-arm pull counts.
# NOTE(review): the recorded output below totals 12,200 pulls and this cell's
# execution count (94) precedes the cells above, so it does not come from the
# run(100) call shown earlier -- re-execute with Restart & Run All.
bandit.summarize()

Summarize bandit
Q function and reward num:
[5.012049848825856, 1.042366740766647, 4.942242440617779]
[8043, 417, 3740]


In [87]:
# Sanity check: the sample mean of 100,000 normal draws centred on 5 with
# scale (standard deviation) 4 should land close to 5.
np.random.normal(loc=5, scale=4, size=100_000).mean()

5.006778998441803

In [88]:
# Sanity check: same experiment with scale (standard deviation) 3; the sample
# mean of 100,000 draws should again be close to 5.
np.random.normal(loc=5, scale=3, size=100_000).mean()

4.992991926653388

In [89]:
# Sanity check: with scale (standard deviation) 1 the sample mean of 100,000
# draws centred on 5 should be closer still to 5.
np.random.normal(loc=5, scale=1, size=100_000).mean()

5.000591190535414