In [112]:
import polars as pl
import numpy as np

In [144]:
class KArmedBandit:
    """Epsilon-greedy agent playing a k-armed bandit with normal rewards.

    The agent keeps a sample-average estimate of every arm's reward and, on
    each step, either exploits the arm with the highest estimate or explores
    a uniformly random arm.

    Args:
        n_of_arms: Number of arms (actions are the indices ``0..n_of_arms-1``).
        mean: Per-arm ``loc`` of the normal reward distribution, indexable by
            action.
        variance: Per-arm spread, indexable by action. It is passed unchanged
            as ``scale`` to the normal sampler, and NumPy defines ``scale`` as
            the standard deviation. NOTE(review): despite the name these values
            therefore act as standard deviations -- confirm whether true
            variances were intended (a square root would then be needed).
        epsilon: Exploration threshold; a step is greedy when a uniform draw
            from [0, 1) is strictly greater than ``epsilon``.
        seed: Seed for this bandit's private random generator.
    """

    def __init__(self, n_of_arms, mean, variance, epsilon, seed):
        self._n_of_arms = n_of_arms
        self._mean = mean
        self._variance = variance
        self._epsilon = epsilon
        self._seed = seed
        # A private generator instead of np.random.seed(): building a bandit no
        # longer reseeds the process-wide NumPy RNG, and unrelated np.random
        # calls (or other bandits) can no longer disturb this bandit's stream.
        # RandomState is used because it yields exactly the sequence that the
        # legacy np.random.seed(seed) + np.random.* calls produced.
        self._rng = np.random.RandomState(self._seed)
        self.reset()

    def reset(self):
        """Zero the value estimates and pull counts of every arm.

        The random generator is left untouched, so draws made after a reset
        continue the existing random stream rather than restarting it.
        """
        self._q = [0] * self._n_of_arms
        self._reward_num = [0] * self._n_of_arms

    def select_action(self):
        """Pick the next arm with the epsilon-greedy rule.

        Returns:
            The index of the arm with the highest current estimate (the first
            such index on ties, as given by ``np.argmax``) when the uniform
            draw exceeds ``epsilon``; otherwise a uniformly random arm index.
        """
        if self._rng.uniform() > self._epsilon:
            return np.argmax(self._q)
        return self._rng.choice(self._n_of_arms)

    def generate_reward(self, action):
        """Sample the reward of pulling ``action``.

        Args:
            action: Index of the arm that was pulled.

        Returns:
            One draw from ``normal(loc=mean[action], scale=variance[action])``.
        """
        return self._rng.normal(self._mean[action], self._variance[action])

    def update_internal_state(self, action, reward):
        """Fold ``reward`` into the running average of ``action``.

        Args:
            action: Index of the arm that produced the reward.
            reward: Observed reward value.
        """
        pulls = self._reward_num[action]
        # Sample average: rebuild the previous sum of rewards, add the new
        # reward, and divide by the new number of pulls.
        self._q[action] = (self._q[action] * pulls + reward) / (pulls + 1)
        self._reward_num[action] = pulls + 1

    def run(self, n_of_iter):
        """Play ``n_of_iter`` select / reward / update steps.

        A progress header and a summary are printed before every 50th step,
        starting with step 0.

        Args:
            n_of_iter: Number of steps to play.
        """
        for step in range(n_of_iter):
            if step % 50 == 0:
                print(f"Iteration n {step}")
                self.summarize()

            action = self.select_action()
            reward = self.generate_reward(action=action)
            self.update_internal_state(action=action, reward=reward)

    def summarize(self):
        """Print the current value estimates and pull counts of all arms."""
        print("Summarize bandit")
        print("Q function and reward num:")
        print(self._q)
        print(self._reward_num)

In [145]:
# Three arms: arms 0 and 2 share the same mean reward (5), arm 1 is lower (1).
mean = [5, 1, 5]
# Handed to the bandit as its `variance` argument, one entry per arm.
variance = [3, 1, 4]

# Epsilon-greedy agent that picks a random arm on roughly 10% of the steps.
bandit = KArmedBandit(
    n_of_arms=len(mean),
    mean=mean,
    variance=variance,
    epsilon=0.1,
    seed=42,
)



In [146]:
# Clear estimates and pull counts before the run. The constructor already
# calls reset(), so this only matters when the cell is re-run after a run.
bandit.reset()

In [147]:
# 101 steps rather than 100 so the every-50-steps summary also prints at step 100.
bandit.run(101)

Iteration n 0
Summarize bandit
Q function and reward num:
[0, 0, 0]
[0, 0, 0]
Iteration n 50
Summarize bandit
Q function and reward num:
[4.840358997367585, 0.8548619000985043, 6.158969414495561]
[7, 2, 41]
Iteration n 100
Summarize bandit
Q function and reward num:
[4.774519518836353, 0.7520058454695193, 5.781297914378294]
[11, 4, 85]


In [148]:
# Final estimates and pull counts, including the last step that run() does
# not report on its own.
bandit.summarize()

Summarize bandit
Q function and reward num:
[4.774519518836353, 0.7520058454695193, 5.771994636551869]
[11, 4, 86]


In [87]:
# Sanity check: empirical mean of 100000 draws from normal(loc=5, scale=4),
# the same parameters arm 2 is sampled with. Uses the global NumPy RNG, so the
# exact value depends on the RNG state at the time the cell runs.
np.mean(np.random.normal(5, 4, 100000))

5.006778998441803

In [88]:
# Sanity check: empirical mean of 100000 draws from normal(loc=5, scale=3),
# the same parameters arm 0 is sampled with. Uses the global NumPy RNG.
np.mean(np.random.normal(5, 3, 100000))

4.992991926653388

In [89]:
# Sanity check: empirical mean of 100000 draws from normal(loc=5, scale=1).
# No configured arm uses loc=5 with scale=1 (arm 1 is loc=1, scale=1), so this
# is presumably a low-spread reference for comparison -- confirm the intent.
np.mean(np.random.normal(5, 1, 100000))

5.000591190535414