In [None]:
import numpy as np

In [None]:
class KArmedBandit_NS:
    """Epsilon-greedy agent playing a k-armed bandit whose arm means drift.

    Rewards for arm ``a`` are drawn from a normal distribution with mean
    ``mean[a]`` and variance ``variance[a]``.  Once the iteration index passed
    to :meth:`generate_reward` exceeds ``_DRIFT_START_ITER``, the mean of the
    arm that was just selected is perturbed by a small Gaussian step on every
    pull, which makes the environment non-stationary.

    Two action-value tables are maintained side by side (``_q_fs`` and
    ``_q_icr``); action selection is driven by ``_q_icr``.

    Random numbers come from NumPy's *global* legacy generator, which the
    constructor seeds with ``seed``.

    Parameters
    ----------
    n_of_arms : int
        Number of arms (actions).
    mean : sequence of float
        Initial mean reward of each arm.  The sequence is copied, so the
        caller's object is never mutated.
    variance : sequence of float
        Variance of each arm's reward distribution; must be non-negative.
    epsilon : float
        Exploration threshold: a uniformly random arm is chosen whenever a
        uniform draw from [0, 1) is not greater than ``epsilon``.
    alpha : float
        Step-size factor used in the value updates.
    seed : int
        Seed handed to ``np.random.seed`` at construction time.
    verbose : bool, optional
        When true, per-iteration diagnostics are printed.  Defaults to
        ``False`` because a long run would otherwise emit millions of lines.

    Raises
    ------
    ValueError
        If any entry of ``variance`` is negative.
    """

    # Iteration index that must be exceeded before the arm means start drifting.
    _DRIFT_START_ITER = 100
    # Standard deviation of the Gaussian step added to a drifting mean.
    _DRIFT_STD = 0.01

    def __init__(self, n_of_arms, mean, variance, epsilon, alpha, seed, verbose=False):
        self._n_of_arms = n_of_arms
        # Keep private copies: the means are mutated in place while drifting,
        # and several bandits are commonly built from the same input list.
        self._initial_mean = list(mean)
        self._variance = list(variance)
        if any(v < 0 for v in self._variance):
            raise ValueError("variance entries must be non-negative")
        self._epsilon = epsilon
        self._alpha = alpha
        self._seed = seed
        self._verbose = verbose
        # Seeds NumPy's global generator (shared with all other np.random users).
        np.random.seed(self._seed)
        self.reset()

    def reset(self):
        """Restore the initial arm means and clear every learned statistic."""
        self._mean = list(self._initial_mean)
        self._q_fs = [0 for _ in range(self._n_of_arms)]
        self._q_icr = [0 for _ in range(self._n_of_arms)]
        self._reward_num = [0 for _ in range(self._n_of_arms)]
        self._reward_hist = []

    def select_action(self):
        """Return the index of the arm to pull next (epsilon-greedy).

        The greedy choice is ``argmax`` over ``_q_icr``; ties therefore go to
        the lowest arm index.
        """
        if np.random.uniform() > self._epsilon:
            action = np.argmax(self._q_icr)
        else:
            action = np.random.choice(self._n_of_arms)
        return action

    def generate_reward(self, action, iter):
        """Draw a reward for ``action`` at iteration index ``iter``.

        ``iter`` shadows the builtin of the same name; the parameter name is
        kept because callers pass it by keyword.

        Returns
        -------
        float
            A sample from the (possibly drifted) reward distribution of the arm.
        """
        # Once past the threshold, the pulled arm's mean takes a random-walk step.
        if iter > self._DRIFT_START_ITER:
            self.update_qs_generate_non_stationary_value(action)
        # np.random.normal expects a standard deviation, hence the square root.
        value = np.random.normal(self._mean[action], np.sqrt(self._variance[action]))
        if self._verbose:
            print(f"Got value = {value}")
        return value

    def update_qs_generate_non_stationary_value(self, action):
        """Shift the mean of ``action`` in place by one Gaussian step; returns None."""
        self._mean[action] += np.random.normal(0, self._DRIFT_STD)

    def update_internal_state(self, action, reward):
        """Record ``reward`` for ``action`` and update both value estimates.

        Both tables apply ``Q <- Q + (alpha / n) * (reward - Q)`` where ``n`` is
        the number of pulls of ``action`` so far: ``_q_fs`` evaluates it in
        weighted-sum form, ``_q_icr`` in incremental form.
        NOTE(review): a constant step-size rule would not divide by ``n`` --
        confirm which update is intended for the non-stationary case.
        """
        if self._verbose:
            print(f"Previous value fs = {self._q_fs[action]}")
            print(f"Previous value iqr = {self._q_icr[action]}")
            print(f"Previous reward num = {self._reward_num[action]}")

        self._reward_num[action] += 1

        self._q_fs[action] = ((self._q_fs[action] * (self._reward_num[action] - self._alpha)) + (self._alpha * reward)) / self._reward_num[action]

        self._q_icr[action] = self._q_icr[action] + ((self._alpha*(reward - self._q_icr[action])) / (self._reward_num[action]))

        if self._verbose:
            print(f"new_q_fs = {self._q_fs[action]}")
            print(f"new_q_iqr = {self._q_icr[action]}")

    def run(self, n_of_iter):
        """Play ``n_of_iter`` rounds, appending the running mean reward to ``_reward_hist``."""
        sum_rewards = 0
        for _i in range(n_of_iter):
            action = self.select_action()
            reward = self.generate_reward(action=action, iter=_i)
            sum_rewards += reward
            if self._verbose:
                print(f"sum rewards = {sum_rewards}")
            # Average reward over the _i + 1 rounds played so far.
            self._reward_hist.append(sum_rewards/(_i+1))

            self.update_internal_state(action=action, reward=reward)

    def summarize(self):
        """Print both value tables, the per-arm pull counts and the current arm means."""
        print("Summarize bandit")
        print("Q function and reward num:")
        print(self._q_fs)
        print(self._q_icr)
        print(self._reward_num)
        print("Mean changes")
        print(self._mean)

In [None]:
# Per-arm `mean` and `variance` arguments handed to every bandit (one entry per arm).
mean = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
variance = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
# Number of iterations each bandit is run for.
n_experiments = 100_000

# Bandit name -> value passed as the bandit's `alpha`.
n_exp = {'a': 0.1, 'b': 0.1, 'c': 0.1, 'd': 0.1, 'e': 0.1}
# Every bandit receives its own copy of the lists, so a drift applied during
# one run cannot leak into the shared `mean` list or into the other bandits.
bandits = {
    name: KArmedBandit_NS(
        n_of_arms=len(mean),
        mean=list(mean),
        variance=list(variance),
        epsilon=0.1,
        alpha=value,
        seed=42,
    )
    for name, value in n_exp.items()
}



In [None]:
# Start every bandit from a cleared state, then play n_experiments rounds with it.
for name in bandits:
    bandit = bandits[name]
    bandit.reset()
    bandit.run(n_experiments)

In [None]:
# Print a labelled summary for each bandit in turn.
for _k, _bandit in bandits.items():
    print(f"Bandit {_k} summary:")
    _bandit.summarize()
    


In [None]:
# Show the summary of bandit 'd' on its own.
bandit_d = bandits['d']
bandit_d.summarize()

In [None]:
import matplotlib.pyplot as plt

# Plot each bandit's reward history (the running average reward recorded after
# every iteration) as one coloured point series per bandit.
fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(111)
# One matplotlib format string per bandit; zip() stops at the shorter input,
# so bandits beyond the fifth would not be drawn.
col_list = ['ro', 'bo', 'yo', 'go', 'co']
for (name, bandit), col in zip(bandits.items(), col_list):
    ax.plot(bandit._reward_hist, col, markersize=2, label=f"bandit {name}")

ax.set_xlabel("iteration")
ax.set_ylabel("running average reward")
ax.set_title("Running average reward per bandit")
# Enlarge the legend markers; the plotted points are too small to read otherwise.
ax.legend(markerscale=4)
plt.show()

