In [None]:
import numpy as np
import copy

In [None]:
class Environment:
    """Reward source for a multi-armed bandit whose arm means can drift.

    Each arm ``a`` pays a reward drawn from ``rng.normal(mean[a], variance[a])``.
    Calling :meth:`add_noise` perturbs every arm mean with zero-centred
    Gaussian noise of scale ``deturpation``.

    Parameters
    ----------
    mean : sequence of float
        Initial mean reward of each arm.
    variance : sequence of float
        Per-arm spread. It is passed unchanged as the ``scale`` argument of
        ``Generator.normal``, which NumPy interprets as a standard deviation.
        NOTE(review): confirm callers supply standard deviations here.
    deturpation : float
        Scale of the Gaussian noise added to every mean by :meth:`add_noise`.
    seed : int
        Seed for the private random generator; :meth:`reset` re-seeds with it.
    """

    def __init__(self, mean, variance, deturpation, seed):
        self._seed = seed
        self._deturpation = deturpation
        # Keep private copies so later mutation of the caller's lists cannot
        # change what reset() restores.
        self._mean_bkp = copy.deepcopy(mean)
        self._variance_bkp = copy.deepcopy(variance)
        # Single source of truth for the mutable state: __init__ and reset()
        # can no longer drift apart.
        self.reset()

    def reset(self):
        """Restore the initial means/spreads, re-seed the RNG, clear history."""
        self._mean = copy.deepcopy(self._mean_bkp)
        # Restore from the variance backup (previously this copied the means).
        self._variance = copy.deepcopy(self._variance_bkp)
        self._rng = np.random.default_rng(self._seed)
        self._hist_best_reward = []

    def add_noise(self):
        """Add independent N(0, deturpation) noise to every arm mean."""
        # One scalar draw per arm, in arm order, to keep the random stream
        # identical to a plain sequential implementation.
        self._mean = [i + self._rng.normal(0, self._deturpation) for i in self._mean]

    def generate_reward(self, action):
        """Return a reward for ``action`` and log the current best arm mean.

        Parameters
        ----------
        action : int
            Index of the arm being pulled.

        Returns
        -------
        float
            Sample from ``normal(mean[action], variance[action])``.
        """
        # Track the highest current mean so plots can compare against it.
        self._hist_best_reward.append(np.max(self._mean))
        return self._rng.normal(self._mean[action], self._variance[action])
    
    

In [None]:
class KArmedBandit_NS:
    """Epsilon-greedy agent for a k-armed bandit whose rewards may drift.

    Parameters
    ----------
    n_of_arms : int
        Number of arms (actions) available.
    stationarity_time : int
        ``run`` calls ``env.add_noise()`` before every step whose index is
        strictly greater than this value.
    env : object
        Reward source exposing ``generate_reward(action)`` and ``add_noise()``
        (and ``_mean`` when ``verbose`` is enabled).
    epsilon : float
        Probability threshold for exploration: a uniform draw ``> epsilon``
        selects the greedy arm, otherwise a random arm is chosen.
    alpha : float
        Step-size factor used in the action-value update.
    k : int
        Size of the sliding window for the recent-reward average; must be >= 1.
    seed : int
        Seed for the agent's private random generator.
    verbose : bool, optional
        When True, emit the per-step debug prints. Defaults to False because
        a long run otherwise produces several output lines per iteration.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 (the sliding window would be empty).
    """

    def __init__(self, n_of_arms, stationarity_time, env, epsilon, alpha, k, seed, verbose=False):
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")
        self._n_of_arms = n_of_arms
        self._stationarity_time = stationarity_time
        self._env = env
        self._epsilon = epsilon
        self._alpha = alpha
        self._seed = seed
        self._k = k
        self._verbose = verbose
        self._rng = np.random.default_rng(self._seed)
        self.reset()

    def reset(self):
        """Clear action-value estimates, pull counts and all reward histories."""
        self._q_icr = [0 for _ in range(self._n_of_arms)]
        self._reward_num = [0 for _ in range(self._n_of_arms)]
        self._mean_reward_hist = []
        self._mean_reward_hist_last_k = []
        self._reward_hist = []
        self._last_k_reward_hist = []

    def select_action(self):
        """Pick an arm: greedy with respect to ``_q_icr``, or random.

        Returns
        -------
        int
            Index of the selected arm. Ties in the greedy branch resolve to
            the lowest index (``np.argmax`` behaviour).
        """
        if self._rng.uniform() > self._epsilon:
            action = np.argmax(self._q_icr)
        else:
            action = self._rng.choice(self._n_of_arms)
        return action

    def generate_reward(self, action):
        """Ask the environment for the reward of ``action`` and return it."""
        value = self._env.generate_reward(action)
        if self._verbose:
            print(self._env._mean)
            print(f"Got value = {value}")
        return value

    def update_internal_state(self, action, reward):
        """Update the pull count and value estimate of ``action``.

        Parameters
        ----------
        action : int
            Arm that was pulled.
        reward : float
            Reward obtained for that pull.
        """
        if self._verbose:
            print(f"Previous value iqr = {self._q_icr[action]}")
            print(f"Previous reward num = {self._reward_num[action]}")

        self._reward_num[action] += 1

        # NOTE(review): the effective step size is alpha / n, so it shrinks as
        # the arm is pulled more often. A constant step (alpha alone) is the
        # usual choice for tracking non-stationary rewards -- confirm which
        # rule is intended before changing it.
        self._q_icr[action] = self._q_icr[action] + (
            (self._alpha * (reward - self._q_icr[action])) / (self._reward_num[action])
        )

        if self._verbose:
            print(f"new_q_iqr = {self._q_icr[action]}")

    def run(self, n_of_iter):
        """Interact with the environment for ``n_of_iter`` steps.

        Each step appends to ``_reward_hist`` (raw reward),
        ``_mean_reward_hist`` (mean reward since the start of this call) and
        ``_mean_reward_hist_last_k`` (mean of the most recent ``k`` rewards).

        Parameters
        ----------
        n_of_iter : int
            Number of interaction steps to perform.
        """
        sum_rewards = 0
        for _i in range(n_of_iter):

            # Past the stationary phase the arm means drift on every step.
            if _i > self._stationarity_time:
                self._env.add_noise()

            action = self.select_action()
            reward = self.generate_reward(action=action)
            sum_rewards += reward
            if self._verbose:
                print(f"sum rewards = {sum_rewards}")

            # Record the raw reward so the history is actually populated.
            self._reward_hist.append(reward)
            self._mean_reward_hist.append(sum_rewards / (_i + 1))

            # Sliding window of the latest k rewards: drop the oldest entry
            # only once the window exceeds k, so it holds exactly k values.
            self._last_k_reward_hist.append(reward)
            if len(self._last_k_reward_hist) > self._k:
                self._last_k_reward_hist.pop(0)

            self._mean_reward_hist_last_k.append(np.average(self._last_k_reward_hist))
            self.update_internal_state(action=action, reward=reward)

    def summarize(self):
        """Print the current value estimates and per-arm pull counts."""
        print("Summarize bandit")
        print("Q function and reward num:")
        print(self._q_icr)
        print(self._reward_num)
        print("Mean changes")

In [None]:
# Initial mean reward per arm: every arm pays 1 except arm 3, which pays 2.
mean = [1] * 10
mean[3] = 2
# Per-arm spread handed to the environment; all zeros here.
variance = [0] * 10
# Number of interaction steps each bandit performs.
n_experiments = 30_000

# Alternative, more exploratory grid kept for reference:
# n_exp = {'a': 0.8, 'b': 0.7, 'c': 0.6, 'd': 0.5, 'e': 0.2}

# Bandit label -> epsilon used by that bandit.
n_exp = {'a': 0.4, 'b': 0.3, 'c': 0.2, 'd': 0.15, 'e': 0.1}

# One identically seeded environment per bandit.
envs = {
    name: Environment(mean, variance, deturpation=0.1, seed=42)
    for name in n_exp
}
# Bandits differ only in epsilon.
bandits = {
    name: KArmedBandit_NS(
        n_of_arms=10,
        stationarity_time=200,
        env=envs[name],
        epsilon=value,
        alpha=0.01,
        k=200,
        seed=42,
    )
    for name, value in n_exp.items()
}



In [None]:
# Run every bandit for the full horizon, then keep a reference to the arm
# means its environment ended up with.
d = dict()
for name in bandits:
    bandit = bandits[name]
    bandit.run(n_experiments)
    d.update({name: envs[name]._mean})
    
    

In [None]:
# Show only the size and the tail of the per-step reward history of bandit 'e';
# printing the whole list would flood the cell output on long runs.
print(len(bandits['e']._reward_hist))
print(bandits['e']._reward_hist[-10:])

In [None]:
# for _k in bandits:
#     print(f"Bandit {_k} summary:")
#     bandits[_k].summarize()
    

In [None]:
import matplotlib.pyplot as plt

# Mean reward since the start of the run, one series per bandit.
fig, ax = plt.subplots(figsize=(15, 10))
col_list = ['ro', 'bo', 'yo', 'go', 'co']
for (name, bandit), col in zip(bandits.items(), col_list):
    ax.plot(bandit._mean_reward_hist, col, markersize=2,
            label=f"bandit {name} (epsilon = {bandit._epsilon})")
# Highest arm mean at each step as recorded by environment 'a'; drawn in black
# so it cannot be confused with the green series of bandit 'd'.
ax.plot(envs['a']._hist_best_reward, 'k+', markersize=1,
        label="best arm mean (env 'a')")
ax.set_xlabel("iteration")
ax.set_ylabel("mean reward since start")
ax.set_title("Cumulative mean reward per bandit")
# Enlarge legend markers: the plotted ones are only 1-2 points wide.
ax.legend(markerscale=4);



In [None]:
import matplotlib.pyplot as plt

# Mean of the most recent rewards (sliding window), one series per bandit.
fig, ax = plt.subplots(figsize=(15, 10))
col_list = ['ro', 'bo', 'yo', 'go', 'co']
for (name, bandit), col in zip(bandits.items(), col_list):
    ax.plot(bandit._mean_reward_hist_last_k, col, markersize=2,
            label=f"bandit {name} (epsilon = {bandit._epsilon})")

# Highest arm mean at each step as recorded by environment 'a'; drawn in black
# so it cannot be confused with the green series of bandit 'd'.
ax.plot(envs['a']._hist_best_reward, 'k+', markersize=1,
        label="best arm mean (env 'a')")
ax.set_xlabel("iteration")
ax.set_ylabel("mean reward over the sliding window")
ax.set_title("Sliding-window mean reward per bandit")
# Enlarge legend markers: the plotted ones are only 1-2 points wide.
ax.legend(markerscale=4);

