In [3]:
import numpy as np
import random

In [65]:
class K_Bandit:
    def __init__(self, k):
        self.k = k
        self.reward = np.zeros(k)
        self.action_time = np.zeros(k)
        self.award = np.random.uniform(0, 10, k)
    
    def choose_action(self, i):
        if i < 0 or i >= self.k:
            raise ValueError(f"Change value of i. Should be greater than 0 and smaller than {self.k-1}")
        
        curr_award = self.award[i]
        print(curr_award)
        self.action_time[i] += 1
        self.reward[i] += (curr_award - self.reward[i])/self.action_time[i]
        return self.reward[i]

bandit = K_Bandit(5)
print(bandit.award)
epsilon = 0.5
total_reward = 0
x = 0

while x < 100:
    if epsilon < np.random.uniform(0, 1):
        action = np.argmax(bandit.reward)
    else:
        action = random.randint(0, bandit.k-1)

    reward = bandit.choose_action(action)
    print(reward, "reward")
    total_reward += reward
    
    if x % 10 == 0 and x > 0:
        epsilon = max(epsilon-0.1, 0.1)
        print(f"Epsilon Value: {epsilon}, Total_reward = {total_reward}")
        total_reward = 0

    x += 1

[7.11904463 9.42289893 9.17959535 5.4301468  9.94338131]
9.4228989280798
9.4228989280798 reward
9.4228989280798
9.4228989280798 reward
9.4228989280798
9.4228989280798 reward
9.4228989280798
9.4228989280798 reward
7.119044629794447
7.119044629794447 reward
9.4228989280798
9.4228989280798 reward
9.4228989280798
9.4228989280798 reward
9.4228989280798
9.4228989280798 reward
9.943381309671432
9.943381309671432 reward
9.943381309671432
9.943381309671432 reward
9.943381309671432
9.943381309671432 reward
Epsilon Value: 0.4, Total_reward = 102.90948105536737
9.943381309671432
9.943381309671432 reward
9.4228989280798
9.4228989280798 reward
9.943381309671432
9.943381309671432 reward
9.943381309671432
9.943381309671432 reward
7.119044629794447
7.119044629794447 reward
9.943381309671432
9.943381309671432 reward
9.943381309671432
9.943381309671432 reward
9.179595354213046
9.179595354213046 reward
9.943381309671432
9.943381309671432 reward
9.943381309671432
9.943381309671432 reward
Epsilon Value: 0.3

In [63]:
class K_Bandit:
    def __init__(self, k):
        self.k = k
        self.reward = np.zeros(k)
        self.action_count = np.zeros(k)
        self.true_values = np.random.uniform(0, 10, k)
    
    def choose_action(self, i):
        if i < 0 or i >= self.k:
            raise ValueError(f"Invalid action. Should be between 0 and {self.k-1}")

        curr_reward = self.true_values[i]
        # curr_reward = np.random.normal(self.true_values[i], 1)
        self.action_count[i] += 1
        self.reward[i] += (curr_reward - self.reward[i]) / self.action_count[i]
        return curr_reward

bandit = K_Bandit(10)
epsilon = 0.5
total_reward = 0
x = 0

while x < 1000:
    if np.random.uniform(0, 1) > epsilon:
        action = int(np.argmax(bandit.reward))
    else:
        action = random.randint(0, bandit.k-1)
    
    reward = bandit.choose_action(action)
    total_reward += reward
    
    if x % 100 == 0 and x > 0:
        epsilon = max(0, epsilon - 0.1)
        print(f"Step {x}, Epsilon: {epsilon}, Total Reward: {total_reward}")
        total_reward = 0
    x += 1

print(f"Final Total Reward: {total_reward}")
print(f"Estimated Action Values: {bandit.reward}")
print(f"True Action Values: {bandit.true_values}")

Step 100, Epsilon: 0.4, Total Reward: 704.74137538508
Step 200, Epsilon: 0.30000000000000004, Total Reward: 729.5166779342757
Step 300, Epsilon: 0.20000000000000004, Total Reward: 746.2462806847647
Step 400, Epsilon: 0.10000000000000003, Total Reward: 851.9879396580461
Step 500, Epsilon: 2.7755575615628914e-17, Total Reward: 925.9085035655459
Step 600, Epsilon: 0, Total Reward: 999.064118698206
Step 700, Epsilon: 0, Total Reward: 999.064118698206
Step 800, Epsilon: 0, Total Reward: 999.064118698206
Step 900, Epsilon: 0, Total Reward: 999.064118698206
Final Total Reward: 989.0734775112239
Estimated Action Values: [3.6796859  0.15936856 7.8565172  9.99064119 3.50755076 8.0402681
 0.77729097 0.06787444 0.77944542 7.88534789]
True Action Values: [3.6796859  0.15936856 7.8565172  9.99064119 3.50755076 8.0402681
 0.77729097 0.06787444 0.77944542 7.88534789]


In [66]:
class K_Bandit:
    def __init__(self, k):
        self.k = k
        self.reward = np.zeros(k)
        self.action_time = np.zeros(k)
        self.award = np.random.uniform(0, 10, k)
    
    def choose_action(self, i):
        if i < 0 or i >= self.k:
            raise ValueError(f"Change value of i. Should be greater than 0 and smaller than {self.k-1}")
        
        curr_award = self.award[i]
        print(curr_award)
        self.action_time[i] += 1
        self.reward[i] += (curr_award - self.reward[i])/self.action_time[i]
        return self.reward[i]

bandit = K_Bandit(5)
print(bandit.award)
epsilon = 0.5
total_reward = 0
x = 0

while x < 100:
    if epsilon < np.random.uniform(0, 1):
        action = np.argmax(bandit.reward)
    else:
        action = random.randint(0, bandit.k-1)

    reward = bandit.choose_action(action)
    print(reward, "reward")
    total_reward += reward
    
    if x % 10 == 0 and x > 0:
        epsilon = max(epsilon-0.1, 0.1)
        print(f"Epsilon Value: {epsilon}, Total_reward = {total_reward}")
        total_reward = 0

    x += 1

[9.64860702 8.3157623  4.47784347 8.57270475 4.06787045]
9.64860701774524
9.64860701774524 reward
8.572704750000392
8.572704750000392 reward
9.64860701774524
9.64860701774524 reward
4.477843466186965
4.477843466186965 reward
9.64860701774524
9.64860701774524 reward
9.64860701774524
9.64860701774524 reward
9.64860701774524
9.64860701774524 reward
9.64860701774524
9.64860701774524 reward
9.64860701774524
9.64860701774524 reward
8.315762300018624
8.315762300018624 reward
9.64860701774524
9.64860701774524 reward
Epsilon Value: 0.4, Total_reward = 98.55516665816789
9.64860701774524
9.64860701774524 reward
9.64860701774524
9.64860701774524 reward
9.64860701774524
9.64860701774524 reward
9.64860701774524
9.64860701774524 reward
9.64860701774524
9.64860701774524 reward
8.572704750000392
8.572704750000392 reward
9.64860701774524
9.64860701774524 reward
4.06787044727963
4.06787044727963 reward
9.64860701774524
9.64860701774524 reward
9.64860701774524
9.64860701774524 reward
Epsilon Value: 0.3000

In [6]:
class K_Bandit:
    def __init__(self, k):
        self.k = k
        self.reward = np.zeros(k)
        self.action_count = np.zeros(k)
        self.true_values = np.random.uniform(0, 10, k)
    
    def choose_action(self, i):
        if i < 0 or i >= self.k:
            raise ValueError(f"Invalid action. Should be between 0 and {self.k-1}")

        curr_reward = np.random.normal(self.true_values[i], 1)
        self.action_count[i] += 1
        self.reward[i] += (curr_reward - self.reward[i]) / self.action_count[i]
        return curr_reward

bandit = K_Bandit(10)
epsilon = 0.5
total_reward = 0
x = 0

while x < 10000:
    if np.random.uniform(0, 1) > epsilon:
        action = int(np.argmax(bandit.reward))
    else:
        action = random.randint(0, bandit.k-1)
    
    reward = bandit.choose_action(action)
    total_reward += reward
    
    if x % 1000 == 0 and x > 0:
        epsilon = max(0, epsilon - 0.1)
        print(f"Step {x}, Epsilon: {epsilon}, Total Reward: {total_reward}")
        total_reward = 0
    x += 1

print(f"Final Total Reward: {total_reward}")
print(f"Estimated Action Values: {bandit.reward}")
print(f"True Action Values: {bandit.true_values}")

Step 1000, Epsilon: 0.4, Total Reward: 6598.192809726761
Step 2000, Epsilon: 0.30000000000000004, Total Reward: 6885.546790207848
Step 3000, Epsilon: 0.20000000000000004, Total Reward: 7410.844338218756
Step 4000, Epsilon: 0.10000000000000003, Total Reward: 7755.7454247184205
Step 5000, Epsilon: 2.7755575615628914e-17, Total Reward: 8026.398447989656
Step 6000, Epsilon: 0, Total Reward: 8465.285458199001
Step 7000, Epsilon: 0, Total Reward: 8401.915087648144
Step 8000, Epsilon: 0, Total Reward: 8476.275734055453
Step 9000, Epsilon: 0, Total Reward: 8434.135161935064
Final Total Reward: 8461.190119353605
Estimated Action Values: [6.35488565 5.08383804 5.39820058 7.99469758 0.94107531 5.21360475
 6.49058107 8.45256167 0.33290124 1.74999217]
True Action Values: [6.37600095 5.13028371 5.39360179 7.99882033 1.00388092 5.23518856
 6.56998273 8.44459512 0.40265323 1.73427946]


In [68]:
print(9**100/100**100)

2.6561398887587476e-105
