In [1]:
import numpy as np
import gym
from gym import spaces


In [2]:
class KArmedBanditEnv(gym.Env):
    """Stateless k-armed Bernoulli bandit exposed through the gym ``Env`` API.

    Every arm gets a fixed success probability, sampled with ``rand`` when the
    environment is constructed. Pulling an arm yields a reward of 1.0 with that
    arm's probability and 0.0 otherwise.

    Args:
        k: Number of arms (size of the discrete action space).
        seed: Optional seed for a private ``np.random.RandomState`` that is
            used for both the arm probabilities and the reward draws. With the
            default ``None`` the global ``np.random`` state is used instead.
    """

    def __init__(self, k=10, seed=None):
        super().__init__()
        self.k = k
        self.action_space = spaces.Discrete(k)
        # The ``np.random`` module and a ``RandomState`` instance both expose
        # ``rand``, so the rest of the class can call ``self._rng.rand``
        # without caring which of the two it holds.
        self._rng = np.random if seed is None else np.random.RandomState(seed)
        self.probabilities = self._rng.rand(k)  # success probability of each arm

    def step(self, action):
        """Pull one arm.

        Args:
            action: Index of the arm to pull; must be contained in
                ``self.action_space``.

        Returns:
            A 4-tuple ``(action, reward, True, {})``: the action is echoed back
            in the observation slot, ``reward`` is the float 1.0 or 0.0, the
            third element is always ``True`` and the info dict is empty.

        Raises:
            AssertionError: If ``action`` is not a valid arm index.
        """
        assert self.action_space.contains(action), "invalid action: %r" % (action,)
        # Bernoulli draw; converted to float so the reward is numeric (1.0/0.0)
        # rather than a numpy boolean.
        reward = float(self._rng.rand() < self.probabilities[action])
        return action, reward, True, {}

    def reset(self):
        """Reset the environment; there is no state, so ``None`` is returned."""
        return None

In [3]:
def epsilon_greedy_policy(k, epsilon, q_values):
    """Choose an arm index with the epsilon-greedy rule.

    The epsilon-greedy strategy balances the trade-off between exploration
    (trying actions that may not be the best in order to gain more
    information) and exploitation (choosing the action whose current estimate
    is the best).

    Args:
        k: Number of arms; an exploratory pick is drawn via
            ``np.random.choice(k)``.
        epsilon: Probability of taking an exploratory action.
        q_values: Current value estimate for each arm.

    Returns:
        The index of the selected arm. Greedy picks come from ``np.argmax``,
        which returns the first index when several estimates tie.
    """
    explore = np.random.rand() < epsilon
    return np.random.choice(k) if explore else np.argmax(q_values)

In [6]:
# Environment and training setup
k = 10            # number of arms
episodes = 1000   # number of single-pull episodes to run
epsilon = 0.1     # probability of an exploratory action
SEED = 42

# Seed the global NumPy RNG before anything random happens so the run gives
# the same estimates on every Restart & Run All.
np.random.seed(SEED)

env = KArmedBanditEnv(k=k)
q_values = np.zeros(k)            # running mean reward per arm
counts = np.zeros(k, dtype=int)   # number of pulls per arm

for _ in range(episodes):
    action = epsilon_greedy_policy(k, epsilon, q_values)
    _, reward, _, _ = env.step(action)
    counts[action] += 1
    # Incremental sample mean: Q_n = Q_{n-1} + (r_n - Q_{n-1}) / n
    q_values[action] += (reward - q_values[action]) / counts[action]

print("Estimated values:", q_values)

Estimated values: [0.89368771 0.78571429 0.85714286 0.76190476 0.77777778 0.33333333
 0.4        0.         0.72727273 0.22222222]
