In [12]:
import numpy as np
import random

In [13]:
class MultiArmedBandit:
    def __init__(self, k):
        self.k = k
        self.probs = np.random.rand(k)  # Random probabilities for each arm
    
    def reset(self):
        return 0
    
    def step(self, action):
        reward = 1 if random.random() < self.probs[action] else 0
        return 0, reward, False, False, {}

# Initialize the multi-armed bandit environment
env_bandit = MultiArmedBandit(k=10)


In [14]:
def q_learning_bandit(env, episodes=1000, alpha=0.1, gamma=0.99, epsilon=0.1,n_steps=500):
    Q = np.zeros(env.k)
    action_counts = np.zeros(env.k)
    for _ in range(episodes):
        state = env.reset()
        for _ in range(n_steps):
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(env.k))
            else:
                action = np.argmax(Q)
            _, reward, _, _, _ = env.step(action)
            Q[action] = Q[action] + alpha * (reward - Q[action])
            action_counts[action] += 1
    policy = np.argmax(Q)
    return policy, Q

policy_bandit_ql, Q_table_bandit = q_learning_bandit(env_bandit)
print("Optimal Policy (Bandit - Q-Learning):", policy_bandit_ql)
print("Q-Table (Bandit - Q-Learning):", Q_table_bandit)


Optimal Policy (Bandit - Q-Learning): 8
Q-Table (Bandit - Q-Learning): [0.5592774  0.34444699 0.32092182 0.40638371 0.72993786 0.48354923
 0.39438064 0.31653592 0.92905259 0.71095371]


In [15]:
def epsilon_greedy_bandit(Q, epsilon):
    if random.uniform(0, 1) < epsilon:
        return np.random.choice(len(Q))
    else:
        return np.argmax(Q)

# Example usage
action = epsilon_greedy_bandit(Q_table_bandit, epsilon=0.1)
print("Selected Action (Bandit - Epsilon-Greedy):", action)


Selected Action (Bandit - Epsilon-Greedy): 6


In [16]:
def ucb_selection_bandit(Q, N, c=1):
    total_counts = np.sum(N) + 1e-10
    ucb_values = Q + c * np.sqrt(np.log(total_counts) / (N + 1e-10))
    return np.argmax(ucb_values)

def ucb_learning_bandit(env, episodes=1000, alpha=0.1, gamma=0.99, c=1,n_steps=500):
    Q = np.zeros(env.k)
    N = np.zeros(env.k)
    for _ in range(episodes):
        state = env.reset()
        for _ in range(n_steps):
            action = ucb_selection_bandit(Q, N, c)
            _, reward, _, _, _ = env.step(action)
            Q[action] = Q[action] + alpha * (reward - Q[action])
            N[action] += 1
    policy = np.argmax(Q)
    return policy, Q

policy_bandit_ucb, Q_table_bandit = ucb_learning_bandit(env_bandit)
print("Optimal Policy (Bandit - UCB):", policy_bandit_ucb)
print("Q-Table (Bandit - UCB):", Q_table_bandit)


  ucb_values = Q + c * np.sqrt(np.log(total_counts) / (N + 1e-10))


Optimal Policy (Bandit - UCB): 8
Q-Table (Bandit - UCB): [0.28186689 0.17360517 0.28155902 0.16992931 0.39260842 0.35092359
 0.30876304 0.23070885 0.96974403 0.47365267]
