In [7]:
import numpy as np

# Seed NumPy's global random generator so the rewards drawn through np.random
# are the same on every Restart Kernel -> Run All
SEED = 42
np.random.seed(SEED)

# True mean reward of each arm/action. The number of arms is derived from this
# list so the two can never disagree when arms are added or removed.
true_reward_values = [1.5, 2.0, 1.8, 2.2, 1.9]
num_arms = len(true_reward_values)

In [8]:
def choose_action(action_values):
    """Pick the action whose current value estimate is the largest (greedy).

    When several actions share the maximum estimate, the one with the lowest
    index is returned, because that is how NumPy's argmax resolves ties.

    Args:
        action_values (list or numpy.ndarray): Estimated value of each action.

    Returns:
        int: Index of the greedy action (a NumPy integer scalar).
    """
    estimates = np.asarray(action_values)
    greedy_index = estimates.argmax()
    return greedy_index

In [9]:
def get_reward(action):
    """Draw a noisy reward for the given action.

    The reward is sampled from a normal distribution centred on the action's
    entry in ``true_reward_values`` with a standard deviation of 1.0.

    Args:
        action (int): Index of the chosen action.

    Returns:
        float: The sampled reward.
    """
    mean_reward = true_reward_values[action]
    return np.random.normal(loc=mean_reward, scale=1.0)

In [10]:
# Start with a value estimate of zero for every arm (one float entry per arm)
estimated_action_values = np.zeros(shape=num_arms, dtype=float)

estimated_action_values

array([0., 0., 0., 0., 0.])

In [11]:
# Run the greedy agent for a fixed number of steps
num_steps = 1000

# Start from fresh state so that re-running this cell repeats the same
# experiment instead of continuing from the previous run's estimates
estimated_action_values = np.zeros(num_arms)
# N(a): number of times each arm has been pulled so far
action_counts = np.zeros(num_arms, dtype=int)

for step in range(num_steps):
    # Select an action using the greedy method
    chosen_action = choose_action(estimated_action_values)

    # Get the reward for the chosen action
    reward = get_reward(chosen_action)

    # Incremental sample-average update: Q(a) <- Q(a) + (R - Q(a)) / N(a).
    # The step size must use the pull count of the chosen arm, not the global
    # step index; otherwise the estimate is not the mean of that arm's rewards
    # whenever more than one arm has been selected.
    action_counts[chosen_action] += 1
    estimated_action_values[chosen_action] += (
        reward - estimated_action_values[chosen_action]
    ) / action_counts[chosen_action]

![image.png](attachment:b458b279-607c-400b-a8e6-2ae144169332.png)

In [12]:
# Print the estimated action values
print("Estimated action values:", estimated_action_values)

Estimated action values: [1.47876476 0.         0.         0.         0.        ]
