In [1]:
# Colab-specific setup: mount Google Drive at /content/drive so that the
# following cells can work inside the Drive folder.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Make the root of the mounted Drive the working directory.
%cd /content/drive/MyDrive/

/content/drive/MyDrive


In [6]:
# Print the working directory to confirm the change made by the previous cell.
!pwd

/content/drive/MyDrive


In [7]:
# Create a Git repository in the `reinforcement-learning` subdirectory of the
# current working directory.
!git init reinforcement-learning

Initialized empty Git repository in /content/drive/MyDrive/reinforcement-learning/.git/


In [8]:
# List the working directory, hidden entries included (-a), to check that the
# repository folder is present.
%ls -a

[0m[01;34m'Colab Notebooks'[0m/           [01;34m'Markov Decision Processes'[0m/
 [01;34mdeep-learning-python-book[0m/   [01;34mPNS[0m/
 [01;34mdynamical-systems[0m/           [01;34mreinforcement-learning[0m/


In [9]:
# Move into the repository folder (path is relative to the current directory).
%cd reinforcement-learning

/content/drive/MyDrive/reinforcement-learning


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# A/B/n Testing

We first define a function factory: given a mean, a standard deviation, and a random number generator, it returns a zero-argument function that draws one normal variate with those parameters each time it is called:

In [None]:
def normal_generator(mean, stdev, rng):
    """Build a zero-argument sampler for a normal distribution.

    Parameters
    ----------
    mean
        First argument forwarded to ``rng.normal`` on every draw.
    stdev
        Second argument forwarded to ``rng.normal`` on every draw.
    rng
        Object exposing ``normal(mean, stdev)`` (the notebook passes the
        result of ``np.random.default_rng``). It is captured by reference,
        so samplers built from the same ``rng`` share its state.

    Returns
    -------
    callable
        Function taking no arguments that returns ``rng.normal(mean, stdev)``.
    """
    # The closure defers the draw until the sampler is actually called.
    return lambda: rng.normal(mean, stdev)

Let's now implement A/B/n Testing.

We first simulate the game for a fixed number of steps, choosing an action uniformly at random at each step and keeping a running average of the rewards observed for each action.

In [None]:
# Restart random number generator so the cell is reproducible on its own
rng = np.random.default_rng(seed=42)

# Define array of actions: one (mean, stdev) pair per action
means = [1.1, 2.3, 2.2, 1.2, 2.1]
stdevs = [1, 1, 1, 1, 1]
assert len(means) == len(stdevs), 'means and stdevs must have the same length'
# Derive the number of actions from the parameter lists so that editing them
# cannot leave the count out of sync.
nactions = len(means)
reward_generators = [normal_generator(mean, stdev, rng) for mean, stdev in zip(means, stdevs)]

# Initialize simulation
nsteps = 1000
action_values = np.zeros(nactions, dtype=np.float64)  # running mean reward per action
action_counts = np.zeros(nactions, dtype=np.int64)    # times each action was taken

# Run simulation: every step picks an action uniformly at random
for _ in range(nsteps):
    action = rng.integers(nactions)
    reward = reward_generators[action]()
    action_counts[action] += 1
    # Incremental mean update: Q_n = Q_{n-1} + (r_n - Q_{n-1}) / n
    action_values[action] += (reward - action_values[action]) / action_counts[action]

print('Training results')
for i in range(nactions):
    print(f'Action {i}: Count: {action_counts[i]} Value: {action_values[i]}')

# An action that was never sampled still holds its initial 0.0, which is a
# placeholder rather than an estimate; exclude such actions from the ranking
# so they cannot win when nsteps is small or all estimates are negative.
sampled_values = np.where(action_counts > 0, action_values, -np.inf)
best_action = np.argmax(sampled_values)
best_value = action_values[best_action]
print(f'Best action: {best_action} Value: {best_value}')

Training results
Action 0: Count: 198 Value: 1.0794119031506175
Action 1: Count: 204 Value: 2.271695036128357
Action 2: Count: 204 Value: 2.1292896561988575
Action 3: Count: 201 Value: 1.154292804913939
Action 4: Count: 193 Value: 2.1130371998782436
Best action: 1 Value: 2.271695036128357


Things to experiment with:

- Run the code above with different values of the seed. Is there variation in the best action selected?

- Run the code with different values of the means and standard deviations.

- Is there a minimum number of steps that reliably selects the best action for any values of the parameters?

(It is possible to give a precise answer to this question; it depends on the values of the standard deviations and on how close the means are to each other. One way to get a feeling for this experimentally is to compare only two actions with close means and different standard deviations.)

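# $\epsilon$-Greedy Actions

At each step, with probability $\epsilon$ we explore by choosing an action uniformly at random; otherwise we exploit by choosing the action with the highest current value estimate.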

In [None]:
# Restart random number generator so the cell is reproducible on its own
rng = np.random.default_rng(seed=2048)

# Define array of actions: one (mean, stdev) pair per action
means = [1.1, 2.3, 2.2, 1.2, 2.1]
stdevs = [1, 1, 1, 1, 1]
assert len(means) == len(stdevs), 'means and stdevs must have the same length'
# Derive the number of actions from the parameter lists so that editing them
# cannot leave the count out of sync.
nactions = len(means)
reward_generators = [normal_generator(mean, stdev, rng) for mean, stdev in zip(means, stdevs)]

# Initialize simulation
eps = 0.001    # probability of taking a uniformly random (exploratory) action
nsteps = 10000
action_values = np.zeros(nactions, dtype=np.float64)  # running mean reward per action
action_counts = np.zeros(nactions, dtype=np.int64)    # times each action was taken

# Run simulation
for _ in range(nsteps):
    if rng.random() < eps:
        # Explore: pick any action uniformly at random
        action = rng.integers(nactions)
    else:
        # Exploit: pick the action with the highest current estimate
        # (np.argmax returns the lowest index on ties)
        action = np.argmax(action_values)
    reward = reward_generators[action]()
    action_counts[action] += 1
    # Incremental mean update: Q_n = Q_{n-1} + (r_n - Q_{n-1}) / n
    action_values[action] += (reward - action_values[action]) / action_counts[action]

print('Training results')
for i in range(nactions):
    print(f'Action {i}: Count: {action_counts[i]} Value: {action_values[i]}')

# With a small eps some actions may never be tried; their value is still the
# initial 0.0, which is a placeholder rather than an estimate. Exclude them
# from the ranking so an untried action cannot be reported as the best one.
sampled_values = np.where(action_counts > 0, action_values, -np.inf)
best_action = np.argmax(sampled_values)
best_value = action_values[best_action]
print(f'Best action: {best_action} Value: {best_value}')


Training results
Action 0: Count: 5 Value: 0.33055188588313
Action 1: Count: 9989 Value: 2.2861703712256958
Action 2: Count: 5 Value: 2.112255655820844
Action 3: Count: 1 Value: 1.2547221814541167
Action 4: Count: 0 Value: 0.0
Best action: 1 Value: 2.2861703712256958
