# COMP 579 Assignment 1

1. Do Not Change the Random Seed
The random seed has been set to ensure reproducibility. Please do not modify it.

2. Guidance for the First Question
For the initial question, fill in the blanks under the sections marked as TODO. Follow the provided structure and complete the missing parts.

3. Approach for Subsequent Questions
For the later questions, we expect you to attempt the solutions independently. You can refer to the examples provided in earlier questions to understand how to
plot figures and implement solutions.

4. Ensure that the plots you produce for later questions are similar in style and format to those shown in the previous examples.

In [178]:
%matplotlib inline
import random
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy.special import softmax
from IPython.core.debugger import set_trace
np.random.seed(40)

plt.rcParams["figure.figsize"]=10,5

## Q1 Simulator for Gaussian Bandit

In [179]:

class GaussianBandit:
  """
    Multi-armed bandit whose arms pay out Gaussian-distributed rewards.

    Each arm has its own mean reward; a single variance is shared by all arms.

    Attributes
    ----------
    num_arms : int
        Number of arms in the bandit.
    mean : list or np.ndarray
        Mean reward of each arm, indexed by arm.
    variance : float
        Variance of the reward distribution, common to all arms.

    Methods
    -------
    sample(arm_index)
        Draws one reward from the Gaussian distribution of the given arm.
    """

  def __init__(self, num_arms, mean, variance):
    # The configuration is stored as given; nothing is copied or validated.
    self.num_arms, self.mean, self.variance = num_arms, mean, variance

  def sample(self, arm_index):
    """Draw a single reward for `arm_index` from N(mean[arm_index], variance)."""
    # np.random.normal takes the standard deviation, not the variance.
    std_dev = np.sqrt(self.variance)
    return np.random.normal(loc=self.mean[arm_index], scale=std_dev)


In [180]:
# Q1 setup: a three-armed Gaussian bandit with means 0.5, 0.5 - delta and 0.5 + delta.
delta = 0.2
num_arms = 3
means = np.array([0.5, 0.5-delta, 0.5+delta])
variance = 0.01
num_samples = 50

three_arm_gaussian_bandit = GaussianBandit(num_arms=num_arms, mean=means, variance=variance)

actions = range(num_arms)

# action_rewards[a] holds num_samples rewards drawn from arm a.
# Arms are sampled one after the other (all draws for arm 0 first), so the
# order in which the global NumPy random state is consumed stays fixed.
action_rewards = []
for action in actions:
    rewards = [three_arm_gaussian_bandit.sample(action) for _ in range(num_samples)]
    action_rewards.append(rewards)


### Graphs

In [181]:
# One figure per arm: the raw reward samples together with the arm's true mean
# and the empirical mean of the samples.
for action in actions:
  fig, ax = plt.subplots()
  ax.set_xlabel("sample number")
  ax.set_ylabel("reward value")
  ax.set_title("Sample reward, estimated and true expected reward over 50 samples for action %s" %action, y=-0.2)

  true_value = means[action]
  estimated_value = np.mean(action_rewards[action])

  # horizontal reference lines for the true and the estimated value
  line_true_val = ax.axhline(y = true_value, color = 'b', linestyle = ':', label = "true value")
  line_est_val = ax.axhline(y = estimated_value, color = 'r', linestyle = '--', label = "estimated value")

  # ax.plot returns a list with one line per data series; there is a single series here
  plt_samples = ax.plot(action_rewards[action], 'o', label = "reward samples")[0]

  # the legend lists the three artists in a fixed order
  ax.legend(handles=[line_true_val, line_est_val, plt_samples])

## Q2 Estimated Q values

In [182]:
def update(reward_samples, alpha):
  """
  Generator over the running estimate of a reward stream with a fixed learning rate, alpha.

  One value is yielded per sample. The first value is alpha * reward_samples[0]
  (i.e. the estimate starts from 0); every later value is
  prev_val + alpha * (reward_samples[i] - prev_val), where prev_val is the value
  yielded just before. An empty input yields nothing.

  Parameters
  ----------
  reward_samples : sequence of int or float
      samples of reward values from one arm of a bandit; must support len() and
      integer indexing. Each sample is read only when its estimate is requested.
  alpha : float
      learning rate parameter for the averaging
  """
  estimate = None
  for idx in range(len(reward_samples)):
    sample = reward_samples[idx]
    if idx:
      # move the estimate a fraction alpha of the way towards the new sample
      estimate = estimate + alpha*(sample - estimate)
    else:
      estimate = alpha*sample
    yield estimate

def updateAvg(reward_samples):

  """
  Generator over the incremental (sample) average of a reward stream.

  One value is yielded per sample: the first is reward_samples[0], the second is
  the average of reward_samples[0] and reward_samples[1], and so on. The average
  is updated incrementally with step size 1/(i+1) for the sample at index i.
  An empty input yields nothing.

  Parameters
  ----------
  reward_samples : sequence of int or float
      samples of reward values from one arm of a bandit; must support len() and
      integer indexing. Each sample is read only when its average is requested.
  """

  estimate = None
  for idx in range(len(reward_samples)):
    sample = reward_samples[idx]
    if idx:
      weight = 1/(idx + 1)
      estimate = estimate + weight*(sample - estimate)
    else:
      estimate = sample
    yield estimate

def updateDecaying(reward_samples, alpha_0=0.5, lambda_=0.01, p=0.5):
    """
    Generator over the running action-value estimate under a decaying learning rate.

    One value is yielded per sample. The first value is reward_samples[0] * alpha_0.
    For the sample at index i >= 1 the learning rate is
    alpha_0 / (1 + lambda_ * (i + 1)) ** p and the estimate is moved that fraction
    of the way towards the sample. An empty input yields nothing.

    Parameters
    ----------
    reward_samples : array-like of int or float
        Samples of reward values from one arm of a bandit; must support len()
        and integer indexing.
    alpha_0 : float, optional
        The initial learning rate (default is 0.5).
    lambda_ : float, optional
        The decay rate constant (default is 0.01).
    p : float, optional
        The power parameter for controlling decay (default is 0.5).
    """
    estimate = None
    for idx in range(len(reward_samples)):
        sample = reward_samples[idx]
        if idx:
            decay = (1 + lambda_*(idx + 1))**p
            step = alpha_0/decay
            estimate = estimate + step*(sample - estimate)
        else:
            estimate = sample*alpha_0
        yield estimate


### Graphs

In [183]:
# One figure per arm: the four running estimates computed from the same reward
# samples, against the arm's true mean.
for action in actions:
  fig, ax = plt.subplots()
  ax.set_xlabel("sample number")
  ax.set_ylabel("reward value")
  # NOTE(review): this title numbers the arms from 1 (action + 1), while the
  # titles of the Q1 and Q3 figures print the 0-based index - confirm which is intended.
  ax.set_title("Incremental estimates and true expected reward values over 50 samples for action %s" %(action + 1), y=-0.2)

  # reference line for the true value
  true_value = means[action]
  line_true_val = ax.axhline(y = true_value, color = 'b', linestyle = ':', label = "true value")

  # running estimates: sample average, alpha = 0.01, alpha = 0.1, decaying alpha
  incr_avgs = list(updateAvg(action_rewards[action]))
  alpha_1_percent = list(update(action_rewards[action], 0.01))
  alpha_10_percent = list(update(action_rewards[action], 0.1))
  alpha_decay = list(updateDecaying(action_rewards[action], alpha_0=0.5, lambda_=0.01, p=0.5))

  # ax.plot returns a list of lines; each call draws exactly one
  plt_incr_avgs = ax.plot(incr_avgs, label = "incremental average")[0]
  plt_alpha_1_percent = ax.plot(alpha_1_percent, label = r"$\alpha = 0.01$")[0]
  plt_alpha_10_percent = ax.plot(alpha_10_percent, label = r"$\alpha = 0.1$")[0]
  plt_alpha_decay = ax.plot(alpha_decay, label = r"$\alpha = decay$")[0]

  # legend entries in a fixed order
  ax.legend(handles=[line_true_val, plt_incr_avgs, plt_alpha_1_percent, plt_alpha_10_percent, plt_alpha_decay])

## Q3 Effect of $α$ on Estimated Q values

In [184]:
num_samples = 100
num_runs = 100

# Each *_by_arm collection is indexed as [arm][run][sample index] and ends up with
# shape (number of arms, num_runs, num_samples + 1). Index 0 of every series is the
# initial estimate 0; index k is the estimate after k reward samples.
incr_avgs_by_arm = []
alpha_1_percent_by_arm = []
alpha_10_percent_by_arm = []
alpha_decay_by_arm = []
for action in actions:

  # per-arm accumulators: one series of estimates per run
  incr_avgs_arm_run = []
  alpha_1_percent_arm_run = []
  alpha_10_percent_arm_run = []
  alpha_decay_arm_run = []
  for run in range(num_runs):
    # the actual reward samples of this run
    samples = [three_arm_gaussian_bandit.sample(action) for _ in range(num_samples)]

    # For the fixed and decaying learning rates a leading reward of 0 only makes the
    # first yielded estimate equal to 0, i.e. it plays the role of the initial estimate.
    rewards = [0] + samples

    # The sample average must NOT see that placeholder: it would be counted as an
    # observed reward (dividing by k + 1 instead of k) and bias every estimate
    # towards 0. Average the real samples only and prepend the initial estimate.
    incr_avgs_arm_run.append([0] + list(updateAvg(samples)))
    alpha_1_percent_arm_run.append(list(update(rewards, 0.01)))
    alpha_10_percent_arm_run.append(list(update(rewards, 0.1)))
    # NOTE(review): alpha_0 is 0.05 here, but the Q2 cell and the written answer
    # for this question use/describe alpha_0 = 0.5 - confirm which value is intended.
    alpha_decay_arm_run.append(list(updateDecaying(rewards, alpha_0=0.05, lambda_=0.01, p=0.5)))
  incr_avgs_by_arm.append(incr_avgs_arm_run)
  alpha_1_percent_by_arm.append(alpha_1_percent_arm_run)
  alpha_10_percent_by_arm.append(alpha_10_percent_arm_run)
  alpha_decay_by_arm.append(alpha_decay_arm_run)

# convert to np arrays:
incr_avgs_by_arm = np.asarray(incr_avgs_by_arm)
alpha_1_percent_by_arm = np.asarray(alpha_1_percent_by_arm)
alpha_10_percent_by_arm = np.asarray(alpha_10_percent_by_arm)
alpha_decay_by_arm = np.asarray(alpha_decay_by_arm)

In [185]:
# Summary statistics across runs for the Q3 experiment.
#
# Every input (incr_avgs_by_arm, alpha_1_percent_by_arm, ...) is indexed as
# [arm][run][sample index]. Reducing over the run axis gives, per arm, one value
# per sample index:
# [[stat idx 0 arm 1, ..., stat idx last arm 1], [... arm 2], [... arm 3]]
# The number of runs and of sample indices is taken from the data instead of
# being hard-coded, so the cell keeps working if either is changed upstream.
_run_axis = 1
_n_runs = np.shape(incr_avgs_by_arm)[_run_axis]

# mean over the runs
avg_incr_avg_by_arm = np.mean(incr_avgs_by_arm, axis=_run_axis)
avg_alpha_1_percent_by_arm = np.mean(alpha_1_percent_by_arm, axis=_run_axis)
avg_alpha_10_percent_by_arm = np.mean(alpha_10_percent_by_arm, axis=_run_axis)
avg_alpha_decay_by_arm = np.mean(alpha_decay_by_arm, axis=_run_axis)

# (population) standard deviation over the runs
stdev_incr_avg_by_arm = np.std(incr_avgs_by_arm, axis=_run_axis)
stdev_alpha_1_percent_by_arm = np.std(alpha_1_percent_by_arm, axis=_run_axis)
stdev_alpha_10_percent_by_arm = np.std(alpha_10_percent_by_arm, axis=_run_axis)
stdev_alpha_decay_by_arm = np.std(alpha_decay_by_arm, axis=_run_axis)

# standard error of the mean: standard deviation / sqrt(number of runs)
stderr_incr_avg_by_arm = stdev_incr_avg_by_arm / np.sqrt(_n_runs)
stderr_alpha_1_percent_by_arm = stdev_alpha_1_percent_by_arm / np.sqrt(_n_runs)
stderr_alpha_10_percent_by_arm = stdev_alpha_10_percent_by_arm / np.sqrt(_n_runs)
stderr_alpha_decay_by_arm = stdev_alpha_decay_by_arm / np.sqrt(_n_runs)

### Graphs

In [186]:
# One figure per arm: each estimator's mean curve over the runs with a shaded
# band of +/- one standard error, against the arm's true mean.
for action in actions:
  fig, ax = plt.subplots()

  # draw the true value line
  true_value = means[action]
  line_true_val = ax.axhline(y = true_value, color = 'b', linestyle = ':', label = "true value")

  # (legend label, per-arm means over runs, per-arm standard errors over runs),
  # listed in drawing order
  estimators = [
      ("incremental average", avg_incr_avg_by_arm, stderr_incr_avg_by_arm),
      ("alpha = 0.01", avg_alpha_1_percent_by_arm, stderr_alpha_1_percent_by_arm),
      ("alpha = 0.1", avg_alpha_10_percent_by_arm, stderr_alpha_10_percent_by_arm),
      ("alpha = decay", avg_alpha_decay_by_arm, stderr_alpha_decay_by_arm),
  ]

  legend_handles = [line_true_val]
  for curve_label, avg_by_arm, stderr_by_arm in estimators:
    mean_curve = np.asarray(avg_by_arm[action])
    std_err = np.asarray(stderr_by_arm[action])

    # mean curve first, then its error band
    legend_handles.append(ax.plot(mean_curve, label = curve_label)[0])
    ax.fill_between(range(0,101), mean_curve - std_err, mean_curve + std_err, alpha=0.3)

  ax.set_xlabel("sample number")
  ax.set_ylabel("reward value")
  ax.set_title("Incremental estimates and true expected reward values averaged over 100 runs for action %s" %action, y=-0.2)

  ax.legend(handles=legend_handles)

### Answers:  


$\alpha = 0.1$ seems to be performing better than $\alpha = 0.01$, which in turn performs better than the decaying $\alpha$. The way the decaying $\alpha$ is set up, it decays very slowly, from 0.5 to about 0.35 after 100 iterations. Incremental averaging appears to be performing better than both the fixed and the decaying $\alpha$ methods. It may be preferable to use a decaying $\alpha$ over a longer time horizon. If we were to run further experiments, we would likely examine sequences $\alpha_t$ which converge to 0 faster.

## Q4 Epsilon-greedy

In [187]:
def epsilon_greedy(bandit, epsilon, alpha = None, num_time_step = 1000, epsilon_decay=False, lambda_=0.001, stationary=True, nonstation_change=()):
  """Epsilon greedy algorithm for bandit action selection

  At every time step a uniformly random action is taken with probability epsilon_t,
  otherwise an action with the highest current value estimate is taken (ties are
  broken uniformly at random). Value estimates start at 0.

  Parameters
  ----------
  bandit : bandit class
      A bandit with attributes num_arms and mean (mean reward per arm), and method sample(arm)
  epsilon: float
      A parameter which determines the probability for a random action to be selected
  alpha: (optional) float
      A parameter which determines the learning rate for averaging. If alpha is None, incremental averaging is used.
      Default is None, corresponding to incremental averaging.
  num_time_step: (optional) int
      Number of actions to take. Default is 1000.
  epsilon_decay: (optional) bool
      If True, the exploration probability at time step t is epsilon / (1 + lambda_ * t)
      instead of the constant epsilon. Default is False.
  lambda_: (optional) float
      Decay constant used when epsilon_decay is True. Default is 0.001.
  stationary: (optional) bool
      If False, the bandit is replaced during the run as described by nonstation_change.
      Default is True.
  nonstation_change: (optional) sequence (time_step, new_bandit)
      Only used when stationary is False: from time step nonstation_change[0] onwards,
      rewards are sampled from nonstation_change[1], and the optimal value and best
      action are taken from that bandit.

  Returns
  -------
  R_over_t
      a list of instantaneous return over the time steps
  total_R_over_t
      a list of cumulative reward over the time steps
  est_is_best_over_t
      a list of values of 0 and 1 where 1 indicates that the action selected at that time step
      is the action with the highest mean of the bandit in effect, and 0 otherwise
  l_over_t
      a list of instantaneous regret (optimal mean minus received reward) over the time steps
  total_l_over_t
      a list of cumulative regret over the time steps

  Raises
  ------
  ValueError
      if stationary is False and nonstation_change is not a (time_step, new_bandit) pair
  """
  if not stationary and len(nonstation_change) != 2:
    raise ValueError("nonstation_change must be (time_step, new_bandit) when stationary is False")

  num_arms = bandit.num_arms
  actions = range(num_arms)

  # Q_arr[a] is the current value estimate of arm a, N_arr[a] the number of times it was pulled
  Q_arr = np.zeros(num_arms)
  N_arr = np.zeros(num_arms, dtype=np.int64)

  opt_value = np.max(bandit.mean)
  best_action = np.argmax(bandit.mean)

  total_R = 0
  total_l = 0

  R_over_t = []
  total_R_over_t = []
  est_is_best_over_t = []
  l_over_t = []
  total_l_over_t = []

  epsilon_t = epsilon

  for time_step in range(num_time_step):
    if not stationary and time_step == nonstation_change[0]:
      bandit = nonstation_change[1]
      # regret and "best action" must be measured against the bandit now in effect
      opt_value = np.max(bandit.mean)
      best_action = np.argmax(bandit.mean)

    if epsilon_decay:
      epsilon_t = epsilon/(1+lambda_*time_step)

    # uniform() lies in [0, 1): a strict comparison makes epsilon = 0 purely greedy
    # and epsilon = 1 purely random
    if np.random.uniform() < epsilon_t:
      A = np.random.choice(actions)
    else:
      # if multiple actions share the highest estimate, choose one of them at random
      greedy_actions = np.flatnonzero(Q_arr == np.max(Q_arr))
      A = np.random.choice(greedy_actions)

    curr_R = bandit.sample(A)

    # incremental update Q <- Q + step * (R - Q), with step 1/N for sample averaging
    # or the constant alpha otherwise
    N_arr[A] += 1
    step_size = 1/N_arr[A] if alpha is None else alpha
    Q_arr[A] = Q_arr[A] + step_size*(curr_R - Q_arr[A])

    # each running total is accumulated exactly once per time step
    l_t = opt_value - curr_R
    total_R += curr_R
    total_l += l_t

    R_over_t.append(curr_R)
    total_R_over_t.append(total_R)
    est_is_best_over_t.append(int(A == best_action))
    l_over_t.append(l_t)
    total_l_over_t.append(total_l)
  return R_over_t, total_R_over_t, est_is_best_over_t, l_over_t, total_l_over_t

### Graphs

In [188]:
# Q4 experiment: 100 runs of epsilon-greedy for each fixed epsilon and for one
# decaying-epsilon schedule, summarised in five panels of a 3x2 grid.
epsilons = [0, 1/8, 1/4, 1/2, 1]
decaying_epsilon_params = {'epsilon_0': 0.5, 'lambda_': 0.1}  # Decaying epsilon parameters

fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(18, 18))

for epsilon in epsilons + ["decay"]:

  # arguments after the bandit for this setting; "decay" selects the decaying schedule
  if epsilon == "decay":
    eg_args = (decaying_epsilon_params['epsilon_0'],)
    eg_kwargs = {'epsilon_decay': True, 'lambda_': decaying_epsilon_params['lambda_']}
  else:
    eg_args = (epsilon,)
    eg_kwargs = {}

  # 100 independent runs, executed one after the other; each returns the five per-step series
  run_results = [
      epsilon_greedy(three_arm_gaussian_bandit, *eg_args, **eg_kwargs)
      for run in range(100)
  ]

  # regroup by metric: each array has one row per run and one column per time step
  R_over_t_runs, total_R_over_t_runs, est_is_best_over_t_runs, l_over_t_runs, total_l_over_t_runs = (
      np.asarray(metric_runs) for metric_runs in zip(*run_results)
  )

  curve_label = r"$\epsilon = %s$" %epsilon

  # reward panels: mean over the runs with a +/- one standard error band
  reward_panels = (
      (axs[0,0], R_over_t_runs, "Average Instantaneous Reward Received over Time"),
      (axs[0,1], total_R_over_t_runs, "Average Cumulative Reward Received over Time"),
  )
  for panel, runs_arr, panel_title in reward_panels:
    mean_curve = np.mean(runs_arr, axis=0)
    std_err = np.std(runs_arr, axis=0) / np.sqrt(np.size(runs_arr, axis=0))

    panel.plot(mean_curve, label = curve_label)
    panel.fill_between(range(0,1000), mean_curve - std_err, mean_curve + std_err, alpha=0.4)

    panel.legend()
    panel.set_xlabel("time step")
    panel.set_ylabel("reward value")
    panel.set_title(panel_title, y=-0.18)

  # remaining panels: mean over the runs only (best-action frequency, regrets)
  mean_only_panels = (
      (axs[1,0], est_is_best_over_t_runs, "percentage", "Percentage of Runs where Best Action was Chosen"),
      (axs[1,1], l_over_t_runs, "regret", "Instantaneous Regret over Time"),
      (axs[2,0], total_l_over_t_runs, "regret", "Total Regret up to Time Step t"),
  )
  for panel, runs_arr, y_label, panel_title in mean_only_panels:
    panel.plot(np.mean(runs_arr, axis=0), label = curve_label)

    panel.legend()
    panel.set_xlabel("time step")
    panel.set_ylabel(y_label)
    panel.set_title(panel_title, y=-0.18)

# the sixth cell of the grid is unused
axs[-1, -1].axis('off')

title = r'Graphs  for Epsilon Greedy with Varying Epsilons'
fig.suptitle(title, fontsize=16, y=0.08)

plt.show()

### Answers

Because the first observed reward is highly likely to be positive, in the strictly greedy ($\epsilon=0$) case the agent ends up pulling the same arm for the rest of the run, which explains the poor performance in that case. $\epsilon = 1$ similarly performs poorly because the estimates of the mean values of the arms are never used to make a decision about which arm to sample. Smaller $\epsilon$ values seem to perform better over time, with decaying $\epsilon$ (which roughly decays from $0.5$ to $0.005$) accumulating less regret over time than $\epsilon = \frac{1}{8}$, which in turn accumulates less regret than $\epsilon = \frac{1}{4}$ and $\epsilon = \frac{1}{2}$.


## Q5 Hyperparameters for Epsilon-greedy

To have a plain start, you have been provided with predefined functions for generating plots until now. However, moving forward, you are expected to plot graphs on your own.

In [189]:
# how we're gonna go about this:
# By column. Each column will correspond to a different alpha value.
# Each graph will have the same 3 base curves: greedy non-decaying with epsilon [1/4,1/8] and greedy decaying with epsilon_0=1/2 and lambda=0.1
# Then we can add to each the specific curve of decaying with fixed learning rate, this will vary by column

### Graphs

In [190]:
# Q5 experiment: for every exploration setting (one column of the grid) and every
# learning rate (one curve per panel), 100 runs of epsilon-greedy are summarised
# in five panels (the rows of the grid).
alphas = [0.1, 0.01, 0.001, None] # None represents incremental averaging

epsilons = [1/4, 1/8, 'decay']
decaying_epsilon_params = {'epsilon_0': 0.5, 'lambda_': 0.1}  # Decaying epsilon parameters

num_rows = 5
num_cols = 3

fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols,constrained_layout=True, figsize=(30, 30))

for col, eps_setting in enumerate(epsilons):
  # exploration arguments for this column; 'decay' selects the decaying schedule
  if eps_setting == 'decay':
    eg_kwargs = {
        'epsilon': decaying_epsilon_params['epsilon_0'],
        'epsilon_decay': True,
        'lambda_': decaying_epsilon_params['lambda_'],
    }
  else:
    eg_kwargs = {'epsilon': eps_setting}

  for alpha_value in alphas:
    # 100 independent runs, executed one after the other
    run_results = [
        epsilon_greedy(three_arm_gaussian_bandit, alpha=alpha_value, **eg_kwargs)
        for run in range(100)
    ]

    # regroup by metric: each array has one row per run and one column per time step
    R_over_t_runs, total_R_over_t_runs, est_is_best_over_t_runs, l_over_t_runs, total_l_over_t_runs = (
        np.asarray(metric_runs) for metric_runs in zip(*run_results)
    )

    curve_label = r"$\alpha = %s$" %alpha_value

    # (row, data, y label, title prefix, draw standard error band?)
    panels = (
        (0, R_over_t_runs, "reward value", "Average Instantaneous Reward Received over Time with ε= ", True),
        (1, total_R_over_t_runs, "reward value", "Average Cumulative Reward Received over Time with ε= ", True),
        (2, est_is_best_over_t_runs, "percentage", "Percentage of Runs where Best Action was Chosen with ε= ", False),
        (3, l_over_t_runs, "regret", "Instantaneous Regret over Time with ε=", False),
        (4, total_l_over_t_runs, "regret", r"Total Regret up to Time Step t with ε= ", False),
    )
    for row, runs_arr, y_label, title_prefix, with_band in panels:
      panel = axs[row, col]
      mean_curve = np.mean(runs_arr, axis=0)
      panel.plot(mean_curve, label = curve_label)

      if with_band:
        # +/- one standard error of the mean over the runs
        std_err = np.std(runs_arr, axis=0) / np.sqrt(np.size(runs_arr, axis=0))
        panel.fill_between(range(len(mean_curve)), mean_curve - std_err, mean_curve + std_err, alpha=0.4)

      panel.legend()
      panel.set_xlabel("time step")
      panel.set_ylabel(y_label)
      panel.set_title(title_prefix+str(eps_setting), y=-0.18)

# All 15 panels of the 5x3 grid are used, so no axis is switched off here:
# doing so for axs[-1, -1] would hide the axes of the total-regret panel of the
# 'decay' column.

title = r'Graphs  for Epsilon Greedy with Varying Epsilons'
fig.suptitle(title, fontsize=16, y=-0.08)



### Answers

Incremental averaging and $\alpha = 0.1$ exhibit slowly growing total regret (though still roughly linear), whereas smaller learning rates ($\alpha = 0.01, 0.001$) give linear growth in total regret. Incremental averaging seems to learn early on which action is best, and then $\alpha = 0.1$ tends to catch up after a bit of a lag.

## Q6 Gradient Bandit

In [191]:
def gradient_bandit(bandit, alpha, num_time_steps=1000, alpha_decay=False, lda=0.01, p=0.5, stationary=True, nonstation_change=None):
    """Run one episode of the softmax gradient-bandit algorithm.

    Parameters
    ----------
    bandit : object
        Bandit exposing ``num_arms``, ``mean`` and ``sample(arm_index)``.
    alpha : float
        Step size of the preference update (the initial step size when
        ``alpha_decay`` is True).
    num_time_steps : int
        Number of interaction steps.
    alpha_decay : bool
        If True, step ``t`` uses ``alpha / (1 + lda * t) ** p``.
    lda, p : float
        Parameters of the step-size decay schedule.
    stationary : bool
        If False, the bandit is replaced part-way through the run.
    nonstation_change : sequence, optional
        ``(change_step, new_bandit)``; required when ``stationary`` is False.

    Returns
    -------
    tuple
        ``(instantaneous_rewards, cumulative_rewards, best_action_chosen,
        instantaneous_regret, cumulative_regret)``. The first item is a list,
        the others are arrays of length ``num_time_steps``. Regret is measured
        against the realised reward: ``max(bandit.mean) - R_t``.

    Raises
    ------
    ValueError
        If ``stationary`` is False and ``nonstation_change`` does not hold a
        change step and a replacement bandit.
    """
    if not stationary and (nonstation_change is None or len(nonstation_change) < 2):
        raise ValueError("nonstation_change must be (change_step, new_bandit) when stationary=False")

    instantaneous_rewards_ot = []
    cumulative_rewards_ot = np.zeros(num_time_steps)
    estimate_of_best_action_ot = np.zeros(num_time_steps)
    instantaneous_regret_ot = np.zeros(num_time_steps)
    cumulative_regret_ot = np.zeros(num_time_steps)

    total_reward = 0
    total_regret = 0

    H = np.zeros(bandit.num_arms)  # action preferences
    pi = softmax(H)  # action probabilities, used as `p` for np.random.choice

    best_action = np.argmax(bandit.mean)
    optimal_value = np.max(bandit.mean)

    for i in range(num_time_steps):
        alpha_t = alpha
        if alpha_decay:
            alpha_t /= (1 + lda * i) ** p

        # The action is drawn before a possible bandit swap, as in the
        # original ordering, so the random stream is unchanged.
        A_t = np.random.choice(bandit.num_arms, p=pi)

        if not stationary and i == nonstation_change[0]:
            bandit = nonstation_change[1]
            # Re-derive the reference optimum: regret and the best-action
            # indicator must be measured against the bandit actually played.
            best_action = np.argmax(bandit.mean)
            optimal_value = np.max(bandit.mean)

        R_t = bandit.sample(A_t)
        instantaneous_rewards_ot.append(R_t)
        total_reward += R_t
        cumulative_rewards_ot[i] = total_reward

        # updateAvg is defined elsewhere in this file; presumably it yields the
        # running mean of the rewards seen so far (the baseline) -- TODO confirm.
        average_R_t = next(updateAvg(instantaneous_rewards_ot))
        estimate_of_best_action_ot[i] = int(best_action == A_t)

        # H[a] += alpha_t * (R_t - baseline) * (1{a == A_t} - pi[a]) for all arms.
        chosen = np.zeros(len(H))
        chosen[A_t] = 1.0
        H += alpha_t * (R_t - average_R_t) * (chosen - pi)
        pi = softmax(H)

        instantaneous_regret = optimal_value - R_t
        instantaneous_regret_ot[i] = instantaneous_regret
        total_regret += instantaneous_regret
        cumulative_regret_ot[i] = total_regret

    return instantaneous_rewards_ot, cumulative_rewards_ot, estimate_of_best_action_ot, instantaneous_regret_ot, cumulative_regret_ot



### Graphs

In [192]:
# Q6 figure: gradient bandit with three fixed learning rates plus one decaying
# schedule, each averaged over 100 independent runs.
alphas = [0.1, 0.01, 0.001]
decaying_alpha_params = {'alpha_0': 0.5, 'lda': 0.01, 'p': 0.5}

fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(18, 18))

for alp in alphas + ["decay"]:

    # Arguments for this configuration ("decay" selects the decaying schedule).
    if alp == "decay":
        run_kwargs = dict(
            alpha=decaying_alpha_params['alpha_0'],
            alpha_decay=True,
            lda=decaying_alpha_params['lda'],
            p=decaying_alpha_params['p'],
        )
    else:
        run_kwargs = dict(alpha=alp)

    # 100 runs; each returns five per-time-step series. Transposing groups the
    # runs by metric, giving one (runs, time steps) array per metric.
    runs = [gradient_bandit(three_arm_gaussian_bandit, **run_kwargs) for run in range(100)]
    R_over_t_runs, total_R_over_t_runs, est_is_best_over_t_runs, l_over_t_runs, total_l_over_t_runs = (
        np.asarray(metric) for metric in zip(*runs)
    )

    curve_label = r"$\alpha = %s$" % alp
    # x-axis taken from the data instead of a literal 1000, so the cell keeps
    # working if the number of time steps changes.
    time_steps = np.arange(R_over_t_runs.shape[1])

    # Reward panels: mean curve with a +/- one standard-error band.
    for ax, samples in ((axs[0, 0], R_over_t_runs), (axs[0, 1], total_R_over_t_runs)):
        mean_curve = np.mean(samples, axis=0)
        std_err = np.std(samples, axis=0) / np.sqrt(np.size(samples, axis=0))
        ax.plot(mean_curve, label=curve_label)
        ax.fill_between(time_steps, mean_curve - std_err, mean_curve + std_err, alpha=0.4)

    # Remaining panels: mean curve only.
    for ax, samples in (
        (axs[1, 0], est_is_best_over_t_runs),
        (axs[1, 1], l_over_t_runs),
        (axs[2, 0], total_l_over_t_runs),
    ):
        ax.plot(np.mean(samples, axis=0), label=curve_label)

# Axis decoration does not depend on the curve, so it is applied once after all
# curves are drawn (the legend then picks up every labelled line).
panel_text = (
    (axs[0, 0], "reward value", "Average Instanteneous Reward Received over Time"),
    (axs[0, 1], "reward value", "Average Cumulative Reward Received over Time"),
    (axs[1, 0], "percentage", "Percentage of Runs where Best Action was Chosen"),
    (axs[1, 1], "regret", "Instantaneous Regret over Time"),
    (axs[2, 0], "regret", "Total Regret up to Time Step t"),
)
for ax, y_label, panel_title in panel_text:
    ax.legend()
    ax.set_xlabel("time step")
    ax.set_ylabel(y_label)
    ax.set_title(panel_title, y=-0.18)

# The sixth panel of the 3x2 grid is unused.
axs[-1, -1].axis('off')

title = r'Graphs for Gradient Bandit with Varying Learning Rates'
fig.suptitle(title, fontsize=16, y=0.08)

plt.show()

### Answers

Starting off with a higher learning rate seems to give better performance. When $\alpha = 0.01$ or $\alpha = 0.001$, the updates to the preferences occur slowly, which seems to give linear growth in total regret. However, if $\alpha = 0.1$ or we initialize our decaying $\alpha_t$ to 0.5, the preferences are updated enough to achieve logarithmic (i.e. optimal) accumulation of regret.



## Q7 Thompson Sampling

In [193]:
def thompson_sampling(bandit, prior_means=None, prior_sd=None, num_iterations=1000, stationary=True, nonstation_change=None):
    """Run one episode of Gaussian Thompson sampling with a Normal prior per arm.

    Parameters
    ----------
    bandit : object
        Bandit exposing ``num_arms``, ``mean`` and ``sample(arm_index)``.
    prior_means : sequence of float, optional
        Prior mean per arm; defaults to 0 for every arm. Not modified.
    prior_sd : sequence of float, optional
        Prior standard deviation per arm; defaults to 1 for every arm.
        Not modified.
    num_iterations : int
        Number of interaction steps.
    stationary : bool
        If False, the bandit is replaced part-way through the run.
    nonstation_change : sequence, optional
        ``(change_step, new_bandit)``; required when ``stationary`` is False.

    Returns
    -------
    tuple
        ``(instantaneous_rewards, cumulative_rewards, best_action_chosen,
        instantaneous_regret, cumulative_regret)``. The first item is a list,
        the others are arrays of length ``num_iterations``. Regret is measured
        against the realised reward: ``max(bandit.mean) - R_i``.

    Raises
    ------
    ValueError
        If ``stationary`` is False and ``nonstation_change`` does not hold a
        change step and a replacement bandit.
    """
    if not stationary and (nonstation_change is None or len(nonstation_change) < 2):
        raise ValueError("nonstation_change must be (change_step, new_bandit) when stationary=False")

    # Private float copies: the posterior is updated in place below, and that
    # must never leak into arrays owned by the caller (e.g. priors reused
    # across repeated runs).
    if prior_means is None or len(prior_means) == 0:
        post_mean = np.full(bandit.num_arms, 0.0)
    else:
        post_mean = np.array(prior_means, dtype=float)
    if prior_sd is None or len(prior_sd) == 0:
        post_sd = np.full(bandit.num_arms, 1.0)
    else:
        post_sd = np.array(prior_sd, dtype=float)

    instantaneous_rewards_ot = []
    cumulative_rewards_ot = np.zeros(num_iterations)
    estimate_of_best_action_ot = np.zeros(num_iterations)
    instantaneous_regret_ot = np.zeros(num_iterations)
    cumulative_regret_ot = np.zeros(num_iterations)

    optimal_action = np.argmax(bandit.mean)
    optimal_value = np.max(bandit.mean)
    cumulative_reward = 0
    cumulative_regret = 0

    # Reward-noise standard deviation the agent assumes to be known.
    sd_estimate = 0.1

    for i in range(num_iterations):
        if not stationary and nonstation_change[0] == i:
            bandit = nonstation_change[1]
            # Re-derive the reference optimum: regret and the best-action
            # indicator must be measured against the bandit actually played.
            optimal_action = np.argmax(bandit.mean)
            optimal_value = np.max(bandit.mean)

        # One draw per arm. NOTE(review): the draw uses the predictive spread
        # sqrt(post_sd^2 + sd_estimate^2) rather than post_sd alone, so its
        # spread never falls below sd_estimate -- confirm this is intended.
        means_estimate = np.random.normal(post_mean, np.sqrt(post_sd ** 2 + sd_estimate ** 2))

        # If several arms tie for the largest draw, pick one of them at random.
        A_i = np.random.choice(np.flatnonzero(means_estimate == np.max(means_estimate)))
        R_i = bandit.sample(A_i)

        # Conjugate Normal update for the pulled arm (known noise sd_estimate).
        numerator = (post_mean[A_i] / (post_sd[A_i] ** 2)) + (R_i / (sd_estimate ** 2))
        denominator = 1 / (post_sd[A_i] ** 2) + 1 / (sd_estimate ** 2)
        post_mean[A_i] = numerator / denominator
        post_sd[A_i] = np.sqrt(1 / denominator)

        instantaneous_rewards_ot.append(R_i)
        cumulative_reward += R_i
        cumulative_rewards_ot[i] = cumulative_reward
        estimate_of_best_action_ot[i] = int(A_i == optimal_action)
        inst_regret = optimal_value - R_i
        instantaneous_regret_ot[i] = inst_regret
        cumulative_regret += inst_regret
        cumulative_regret_ot[i] = cumulative_regret

    return instantaneous_rewards_ot, cumulative_rewards_ot, estimate_of_best_action_ot, instantaneous_regret_ot, cumulative_regret_ot



In [194]:
# Single exploratory Thompson-sampling run on the three-arm bandit, using the
# function's default priors and default number of iterations.
# NOTE: the call draws from NumPy's global random generator, so it also shifts
# the random stream consumed by every cell executed after it.
instantaneous_rewards_ot, cumulative_rewards_ot, estimate_of_best_action_ot, instantaneous_regret_ot, cumulative_regret_ot = thompson_sampling(three_arm_gaussian_bandit)


### Graphs

In [195]:
# Q7 figure: Thompson sampling on the three-arm bandit, averaged over 100 runs.
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(18, 18))

# Each run returns five per-time-step series; transposing the list of runs
# yields one (runs, time steps) array per metric.
all_runs = [thompson_sampling(three_arm_gaussian_bandit) for _ in range(100)]
(
    R_over_t_runs,
    total_R_over_t_runs,
    est_is_best_over_t_runs,
    l_over_t_runs,
    total_l_over_t_runs,
) = [np.asarray(series) for series in zip(*all_runs)]

# Reward panels: mean curve with a +/- one standard-error band.
for panel, samples in ((axs[0, 0], R_over_t_runs), (axs[0, 1], total_R_over_t_runs)):
    centre = np.mean(samples, axis=0)
    half_width = np.std(samples, axis=0) / np.sqrt(np.size(samples, axis=0))
    panel.plot(centre)
    panel.fill_between(range(1000), centre - half_width, centre + half_width, alpha=0.4)

# Best-action frequency and regret panels: mean curve only.
for panel, samples in (
    (axs[1, 0], est_is_best_over_t_runs),
    (axs[1, 1], l_over_t_runs),
    (axs[2, 0], total_l_over_t_runs),
):
    panel.plot(np.mean(samples, axis=0))

# Only one curve per panel, so no legends are drawn.
panel_text = (
    (axs[0, 0], "reward value", "Average Instanteneous Reward Received over Time"),
    (axs[0, 1], "reward value", "Average Cumulative Reward Received over Time"),
    (axs[1, 0], "percentage", "Percentage of Runs where Best Action was Chosen"),
    (axs[1, 1], "regret", "Instantaneous Regret over Time"),
    (axs[2, 0], "regret", "Total Regret up to Time Step t"),
)
for panel, y_label, panel_title in panel_text:
    panel.set_xlabel("time step")
    panel.set_ylabel(y_label)
    panel.set_title(panel_title, y=-0.18)

# The sixth panel of the 3x2 grid is unused.
axs[-1, -1].axis('off')

title = r'Graphs for Thompson Sampling Bandit'
fig.suptitle(title, fontsize=16, y=0.08)

plt.show()

### Answers

Thompson sampling seems to identify relatively quickly which arm is best, and then seems to largely stick to sampling that arm.

## Q8 Comparison of Algorithms

### Graphs

In [196]:
# Result buffers indexed as [run, metric, time step]: 100 runs, the five
# series each algorithm returns, 1000 time steps.
results_shape = (100, 5, 1000)
eps_results, grad_results, thompson_results = (np.zeros(results_shape) for _ in range(3))

# The three algorithms are interleaved within each run (not executed one
# algorithm at a time), which fixes the order in which they consume the
# global random stream.
for run_idx in range(results_shape[0]):
    eps_results[run_idx] = epsilon_greedy(
        three_arm_gaussian_bandit, 0.5, epsilon_decay=True, lambda_=0.1
    )
    grad_results[run_idx] = gradient_bandit(
        three_arm_gaussian_bandit, alpha=0.5, alpha_decay=True, lda=0.01, p=0.5
    )
    thompson_results[run_idx] = thompson_sampling(three_arm_gaussian_bandit)

In [197]:
# Average every metric over the run axis. Metric order along axis 1 is:
# instantaneous reward, cumulative reward, best-action indicator,
# instantaneous regret, cumulative regret.
(
    average_instantaneous_reward_over_time_epsilon,
    average_cumulative_reward_over_time_epsilon,
    average_estimated_best_action_over_time_epsilon,
    average_instantaneous_regret_over_time_epsilon,
    average_cumulative_regret_over_time_epsilon,
) = (np.average(eps_results[:, metric], axis=0) for metric in range(5))

(
    average_instantaneous_reward_over_time_gradient,
    average_cumulative_reward_over_time_gradient,
    average_estimated_best_action_over_time_gradient,
    average_instantaneous_regret_over_time_gradient,
    average_cumulative_regret_over_time_gradient,
) = (np.average(grad_results[:, metric], axis=0) for metric in range(5))

(
    average_instantaneous_reward_over_time_thompson,
    average_cumulative_reward_over_time_thompson,
    average_estimated_best_action_over_time_thompson,
    average_instantaneous_regret_over_time_thompson,
    average_cumulative_regret_over_time_thompson,
) = (np.average(thompson_results[:, metric], axis=0) for metric in range(5))

In [198]:
# Q8 figure: decaying epsilon-greedy vs. decaying-alpha gradient bandit vs.
# Thompson sampling, one panel per metric.
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(18, 18))

# Legend text, in the order the curves are listed for every panel below.
curve_labels = (r"$\epsilon = decay$", r"$\alpha = decay$, gradient", "thompson sampling")

# (axes, y-label, title, (epsilon-greedy, gradient, thompson) curves).
# The y-labels follow the quantity actually plotted; previously every panel
# was labelled "reward value", including the best-action and regret panels.
comparison_panels = (
    (
        axs[0, 0],
        "reward value",
        "Average Instantaneous Reward Received over Time",
        (
            average_instantaneous_reward_over_time_epsilon,
            average_instantaneous_reward_over_time_gradient,
            average_instantaneous_reward_over_time_thompson,
        ),
    ),
    (
        axs[0, 1],
        "reward value",
        "Average Cumulative Reward Received over Time",
        (
            average_cumulative_reward_over_time_epsilon,
            average_cumulative_reward_over_time_gradient,
            average_cumulative_reward_over_time_thompson,
        ),
    ),
    (
        axs[1, 0],
        "percentage",
        "% of time optimal action was chosen",
        (
            average_estimated_best_action_over_time_epsilon,
            average_estimated_best_action_over_time_gradient,
            average_estimated_best_action_over_time_thompson,
        ),
    ),
    (
        axs[1, 1],
        "regret",
        "Average Instantaneous Regret Received over Time",
        (
            average_instantaneous_regret_over_time_epsilon,
            average_instantaneous_regret_over_time_gradient,
            average_instantaneous_regret_over_time_thompson,
        ),
    ),
    (
        axs[2, 0],
        "regret",
        "Average Cumulative Regret Received over Time",
        (
            average_cumulative_regret_over_time_epsilon,
            average_cumulative_regret_over_time_gradient,
            average_cumulative_regret_over_time_thompson,
        ),
    ),
)

for ax, y_label, panel_title, curves in comparison_panels:
    for curve, curve_label in zip(curves, curve_labels):
        ax.plot(curve, label=curve_label)
    ax.set_xlabel("time step")
    ax.set_ylabel(y_label)
    ax.set_title(panel_title, y=-0.18)
    ax.legend()

# The sixth panel of the 3x2 grid is unused.
axs[-1, -1].axis('off')


### Answers

Decaying $\epsilon$-greedy and gradient bandit with decaying $\alpha$ seem to perform very similarly. They both exhibit sublinear (i.e. logarithmic) growth in average cumulative regret, which is optimal, whereas Thompson sampling seems to exhibit roughly linear growth in average cumulative regret. However, Thompson sampling and gradient bandit both exhibit slower growth in average cumulative regret compared with $\epsilon$-greedy.

## Q9 Non-stationary Environment

In [211]:
# Bandit that replaces the original one at time step 500: the arm means are
# changed so that a different arm becomes the best one.
delta = 0.2
num_arms = 3
means = np.array([0.5, 0.5+2*delta, 0.5-2*delta])
variance = 0.01
bandit = GaussianBandit(num_arms=num_arms, mean=means, variance=variance)
station_changes = [500, bandit]

# Arguments shared by every non-stationary run below.
shift = {"stationary": False, "nonstation_change": station_changes}

# One run per method; the first two returned series are the instantaneous and
# the cumulative rewards. The call order is kept fixed because every run
# consumes the global random stream.
avg_nonstationary_reward_epsilon_greedy_incremental, egi_cumu = epsilon_greedy(
    three_arm_gaussian_bandit, epsilon=0.25, **shift
)[:2]
avg_nonstationary_reward_epsilon_greedy_lr, eglr_cumu = epsilon_greedy(
    three_arm_gaussian_bandit, epsilon=0.25, alpha=0.1, **shift
)[:2]
avg_nonstationary_reward_epsilon_decay_incremental, egdi_cumu = epsilon_greedy(
    three_arm_gaussian_bandit, epsilon=0.5, epsilon_decay=True, lambda_=0.1, **shift
)[:2]
avg_nonstationary_reward_epsilon_decay_lr, egdlr_cumu = epsilon_greedy(
    three_arm_gaussian_bandit, epsilon=0.5, epsilon_decay=True, lambda_=0.1, alpha=0.1, **shift
)[:2]

avg_nonstationary_reward_gradient_01, grad01_cumu = gradient_bandit(
    three_arm_gaussian_bandit, alpha=0.1, **shift
)[:2]
avg_nonstationary_reward_gradient_001, grad001_cumu = gradient_bandit(
    three_arm_gaussian_bandit, alpha=0.01, **shift
)[:2]

avg_nonstationary_reward_thompson, thomp_cumu = thompson_sampling(
    three_arm_gaussian_bandit, **shift
)[:2]

# Instantaneous reward of every method on one set of axes.
instantaneous_curves = (
    (avg_nonstationary_reward_epsilon_greedy_incremental, "epsilon=1/4, incremental averaging"),
    (avg_nonstationary_reward_epsilon_greedy_lr, "epsilon=1/4, alpha=0.1"),
    (avg_nonstationary_reward_epsilon_decay_incremental, "decaying epsilon, incremental averaging"),
    (avg_nonstationary_reward_epsilon_decay_lr, "decaying epsilon, alpha=0.1"),
    (avg_nonstationary_reward_gradient_01, "gradient bandit, alpha=0.1"),
    (avg_nonstationary_reward_gradient_001, "gradient bandit, alpha=0.01"),
    (avg_nonstationary_reward_thompson, "thompson sampling"),
)
for series, series_label in instantaneous_curves:
    plt.plot(series, label=series_label)

plt.xlabel("time step")
plt.ylabel("average instantaneous reward received")

plt.legend()
plt.show()

In [212]:
# Cumulative reward of every method in the non-stationary experiment
# (same runs as the instantaneous-reward figure).
cumulative_curves = (
    (egi_cumu, "epsilon=1/4, incremental averaging"),
    (eglr_cumu, "epsilon=1/4, alpha=0.1"),
    (egdi_cumu, "decaying epsilon, incremental averaging"),
    (egdlr_cumu, "decaying epsilon, alpha=0.1"),
    (grad01_cumu, "gradient bandit, alpha=0.1"),
    (grad001_cumu, "gradient bandit, alpha=0.01"),
    (thomp_cumu, "thompson sampling"),
)
for series, series_label in cumulative_curves:
    plt.plot(series, label=series_label)

plt.xlabel("time step")
# The curves are running totals, so the axis is labelled as cumulative reward
# (it previously read "average reward received").
plt.ylabel("cumulative reward received")

plt.legend()
plt.show()

By graphing the individual reward progressions for each method, we can observe how they respond to the change in reward distribution at $t=500$. $\epsilon$-greedy with incremental averaging experiences a steep drop-off in average reward, and takes about $300$ iterations to identify the new best arm, after which it behaves as expected for a relatively high value of $\epsilon$. On the other hand, $\epsilon = \frac{1}{4}$ with a fixed learning rate of $\alpha = 0.1$ behaves about the same for the first $500$ iterations as in the incremental averaging case, but takes less than $50$ iterations after the distributions change to identify the new best arm. This suggests a fixed learning rate for $\epsilon$-greedy is better suited to non-stationarity. In the decaying $\epsilon$ case, the agent fails to identify the new best arm, and is stuck receiving sub-par rewards for the remaining iterations.

Gradient bandit with $\alpha = 0.1$ takes roughly 200 iterations to identify the new best arm after the distribution changes, and it does so slowly. Gradient bandit with $\alpha = 0.01$ seemingly never identifies the new best arm, and experiences significant variation in average reward after $t=500$. Thompson sampling exhibits a sharp drop-off in average rewards, but around iteration 750 it identifies the new best arm, and largely samples that arm for the remainder of the runtime.

In conclusion, Thompson sampling and $\epsilon$-greedy with a fixed learning rate seem best adapted to non-stationary problems.
