/
simulations.py
64 lines (49 loc) · 2.38 KB
/
simulations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""Runs the main simulations which compare the performance of various bandit
algorithms.
We pre-draw a simulated Bernoulli reward sequence for each arm separately, with
length given by the horizon of the simulation. Each algorithm, without knowing
the realization of these sequences, chooses an arm, and hence a reward, to sample.
"""
import numpy as np
import reward_process as rp
import time
import copy
__author__ = 'Ilari'
def simulate_algorithms(algorithms, parameters, n_sims, horizon):
    """Run every algorithm on freshly drawn reward sequences and collect results.

    For each algorithm, ``n_sims`` independent simulations of ``horizon`` steps
    are run. In every simulation a deep copy of the algorithm is initialized,
    a reward table is drawn via ``rp.RewardProcess(parameters, horizon)
    .bernoulli()``, and at each step the algorithm selects an arm, receives
    ``rewards[arm, t]`` and is updated with it.

    Args:
        algorithms: Iterable of algorithm objects exposing ``initialize()``,
            ``select_arm()``, ``update(arm, reward)``, ``values`` and ``name``.
        parameters: Per-arm parameters handed to ``rp.RewardProcess``; the arm
            with the largest parameter is used as the regret benchmark.
        n_sims: Number of independent simulations per algorithm.
        horizon: Number of steps in each simulation.

    Returns:
        dict mapping each algorithm's ``name`` to a dict with the keys
        ``'cumulative_regret'``, ``'cumulative_rewards'``, ``'chosen_arms'``
        (arrays of shape ``(n_sims, horizon)``) and ``'expected_values'``
        (shape ``(n_sims, horizon, len(parameters))``). Algorithms sharing a
        name overwrite each other's entry.
    """
    results = {}
    best_arm = np.argmax(parameters)
    best_mean = parameters[best_arm]
    n_arms = len(parameters)

    for algorithm in algorithms:
        start_time = time.time()

        # Initiate data collection arrays:
        chosen_arms = np.zeros([n_sims, horizon])
        cum_rewards = np.zeros([n_sims, horizon])
        # The realized regret compared to pulling the best arm
        cum_regret = np.zeros([n_sims, horizon])
        expected_values = np.zeros([n_sims, horizon, n_arms])

        # Fallback so the name lookup below still works when n_sims == 0;
        # otherwise it is rebound to the last simulated copy, as before.
        algo = algorithm
        for sim in range(n_sims):
            # Note: deepcopy circumvents the need to re-initialize the arms
            # for bandit algorithms with priors in every simulation loop.
            algo = copy.deepcopy(algorithm)
            algo.initialize()  # Does nothing for algorithms which start with priors

            # Draw the reward process for each arm
            rewards = rp.RewardProcess(parameters, horizon).bernoulli()

            step_rewards = np.zeros(horizon)
            for t in range(horizon):
                chosen_arm = algo.select_arm()
                reward = rewards[chosen_arm, t]
                chosen_arms[sim, t] = chosen_arm
                step_rewards[t] = reward
                algo.update(chosen_arm, reward)
                expected_values[sim, t, :] = algo.values

            # Running totals over the steps of this simulation.
            cum_rewards[sim, :] = np.cumsum(step_rewards)
            cum_regret[sim, :] = np.cumsum(best_mean - step_rewards)

        # Collect results into dictionary
        name = algo.name
        results[name] = {'cumulative_regret': cum_regret,
                         'cumulative_rewards': cum_rewards,
                         'chosen_arms': chosen_arms,
                         'expected_values': expected_values}

        # Single-argument call form prints the same text on Python 2 and 3.
        elapsed = time.time() - start_time
        print("Elapsed time of simulation:  %s %s" % (elapsed, name))

    return results