In [11]:
import numpy as np

class Environment:
    """Run a group of agents against a shared set of bandits.

    Each call to ``step`` lets every agent pull one arm, feeds the reward
    back to that agent, counts whether the pull hit the bandit with the
    highest current ``mean``, and finally lets every bandit update itself.

    Args:
        bandits: sequence of objects exposing ``mean``, ``reward()`` and
            ``update()``.
        agents: sequence of objects exposing ``select_bandit()`` and
            ``update_q(reward, arm)``.
    """

    def __init__(self, bandits, agents):
        self.n = 0  # number of completed steps
        self.bandits = bandits
        self.agents = agents
        # Running count, per agent, of pulls that hit the optimal bandit.
        self.optimal_selections = [0] * len(agents)
        # One (step number, per-agent counts) tuple per completed step.
        self.results = []

    def optimal_selection(self):
        """Return the index of the bandit with the highest ``mean``.

        Ties go to the lowest index. Returns ``None`` when there are no
        bandits.
        """
        best_val, best_idx = None, None
        for idx, bandit in enumerate(self.bandits):
            # Check for None first: ordering a number against None raises
            # TypeError on Python 3.
            if best_val is None or bandit.mean > best_val:
                best_val = bandit.mean
                best_idx = idx
        return best_idx

    def step(self):
        """Advance the simulation by one pull per agent."""
        # Bandit means only change in the update loop at the end of the
        # step, so the optimal arm is the same for every agent here.
        optimal_arm = self.optimal_selection()
        for idx, agent in enumerate(self.agents):
            arm = agent.select_bandit()
            reward = self.bandits[arm].reward()
            agent.update_q(reward, arm)
            if arm == optimal_arm:
                self.optimal_selections[idx] += 1
        self.n += 1
        # Store a snapshot; appending the live list would make every
        # history entry alias the same (final) counts.
        self.results.append((self.n, list(self.optimal_selections)))

        for bandit in self.bandits:
            bandit.update()
        
        

class Bandit:
    """A single arm with normally distributed rewards and a drifting mean.

    Args:
        mean: initial mean of the reward distribution.
        stddev: standard deviation of the reward distribution.
        drift: standard deviation of the zero-mean normal disturbance
            added to ``mean`` on every ``update`` call.

    The defaults reproduce the previously hard-coded values.
    """

    def __init__(self, mean=1, stddev=0.1, drift=0.01):
        self.mean = mean
        self.stddev = stddev
        self.drift = drift

    def update(self):
        """Shift ``mean`` by one normal(0, ``drift``) disturbance."""
        self.mean += np.random.normal(0, self.drift)

    def reward(self):
        """Draw one reward from normal(``mean``, ``stddev``)."""
        return np.random.normal(self.mean, self.stddev)

class Agent:
    """Epsilon-greedy agent holding one value estimate per arm.

    ``av_meth`` is called as ``av_meth(old_estimate, reward, pull_count)``
    and must return the new estimate for the pulled arm.
    """

    def __init__(self, av_meth, arm_count, eps):
        self.eps = eps
        self.av_meth = av_meth
        self.optimal_rate = None
        # Per-arm pull counters and value estimates, all starting at zero.
        self.n_arm = [0] * arm_count
        self.Q_table = [0] * arm_count

    def update_q(self, r, arm):
        """Record a pull of ``arm`` and fold reward ``r`` into its estimate."""
        pulls = self.n_arm[arm] + 1
        self.n_arm[arm] = pulls
        previous = self.Q_table[arm]
        self.Q_table[arm] = self.av_meth(previous, r, pulls)

    def select_bandit(self):
        """Return the arm to pull.

        Draws a uniform sample; when it exceeds ``eps`` the arm with the
        largest estimate is chosen, otherwise a uniformly random arm.
        """
        draw = np.random.uniform()
        if not draw > self.eps:
            return np.random.randint(0, len(self.Q_table))
        return np.argmax(self.Q_table)
        
def sample_av(Q_old, reward, n):
    """Move ``Q_old`` toward ``reward`` by 1/n of the difference.

    With ``n`` equal to the number of rewards seen so far (including this
    one), this is the incremental form of the sample average.
    """
    error = reward - Q_old
    return Q_old + error / n

def step_av(Q_old, reward, n, alpha=0.1):
    """Move ``Q_old`` toward ``reward`` by a constant fraction ``alpha``.

    Args:
        Q_old: current estimate.
        reward: newly observed reward.
        n: pull count; unused, accepted so the signature matches
            ``sample_av``.
        alpha: step size. Defaults to 0.1, the previously hard-coded value.

    Returns:
        The updated estimate.
    """
    return Q_old + alpha * (reward - Q_old)

# Experiment: every bandit starts with the same mean, which then drifts over time
# (an independent np.random.normal(0, 0.01) is added to each bandit's mean on every iteration).
# Compare the sample-average method with the fixed step-size average by how often
# each agent selects the optimal bandit at each time step, over 10k steps.

# Experiment parameters.
total_steps = 10000
bandit_count = 10
eps = 0.1

results = []

# range() instead of xrange(): xrange does not exist on Python 3, and a
# list of this size is harmless on Python 2.
bandits = [Bandit() for _ in range(bandit_count)]

# One agent per averaging method so the two can be compared on the same
# bandits.
agents = [
    Agent(sample_av, bandit_count, eps),
    Agent(step_av, bandit_count, eps),
]
env = Environment(bandits, agents)

for step in range(total_steps):
    env.step()
    if step % 100 == 0:
        # Most recent (step number, per-agent optimal-pull counts) entry.
        print(env.results[-1])
                             

(1, [1, 1])
(101, [7, 2])
(201, [8, 5])
(301, [49, 8])
(401, [113, 8])
(501, [116, 8])
(601, [130, 9])
(701, [188, 9])
(801, [225, 11])
(901, [227, 13])
(1001, [227, 74])
(1101, [228, 167])
(1201, [229, 258])
(1301, [229, 351])
(1401, [230, 442])
(1501, [232, 533])
(1601, [235, 626])
(1701, [235, 716])
(1801, [236, 806])
(1901, [236, 896])
(2001, [236, 985])
(2101, [275, 1037])
(2201, [369, 1038])
(2301, [462, 1038])
(2401, [555, 1038])
(2501, [648, 1039])
(2601, [741, 1040])
(2701, [830, 1114])
(2801, [922, 1204])
(2901, [1014, 1291])
(3001, [1106, 1381])
(3101, [1197, 1470])
(3201, [1285, 1561])
(3301, [1377, 1647])
(3401, [1465, 1737])
(3501, [1557, 1831])
(3601, [1647, 1918])
(3701, [1741, 2011])
(3801, [1805, 2074])
(3901, [1858, 2130])
(4001, [1942, 2217])
(4101, [2036, 2309])
(4201, [2127, 2404])
(4301, [2213, 2499])
(4401, [2305, 2593])
(4501, [2396, 2685])
(4601, [2491, 2775])
(4701, [2581, 2861])
(4801, [2646, 2926])
(4901, [2647, 2964])
(5001, [2649, 3056])
(5101, [2650, 315