# MULTI ARMED BANDITS IN PYTHON

**epsilon-greedy action value method for solving the k-armed Bandit problem**

In [1]:
import numpy as np

In [2]:
class Bandits:
    """
    numBandits = number of bandits
    maxMean    = bandit value cap
    maxDev     = bandit standard deviation cap
    steps      = total rounds
    """
    def __init__(self, numBandits, maxMean, maxDev, steps):
        self.numBandits = numBandits
        self.maxMean    = maxMean
        self.maxDev     = maxDev
        self.steps      = steps

        # initialising explore and exploit counters to 0
        self.explore, self.exploit = 0, 0

        # initialising bandits
        self.initBandits()
    
    # randomly initialises bandit parameters within range with uniform distribution
    def initBandits(self):
        self.means      = np.random.rand(self.numBandits)*self.maxMean  # bandit value
        self.stdevs     = np.random.rand(self.numBandits)*self.maxDev   # bandit standard deviation
        self.estValInit = np.random.rand(self.numBandits)*self.maxMean  # initial bandit value estimate
    
    def simulate(self, epsilon):
        # resetting simulation parameters
        k, agent = 0, 0
        self.explore, self.exploit = 0, 0
        self.reward = np.zeros(self.steps)        # step-wise reward 
        self.chosen = np.zeros(self.numBandits)   # bandit choice counter
        self.estVal = np.array(self.estValInit)   # common initial values between runs
        
        for k in range(self.steps):
            greed = np.random.rand()
            
            if greed <= epsilon:
                self.explore += 1
                agent = np.random.randint(low=0, high=self.numBandits)   # chose random bandit
            else:
                self.exploit += 1
                agent = np.argmax(self.estVal)     # chose bandit with highest estimated value
            
            self.reward[k] = np.random.normal(loc=self.means[agent], scale=self.stdevs[agent])  # generate step reward
            self.chosen[agent] += 1                                                             # increment bandit counter
            self.estVal[agent] += (self.reward[k] - self.estVal[agent])/(self.chosen[agent])    # update estimated value of bandit
    
    def results(self, epsilon):
        self.simulate(epsilon)
        
        print("Simulation Results for ε =", epsilon, "greedy", self.numBandits, "bandits", "running for", self.steps, "steps :=")
        for k in range(self.numBandits):
            print("Bandit", k+1, ":=",
                  "est. mean =", self.estVal[k],
                  "\tact. mean =", self.means[k],
                  "\tstdev =", self.stdevs[k],
                  "\tchosen", int(self.chosen[k]), "times")
        print("explore counter =", self.explore)
        print("exploit counter =", self.exploit)
        print("total reward =", np.sum(self.reward))
        print("regret =", np.max(self.means)*self.steps)
        print("% regret =", 100*(1-np.sum(self.reward)/(np.max(self.means)*self.steps)))

creating Bandit object

In [3]:
bandit = Bandits(numBandits=5, maxMean=10, maxDev=3, steps=1000)

running game with the same bandits for different values of epsilon

In [9]:
bandit.results(epsilon=0.1)

Simulation Results for ε = 0.1 greedy 5 bandits running for 1000 steps :=
Bandit 1 := est. mean = 7.94266628744492 	act. mean = 8.120189756046312 	stdev = 2.241590169669475 	chosen 21 times
Bandit 2 := est. mean = 8.197547077701651 	act. mean = 8.134411592018962 	stdev = 0.6804241104530381 	chosen 882 times
Bandit 3 := est. mean = 3.426018356053969 	act. mean = 2.8791345646961766 	stdev = 1.30273168340194 	chosen 8 times
Bandit 4 := est. mean = 1.9678802116751855 	act. mean = 1.0072114483586947 	stdev = 2.779074488092154 	chosen 27 times
Bandit 5 := est. mean = 8.136792157901455 	act. mean = 8.047281234129215 	stdev = 1.845343216892775 	chosen 62 times
explore counter = 92
exploit counter = 908
total reward = 7982.054540922757
regret = 8134.4115920189615
% regret = 1.8729941234555736
