# A simplified approach to the K-armed bandit problem

In [91]:
import numpy as np

In [92]:
class BernoulliBandit:
    """A single bandit arm that pays 1 with probability ``p`` and 0 otherwise."""

    def __init__(self, p, verbose=True):
        """Remember the success probability and optionally announce the arm.

        Args:
            p: Success probability passed to ``np.random.binomial`` on each pull.
            verbose: When truthy, print a one-line creation message.
        """
        if verbose:
            print("Creating Bernoulli Bandit with p = {:.2f}".format(p))
        self.p = p

    def pull(self):
        """Return the outcome of one Bernoulli trial with success chance ``self.p``."""
        n_trials = 1  # a single pull of the arm per call
        reward = np.random.binomial(n_trials, self.p)
        return reward

In [93]:
# Smoke test: build one arm with success probability 0.6 and print a single pull.
c1 = BernoulliBandit(0.6)
print(c1.pull())

Creating Bernoulli Bandit with p = 0.60
0


In [94]:
import random

In [95]:
class Banditsgameclass:
    """A K-armed bandit game played for T rounds against Bernoulli arms."""

    def __init__(self, K, T, verbose=True):
        """Create K arms, each with a success probability from ``np.random.uniform()``.

        Args:
            K: Number of arms.
            T: Number of rounds played by ``stochastic_algo``.
            verbose: Forwarded to every arm and used to toggle per-round printing.
        """
        self.K = K
        self.T = T
        # Build the arms one at a time; each one is a BernoulliBandit instance.
        arms = []
        for _ in range(K):
            arms.append(BernoulliBandit(np.random.uniform(), verbose))
        self.bandits = arms
        self.verbose = verbose

    def stochastic_algo(self):
        """Play T rounds, picking an arm uniformly at random in every round.

        Returns:
            A 1-D float array of length T holding the reward of each round.
        """
        n_rounds = self.T
        rewards = np.zeros(n_rounds)
        for step in range(n_rounds):
            arm = random.randrange(self.K)
            rewards[step] = self.bandits[arm].pull()
            if not self.verbose:
                continue
            print("T={} Playing Bandit {} Reward is {:.2f}".format(step, arm, rewards[step]))
        return rewards

In [101]:
# Play one verbose game (3 arms, 20 rounds); the bare last expression makes the
# notebook display the returned reward array.
game = Banditsgameclass(K=3 , T= 20)
game.stochastic_algo()

Creating Bernoulli Bandit with p = 0.95
Creating Bernoulli Bandit with p = 0.40
Creating Bernoulli Bandit with p = 0.79
T=0 Playing Bandit 0 Reward is 1.00
T=1 Playing Bandit 0 Reward is 1.00
T=2 Playing Bandit 1 Reward is 1.00
T=3 Playing Bandit 2 Reward is 1.00
T=4 Playing Bandit 0 Reward is 1.00
T=5 Playing Bandit 0 Reward is 1.00
T=6 Playing Bandit 2 Reward is 1.00
T=7 Playing Bandit 0 Reward is 1.00
T=8 Playing Bandit 1 Reward is 0.00
T=9 Playing Bandit 0 Reward is 1.00
T=10 Playing Bandit 2 Reward is 1.00
T=11 Playing Bandit 0 Reward is 1.00
T=12 Playing Bandit 2 Reward is 1.00
T=13 Playing Bandit 1 Reward is 0.00
T=14 Playing Bandit 0 Reward is 0.00
T=15 Playing Bandit 2 Reward is 0.00
T=16 Playing Bandit 1 Reward is 0.00
T=17 Playing Bandit 1 Reward is 1.00
T=18 Playing Bandit 2 Reward is 1.00
T=19 Playing Bandit 0 Reward is 1.00


array([1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0.,
       1., 1., 1.])

# Running the bandit game multiple times and averaging the rewards

In [102]:
def run_simulation(n_runs, runs_per_game, K, T):
    """Estimate the mean per-step reward of the random-arm strategy.

    Runs ``n_runs`` batches of ``runs_per_game`` freshly created games each,
    averages the length-T reward vectors within every batch, then averages
    the batch means.

    Args:
        n_runs: Number of batches; must be at least 1.
        runs_per_game: Number of games per batch; must be at least 1.
        K: Number of arms in every game.
        T: Number of rounds in every game.

    Returns:
        Array of shape (K, T). Each game yields a single length-T reward
        vector, so all K rows are identical copies of the averaged vector;
        the (K, T) shape is kept so existing callers that reduce over axis 0
        keep working.

    Raises:
        ValueError: If ``n_runs`` or ``runs_per_game`` is smaller than 1
            (the averages would otherwise be a silent 0/0 = NaN).
    """
    if n_runs < 1 or runs_per_game < 1:
        raise ValueError("n_runs and runs_per_game must both be at least 1")

    mean_reward = np.zeros(T)

    for _batch in range(n_runs):
        batch_total = np.zeros(T)

        # Distinct loop variable: the inner loop must not reuse the outer one.
        for _game in range(runs_per_game):
            game = Banditsgameclass(K=K, T=T, verbose=False)
            batch_total += game.stochastic_algo()

        mean_reward += batch_total / runs_per_game

    mean_reward = mean_reward / n_runs

    # Duplicate the vector into K rows to preserve the historical return shape.
    return np.tile(mean_reward, (K, 1))

In [98]:
# Averaged rewards for 10 arms over 20 steps (10 batches of 100 games each).
stochastic_results = run_simulation(n_runs = 10, runs_per_game = 100, T=20, K=10)

In [99]:
# NOTE(review): stochastic_results1 (K=3, T=1000) is not read anywhere in this
# cell; the statistics below summarise stochastic_results from the earlier run.
stochastic_results1 = run_simulation(n_runs=10, runs_per_game=100, K=3, T=1000)
# Collapse the first (length-K) axis to a length-T vector. np.atleast_2d makes
# this reassignment idempotent: re-running the cell on the already 1-D array
# leaves it unchanged instead of reducing it to a scalar.
stochastic_results = np.atleast_2d(stochastic_results).mean(axis=0)
print("Mean reward: {:.2f}".format(stochastic_results.mean()))
print("G: {:.2f}".format(stochastic_results.sum()))



Mean reward: 0.49
G: 9.90


In [100]:

# Display the per-step averages computed in the previous cell.
stochastic_results


array([0.483, 0.518, 0.466, 0.493, 0.513, 0.485, 0.495, 0.489, 0.492,
       0.483, 0.486, 0.485, 0.502, 0.498, 0.506, 0.477, 0.506, 0.521,
       0.48 , 0.52 ])