In [None]:
# Reinforcement learning basics

: 

In [None]:
import random as rnd

class bandit_machine:
    """
    Simulated bandit machine that keeps a running sample mean of its rewards.

    Every pull pays 1 with probability ``p`` and 0 otherwise. ``p_estimated``
    is the plain average of the rewards passed to ``update`` so far.
    """
    def __init__(self, name: str, probability: float):
        """
        Constructor to initialize a bandit machine
        :var name: name of the machine
        :var probability: probability to gain a reward
        """
        self.name = name                # Name of the machine
        self.p = probability            # Probability to gain a reward
        self.p_estimated = 0            # Estimated probability for each iteration
        # Starts at 0: no arm has been pulled yet. Starting at 1 would average a
        # phantom zero reward into every estimate and over-count pulls by one.
        self.n_samples = 0              # Number of times we pull the arm
        self.p_estimated_cache = [0]    # Not needed by the algorithm. Only to plot the sample mean later on

    def pull(self) -> int:
        """
        Method to simulate a result after pulling the arm from the machine
        :return: 1 if you got a reward, 0 if you didn't
        """
        return int(rnd.random() < self.p)

    def update(self, x: int) -> None:
        """
        Method to update the estimated probability and number of samples parameters
        :var x: reward obtained from the last pull (0 or 1)
        """
        self.n_samples += 1
        # Running mean: previous total (n - 1 samples) plus the new reward, over n
        self.p_estimated = ((self.n_samples - 1) * self.p_estimated + x) / self.n_samples
        self.p_estimated_cache.append(self.p_estimated)


: 

In [None]:
from matplotlib import pyplot as plt

def plot_function(name, x_values, y_values):
    """
    Draw a single curve of estimated means against iterations and show it
    :var name: text used as the figure title
    :var x_values: x coordinates handed to pyplot
    :var y_values: y coordinates handed to pyplot
    """
    plt.plot(x_values, y_values, '-')   # '-' joins consecutive points with a solid line
    # Decorate the axes that plt.plot just drew on
    axes = plt.gca()
    axes.set_title(f"{name}")
    axes.set_xlabel("Iterations")
    axes.set_ylabel("Estimated mean")
    plt.show()

: 

In [None]:
# Build one machine with a reward probability of 0.75
machine_test = bandit_machine(name='test_machine', probability=0.75)

# Smoke test: pull the arm once and print the outcome (1 = reward, 0 = no reward)
print(machine_test.name, machine_test.pull())

: 

In [None]:
# Create the bandit machines
probabilities = [0.35, 0.5, 0.75]
machines = [bandit_machine(f'machine_{i+1}', p) for i, p in enumerate(probabilities)]

# Initial parameters
n_experiments = 200
epsilon = 0.1   # Probability of exploring a random machine instead of exploiting

# Experiment (epsilon-greedy)
for _ in range(n_experiments):
    # Select the machine to run
    if rnd.random() < epsilon:
        # Explore: uniform choice over all machines, however many were created
        selected_machine = rnd.choice(machines)
    else:
        # Exploit: machine with the highest current estimate (first one wins ties)
        selected_machine = max(machines, key=lambda m: m.p_estimated)

    # Pull the machine
    reward = selected_machine.pull()
    selected_machine.update(reward)


# Plot results
for m in machines:
    # The cache holds the initial estimate plus one entry per update, so this is
    # the true number of pulls regardless of the value n_samples started from.
    n_pulls = len(m.p_estimated_cache) - 1
    print(f'{m.name} ({n_pulls} pulls):')
    print(f'The bandit has a estimated probability of {round(m.p_estimated, 2)}/{m.p}')
    x_points = list(range(len(m.p_estimated_cache)))
    plot_function(m.name, x_points, m.p_estimated_cache)


: 

In [None]:
class bandit_machine:
    """
    Bandit machine for the "optimistic initial values" strategy.

    The running estimate starts from a deliberately high value, so a greedy
    selection rule keeps trying a machine until real rewards pull its
    estimate down.
    """
    def __init__(self, name: str, probability: float, initial_estimate: float = 2.):
        """
        Constructor to initialize a bandit machine
        :var name: name of the machine
        :var probability: probability to gain a reward
        :var initial_estimate: optimistic starting value of the estimate (default 2.)
        """
        self.name = name                        # Name of the machine
        self.p = probability                    # Probability to gain a reward
        self.p_estimated = initial_estimate     # Estimated probability for each iteration
        # The optimistic start is averaged in as if it were one observed sample,
        # so n_samples is always (number of pulls + 1) for this class.
        self.n_samples = 1
        self.p_estimated_cache = [self.p_estimated]    # Not needed by the algorithm. Only to plot the sample mean later on

    def pull(self) -> int:
        """
        Method to simulate a result after pulling the arm from the machine
        :return: 1 if you got a reward, 0 if you didn't
        """
        return int(rnd.random() < self.p)

    def update(self, x: int) -> None:
        """
        Method to update the estimated probability and number of samples parameters
        :var x: reward obtained from the last pull (0 or 1)
        """
        self.n_samples += 1
        # Running mean: previous total (n - 1 samples) plus the new reward, over n
        self.p_estimated = ((self.n_samples - 1) * self.p_estimated + x) / self.n_samples
        self.p_estimated_cache.append(self.p_estimated)
        

: 

In [None]:
# Create the bandit machines
probabilities = [0.35, 0.5, 0.75]
machines_1 = [bandit_machine(f'machine_{i+1}', p) for i, p in enumerate(probabilities)]

# Initial parameters (no epsilon here: the selection below is purely greedy)
n_experiments = 500

# Experiment (greedy on optimistic initial values)
for _ in range(n_experiments):
    # Select the machine with the highest current estimate (first one wins ties)
    selected_machine = max(machines_1, key=lambda m: m.p_estimated)

    # Pull the machine
    reward = selected_machine.pull()
    selected_machine.update(reward)


# Plot results
for m in machines_1:
    # The cache holds the initial estimate plus one entry per update, so this is
    # the true number of pulls (n_samples also counts the initial value).
    n_pulls = len(m.p_estimated_cache) - 1
    print(f'{m.name} ({n_pulls} pulls):')
    print(f'The bandit has a estimated probability of {round(m.p_estimated, 2)}/{m.p}')
    x_points = list(range(len(m.p_estimated_cache)))
    plot_function(m.name, x_points, m.p_estimated_cache)

: 

In [None]:
# Estimate histories of the third machine (true p = 0.75) under each strategy
m_eg = machines[2]      # epsilon-greedy run
m_ov = machines_1[2]    # optimistic-initial-values run

x_points = list(range(100))

# A machine that was pulled fewer than 99 times has a shorter history than
# x_points; trim x to each history so plt.plot never gets mismatched lengths.
eg_history = m_eg.p_estimated_cache[:len(x_points)]
ov_history = m_ov.p_estimated_cache[:len(x_points)]

plt.plot(x_points[:len(eg_history)], eg_history, '-', label="Epsilon greedy")
plt.plot(x_points[:len(ov_history)], ov_history, '-', label="Optimistic initial values")
plt.xlabel("Iterations")
plt.ylabel("Estimated mean")
plt.title("Epsilon greedy VS Optimistic initial values")
plt.legend()   # Tell the two curves apart
plt.show()

: 

In [None]:
from math import log

class bandit_machine:
    """
    Bandit machine tracking a running mean of rewards, starting from the value 2
    """
    def __init__(self, name: str, probability: float):
        """
        Set up a machine with its true reward probability
        :var name: name of the machine
        :var probability: probability to gain a reward
        """
        self.name, self.p = name, probability
        # The starting estimate counts as the first sample, hence n_samples = 1
        self.p_estimated = 2
        self.n_samples = 1
        # History of estimates; used only for plotting
        self.p_estimated_cache = [self.p_estimated]

    def pull(self) -> int:
        """
        Simulate one pull of the arm
        :return: 1 if you got a reward, 0 if you didn't
        """
        rewarded = rnd.random() < self.p
        return 1 if rewarded else 0

    def update(self, x: int):
        """
        Fold a new reward into the running mean and record the new estimate
        :var x: reward obtained
        """
        # Total implied by the current mean, taken before the counter advances
        previous_total = self.n_samples * self.p_estimated
        self.n_samples += 1
        self.p_estimated = (previous_total + x) / self.n_samples
        self.p_estimated_cache.append(self.p_estimated)


def apply_weight(x_mean, n_plays, n_samples):
    """
    Add the UCB1 exploration bonus sqrt(2 * ln(n_plays) / n_samples) to a mean
    :var x_mean: current estimated mean of the machine
    :var n_plays: value passed to the logarithm (total plays so far)
    :var n_samples: divisor of the bonus (sample count of this machine)
    :return: x_mean plus the exploration bonus
    """
    exploration_bonus = (2 * log(n_plays) / n_samples) ** 0.5
    return x_mean + exploration_bonus

: 

In [None]:
# Create the bandit machines
probabilities = [0.35, 0.5, 0.75]
machines_2 = [bandit_machine(f'machine_{i+1}', p) for i, p in enumerate(probabilities)]

# Initial parameters (no epsilon here: UCB1 explores through its confidence bonus)
n_experiments = 500

# Experiment (UCB1)
for i in range(n_experiments):
    n_plays = i + 1   # 1-based play counter fed to the logarithm in apply_weight
    # Select the machine with the highest upper confidence bound (first one wins
    # ties). Each score is computed once instead of being recomputed and matched
    # by float equality.
    selected_machine = max(
        machines_2,
        key=lambda m: apply_weight(m.p_estimated, n_plays, m.n_samples),
    )

    # Pull the machine
    reward = selected_machine.pull()
    selected_machine.update(reward)


# Plot results
for m in machines_2:
    # The cache holds the initial estimate plus one entry per update, so this is
    # the true number of pulls (n_samples also counts the initial value).
    n_pulls = len(m.p_estimated_cache) - 1
    print(f'{m.name} ({n_pulls} pulls):')
    print(f'The bandit has a estimated probability of {round(m.p_estimated, 2)}/{m.p}')
    x_points = list(range(len(m.p_estimated_cache)))
    plot_function(m.name, x_points, m.p_estimated_cache)

: 

In [None]:
# Estimate histories of the third machine (true p = 0.75) under each strategy.
# The variable names are kept from the previous comparison cell:
m_eg = machines_1[2]    # optimistic-initial-values run
m_ov = machines_2[2]    # UCB1 run

x_points = list(range(100))

# A machine that was pulled fewer than 99 times has a shorter history than
# x_points; trim x to each history so plt.plot never gets mismatched lengths.
optimistic_history = m_eg.p_estimated_cache[:len(x_points)]
ucb1_history = m_ov.p_estimated_cache[:len(x_points)]

plt.plot(x_points[:len(optimistic_history)], optimistic_history, '-', label="Optimistic initial values")
plt.plot(x_points[:len(ucb1_history)], ucb1_history, '-', label="UCB1")
plt.xlabel("Iterations")
plt.ylabel("Estimated mean")
plt.title("Optimistic initial values VS UCB1")
plt.legend()   # Tell the two curves apart
plt.show()

: 

## Bayesian bandits

In [None]:
import numpy as np
from scipy.stats import beta

class bandit_machine:
    """
    Bandit machine that keeps Beta(a, b) parameters updated from its rewards
    """
    def __init__(self, name: str, probability: float):
        """
        Set up a machine with its true reward probability and a Beta(1, 1) start
        :var name: name of the machine
        :var probability: probability to gain a reward
        """
        self.name = name
        self.p = probability
        # a = 1 + rewards seen, b = 1 + failures seen (both start at 1)
        self.a, self.b = 1, 1
        # Number of pulls recorded through update()
        self.N = 0

    def pull(self) -> int:
        """
        Simulate one pull of the arm
        :return: 1 if you got a reward, 0 if you didn't
        """
        return 1 if rnd.random() < self.p else 0

    def sample(self):
        """
        Draw one random value from Beta(a, b) using the current parameters
        """
        alpha, beta_param = self.a, self.b
        return np.random.beta(alpha, beta_param)

    def update(self, x: int):
        """
        Record the outcome of a pull in the Beta parameters and the pull counter
        :var x: reward obtained
        """
        self.N += 1
        self.a += x        # a reward (x = 1) increments a
        self.b += 1 - x    # a failure (x = 0) increments b


def plot(machines, trial):
    """
    Plot the Beta(a, b) density of every machine on one figure and show it
    :var machines: iterable of objects exposing ``a``, ``b``, ``p`` and ``N``
    :var trial: number shown in the figure title
    """
    grid = np.linspace(0, 1, 200)
    for machine in machines:
        density = beta.pdf(grid, machine.a, machine.b)
        # a starts at 1, so a - 1 is the number of rewards recorded so far
        legend_text = f"real p: {machine.p:.4f}, win rate = {machine.a - 1}/{machine.N}"
        plt.plot(grid, density, label=legend_text)
    plt.title(f"Bandit distributions after {trial} trials")
    plt.legend()
    plt.show()

: 

In [None]:
# Create the bandit machines
# NOTE(review): this rebinds `machines`, the name the epsilon-greedy cells use;
# re-running those earlier cells after this one will see these objects instead.
probabilities = [0.35, 0.5, 0.75]
machines = [
    bandit_machine(f'machine_{number}', p)
    for number, p in enumerate(probabilities, start=1)
]

# Initial parameters
n_experiments = 2000
sample_points = [5,100,500,1999]     # Iterations at which the posteriors are plotted
rewards = np.zeros(n_experiments)    # Reward obtained at each iteration


for i in range(n_experiments):
    # Thompson sampling: one draw per machine, the largest draw picks the arm
    draws = [b.sample() for b in machines]
    j = np.argmax(draws)

    # Plot the posteriors as they stand before this iteration's pull
    if i in sample_points:
        plot(machines, i)

    # Pull the arm of the chosen machine and log the reward
    chosen = machines[j]
    x = chosen.pull()
    rewards[i] = x

    # Update the distribution of the machine whose arm we just pulled
    chosen.update(x)

# Summary
total_reward = rewards.sum()
print("total reward earned:", total_reward)
print("overall win rate:", total_reward / n_experiments)
print("num times selected each bandit:", [b.N for b in machines])

: 