# TwoArmedBandit

> Defines a two-armed bandit task with the possibility of reversals.

In [None]:
#| default_exp envs/TwoArmedBandit

In [None]:
#| hide
# Imports for the nbdev development environment

from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| hide

%load_ext autoreload
%autoreload 2

In [None]:
#| export

import numpy as np

In [None]:
#| export

class TwoArmedBandit:
    """
    Class for defining a simple two-armed bandit task.

    Each arm pays `rew` with its own reward probability and `pun` otherwise.
    The probabilities of the two arms can be swapped with `reversal_occurs`.

    Parameters
    ----------
    p_0, p_1 : probability that option 0 (resp. option 1) yields a reward
    rew : reward value (usually 1)
    pun : punishment value (usually 0 or -1)
    """

    def __init__(self, p_0, p_1, rew, pun):
        self.M = 2  # number of arms
        self.rew = rew  # reward value (usually 1)
        self.pun = pun  # punishment value (usually 0 or -1)
        # probs[i] is the probability that option i yields a reward
        self.probs = [p_0, p_1]

    def return_payoffs(self, choices):
        """Returns reward with probability p and punishment with probability
        (1-p).

        Parameters
        ----------
        choices : sequence of arm indices (0 or 1), one entry per agent

        Returns
        -------
        Float array of the same length as `choices`; entry i is `rew` if the
        arm picked by agent i paid out and `pun` otherwise.
        """
        n_choices = len(choices)
        # Reward probability of the arm picked by each agent.
        probs_now = np.array(self.probs)[list(choices)]
        # One uniform draw per agent, taken from the half-open interval [0, 1).
        rd = np.random.rand(n_choices)
        # Strict comparison: because a draw can be exactly 0 but never 1,
        # `<` rewards with probability exactly p, so an arm with p == 0 never
        # pays out and an arm with p == 1 always does.
        mask = rd < probs_now
        payoffs = np.ones(n_choices)
        payoffs[mask] = self.rew  # attribute reward
        payoffs[~mask] = self.pun  # attribute punishment
        return payoffs

    def reversal_occurs(self):
        """Introduces a reversal in reward probabilities.

        The probabilities of the two options are swapped on this instance;
        nothing is returned.
        """
        self.probs = self.probs[::-1]

    def output_probs(self):
        """Outputs np array of probabilities associated to each option.

        A new array is built on every call, so modifying the result does not
        change the probabilities stored on the instance.
        """
        return np.array(self.probs)

## Example environment

We define a two-armed bandit task with the following reward probabilities: $0.9$ for the first arm and $0.7$ for the second arm. For example, the first arm returns a reward, which we set to $+1$, with probability $0.9$, and a penalty, which we set to $-1$, with probability $1 - 0.9 = 0.1$.

In [None]:
# Define example environment
p_0 = 0.9  # probability that option 0 yields a reward
p_1 = 0.7  # probability that option 1 yields a reward
rew = 1  # payoff returned when the chosen option yields a reward
pun = -1  # payoff returned when the chosen option yields a punishment

# This instance is reused by the method examples in the cells below.
example = TwoArmedBandit(p_0, p_1, rew, pun)

## TwoArmedBandit methods

In [None]:
show_doc(TwoArmedBandit.return_payoffs)

---

### TwoArmedBandit.return_payoffs

>      TwoArmedBandit.return_payoffs (choices)

Returns reward with probability p and punishment with probability
(1-p).

**Input**:

* agents' choices (array)

**Output**:

* an array containing the payoffs that result from agents' choices


#### Example

In [None]:
# Define random choices (example with two agents)
# randint's upper bound is exclusive, so each of the 2 choices is 0 or 1.
choices = np.random.randint(0, 2, 2)
print("Choices:", choices)
# One payoff per agent; being the last expression, it is shown as cell output.
example.return_payoffs(choices)

Choices: [1 0]


array([ 1., -1.])

In [None]:
show_doc(TwoArmedBandit.output_probs)

---

### TwoArmedBandit.output_probs

>      TwoArmedBandit.output_probs ()

Outputs np array of probabilities associated to each option.

**Input**:

* none

**Output**:

* an array containing the bandits' reward probabilities


#### Example

In [None]:
# Output bandits' probabilities
# (entry i is the reward probability of option i)
example.output_probs()

array([0.9, 0.7])

In [None]:
show_doc(TwoArmedBandit.reversal_occurs)

---

### TwoArmedBandit.reversal_occurs

>      TwoArmedBandit.reversal_occurs ()

Introduces a reversal in reward probabilities.

**Input**:

* none

**Output**:

* none (the two reward probabilities are swapped on the environment itself; call `output_probs` to read the new values)


#### Example

In [None]:
# Output bandits' probabilities
print("Probabilities before reversal:", example.output_probs())

# Reversal occurs
# (swaps the reward probabilities of the two options; returns nothing)
example.reversal_occurs()

# Output bandits' new probabilities
print("Probabilities after reversal:", example.output_probs())

Probabilities before reversal: [0.9 0.7]
Probabilities after reversal: [0.7 0.9]
