# SimpleConf

> Performs updating of state-action values according to a simple Rescorla-Wagner rule, with use of social information. Allows for asymmetric learning rates in the form of a confirmation/disconfirmation bias. 

In [None]:
#| default_exp agents/SimpleConf

In [None]:
#| hide
# Imports for the nbdev development environment

from nbdev.showdoc import *
from fastcore.test import *

# Imports for the examples

import matplotlib.pyplot as plt

In [None]:
#| hide

%load_ext autoreload
%autoreload 2

In [None]:
#| export

import numpy as np
import networkx as nx

In [None]:
#| export

class SimpleConf:
    """
    Class for simple reinforcement learning (Rescorla-Wagner rule)
    with confirmation/disconfirmation bias.

    All methods are vectorised over the population: row `i` of every
    table (Q-table, P-table, choices, payoffs) refers to agent `i`.
    """

    def __init__(self,
                 params: np.ndarray):  # agents' parameters, one row per agent: (alpha_C, alpha_D, beta)
        # Accept any array-like (e.g. nested lists) and work with floats
        params = np.asarray(params, dtype=float)

        self.N = np.shape(params)[0]  # number of agents
        self.M = 2  # number of options
        self.alphac = params[:, 0]   # confirmatory learning rates
        self.alphad = params[:, 1]   # disconfirmatory learning rates
        self.beta = params[:, 2]    # inverse temperatures

    def connect_agents_full(self):
        """Connects agents according to a fully connected graph."""
        return nx.complete_graph(self.N)

    def compute_softmax(self, Qtable):
        """Returns a probability table for all agents for all actions,
        from agents' Qtable.

        Each row of `Qtable` is scaled by the corresponding agent's inverse
        temperature and passed through a softmax, so every row of the
        returned table sums to 1.
        """
        beta = np.reshape(self.beta, (-1, 1))  # column vector: one beta per row
        logits = beta * np.asarray(Qtable)
        # Subtracting the row-wise maximum leaves the softmax unchanged but
        # prevents exp() from overflowing (inf/inf = nan) for large beta*Q
        logits = logits - np.max(logits, axis=1, keepdims=True)
        num = np.exp(logits)  # numerator
        den = np.sum(num, axis=1, keepdims=True)  # denominator
        return num / den

    def choose(self, Ptable):
        """Computes chosen options from agents' probability table.

        Draws one uniform random number per agent and returns the index of
        the option whose cumulative-probability interval contains it.
        Options are labelled from 0 to (number of columns - 1).
        """
        Ptable = np.asarray(Ptable)
        rd = np.random.rand(np.shape(Ptable)[0], 1)  # 1 draw per agent
        # Only the first M-1 cumulative sums are used as thresholds: the last
        # option takes whatever probability mass remains, so a cumulative sum
        # falling slightly short of 1 (rounding) can never yield the
        # out-of-range label M
        thresholds = np.cumsum(Ptable, axis=1)[:, :-1]
        choices = np.sum(rd > thresholds, axis=1)
        return choices.astype(int)  # converts the choices to int values

    def all_take_action(self, Qtable):
        """Computes all agents' choices from their Qtable.
        Combines `compute_softmax` and `choose`.
        """
        Ptable = self.compute_softmax(Qtable)
        choices = self.choose(Ptable)
        return choices

    def update_Qvalues(self, G_att, choices, payoffs, Qtable):
        """Updates all agents' Q-values according to CARL,
        without for loops.

        Intermediate arrays are indexed `[i, k, j]`: observer `i`, observed
        agent `k`, option `j`. Observer `i` learns from agent `k` when `k` is
        `i` itself or a neighbour of `i` in `G_att`, and only about the
        option that `k` actually chose. The prediction error is
        `payoffs[k] - Qtable[i, j]`. It is weighted by the confirmatory
        learning rate when it supports `i`'s own choice (positive error on
        the option `i` chose, or non-positive error on an option `i` did not
        choose) and by the disconfirmatory learning rate otherwise.

        `Qtable` is updated in place and also returned.
        """
        choices = np.asarray(choices)
        payoffs = np.asarray(payoffs, dtype=float)
        ags = np.arange(self.N)

        # deltas[i, k, j] = payoffs[k] - Qtable[i, j]
        deltas = payoffs[None, :, None] - Qtable[:, None, :]
        pos = deltas > 0

        # chosen[a, j] is True when agent a chose option j
        chosen = np.zeros((self.N, self.M), dtype=bool)
        chosen[ags, choices] = True
        # choice_mask selects all actions that have been taken (by agent k)
        choice_mask = chosen[None, :, :]
        # own_mask selects the observer's own action (agent i)
        own_mask = chosen[:, None, :]

        # Confirmatory evidence: good news about my option, or bad news about
        # an option I did not choose; anything else is disconfirmatory
        confirming = own_mask == pos
        alphas = np.where(confirming,
                          self.alphac[:, None, None],
                          self.alphad[:, None, None])

        # obs_ij[i, k] is True when agent i observes agent k;
        # every agent also observes its own outcome (diagonal)
        # TODO: could be implemented at the beginning of the experiment
        obs_ij = np.asarray(nx.adjacency_matrix(G_att).todense()).astype(bool)
        np.fill_diagonal(obs_ij, True)

        # Keep only errors on options that were taken by an observed agent
        learn_mask = choice_mask & obs_ij[:, :, None]
        deltas = np.where(learn_mask, alphas * deltas, 0.0)

        # Sum the weighted errors over observed agents
        Qtable += np.sum(deltas, axis=1)
        return Qtable

## Example agents

To illustrate the methods, let's consider 2 example agents. Both have a confirmation bias. Their parameters are the following:

|         | $\alpha_C$ | $\alpha_D$ | $\beta$ |
| --------| ---------- | ---------- | ------- |
| agent 1 | $0.2$      | $0.1$      | $5.$     |
| agent 2 | $0.15$     | $0.05$     | $4.$     |

* $\alpha_C$: confirmatory learning rate
* $\alpha_D$: disconfirmatory learning rate
* $\beta$: inverse temperature

In [None]:
# Example population: one row per agent, columns (alpha_C, alpha_D, beta)
params = np.array([
    [0.2, 0.1, 5.],    # agent 1
    [0.15, 0.05, 4.],  # agent 2
])

example = SimpleConf(params=params)

## SimpleConf methods

In [None]:
show_doc(SimpleConf.connect_agents_full)

**Input**:

* None

**Output**:

* Fully connected, undirected "attention graph": each agent pays attention to all others.


#### Example

In [None]:
# Connect the example agents with a complete graph
# (every agent is linked to every other agent)
G = example.connect_agents_full()

# Draw the graph in a small figure
plt.figure(figsize=(2.5, 1.5))
nx.draw(G)
plt.show()

In [None]:
show_doc(SimpleConf.compute_softmax)

**Input**: 
* `Qtable`: Q-table, 2d-array: *number of agents* $\times$ *number of options*

|         | option 1  | option 2  |
| --------| --------- | --------- |
| agent 1 | $Q_{1,1}$ | $Q_{1,2}$ |
| agent 2 | $Q_{2,1}$ | $Q_{2,2}$ |


**Output**:
* `Ptable`: P-table, 2d-array: *number of agents* $\times$ *number of options*

|         | option 1  | option 2  |
| --------| --------- | --------- |
| agent 1 | $P_{1,1}$ | $P_{1,2}$ |
| agent 2 | $P_{2,1}$ | $P_{2,2}$ |



#### Softmax policy

Probability that agent $i$ chooses option $j$ given Q-values $Q_{i,k}$, for $k$ any available option:

$$ P_{i,j} = \frac{\exp(\beta_{i} Q_{i,j})}{\sum_{k} \exp(\beta_{i} Q_{i,k})} $$

with $\beta_{i}$ agent $i$'s inverse temperature.



#### Example

With 2 agents, 2 options, all Qs are 0.

In [None]:
# All Q-values start at zero (one row per agent, one column per option)
Qtable = np.zeros((example.N, example.M))
Ptable = example.compute_softmax(Qtable)
Ptable

In [None]:
show_doc(SimpleConf.choose)

**Input**:

* `Ptable`: P-table, 2d-array: *number of agents* $\times$ *number of options*

|         | option 1  | option 2  |
| --------| --------- | --------- |
| agent 1 | $P_{1,1}$ | $P_{1,2}$ |
| agent 2 | $P_{2,1}$ | $P_{2,2}$ |


**Output**:

* `choices`: choice (i.e., chosen option) list, 1d-array: *number of agents*

|         | choice  | 
| --------| ------- | 
| agent 1 | $c_{1}$ | 
| agent 2 | $c_{2}$ | 


NB: options are labelled $0$ to $M-1$, with $M$ the number of available options.


#### Example

In [None]:
# Sample one choice per example agent from the Ptable computed above
choices = example.choose(Ptable)
choices

In [None]:
# Sanity check: options are labelled 0 and 1, so the mean choice over many
# simulations estimates each agent's probability of picking the option
# labelled 1
n_sims = 10000
choices_test = np.zeros((2, n_sims))
for sim in range(n_sims):  # one independent draw per simulation
    choices_test[:, sim] = example.choose(Ptable)

choices_test.mean(axis=1)  # average choice per agent

In [None]:
show_doc(SimpleConf.all_take_action)

**Input**:

* `Qtable`: Q-table, 2d-array: *number of agents* $\times$ *number of options*

|         | option 1  | option 2  |
| --------| --------- | --------- |
| agent 1 | $Q_{1,1}$ | $Q_{1,2}$ |
| agent 2 | $Q_{2,1}$ | $Q_{2,2}$ |


**Output**:

* `choices`: choice (i.e., chosen option) list, 1d-array: *number of agents*

|         | choice  | 
| --------| ------- | 
| agent 1 | $c_{1}$ | 
| agent 2 | $c_{2}$ | 


#### Example

In [None]:
# Go from Q-values to choices in one step (softmax, then sampling),
# using the Qtable defined above
choices = example.all_take_action(Qtable)
choices

In [None]:
# Sanity check: options are labelled 0 and 1, so the mean choice over many
# simulations estimates each agent's probability of picking the option
# labelled 1
n_sims = 10000
choices_test = np.zeros((2, n_sims))
for sim in range(n_sims):  # one independent draw per simulation
    choices_test[:, sim] = example.all_take_action(Qtable)

choices_test.mean(axis=1)  # average choice per agent

In [None]:
show_doc(SimpleConf.update_Qvalues)

**Input**:

* `G_att`: attention graph, obtained through `connect_agents_full`
* `choices`: choice (i.e., chosen option) list, 1d-array: *number of agents*

|         | choice  | 
| --------| ------- | 
| agent 1 | $c_{1}$ | 
| agent 2 | $c_{2}$ | 

* `payoffs`: payoff list returned by task, 1d-array: *number of agents*

|         | payoff  | 
| --------| ------- | 
| agent 1 | $r_{1}$ | 
| agent 2 | $r_{2}$ | 

* `Qtable`: Q-table, 2d-array: *number of agents* $\times$ *number of options*

|         | option 1  | option 2  |
| --------| --------- | --------- |
| agent 1 | $Q_{1,1}$ | $Q_{1,2}$ |
| agent 2 | $Q_{2,1}$ | $Q_{2,2}$ |



**Output**:
* `Qtable`: updated Q-table, 2d-array: *number of agents* $\times$ *number of options*

|         | option 1  | option 2  |
| --------| --------- | --------- |
| agent 1 | $Q_{1,1}$ | $Q_{1,2}$ |
| agent 2 | $Q_{2,1}$ | $Q_{2,2}$ |


In [None]:
#| hide
# import nbdev; nbdev.nbdev_export()