### Development notebook

> **Todos**

- [ ] Sanity-check the implementation of the iterated prisoner's dilemma (IPD) in which agent two observes the action of agent one
- [ ] Test functions and visualize Q-value trajectories

In [5]:
%load_ext autoreload
%autoreload 2

from typing import Dict, List, Tuple, Union
from gym import Space, spaces
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from prisoners_dilemma import utils
from prisoners_dilemma.env import PrisonersDilemmaEnv

# Plot styling for the notebook. The order of these statements matters: later lines
# are applied after (and may override) what the earlier seaborn calls configure.
# Seaborn 'notebook' context, font scale 1.1, default figure size (7, 4).
sns.set('notebook', font_scale=1.1, rc={'figure.figsize': (7, 4)})
# 'ticks' style; figure and axes face colours are passed as 'none'.
sns.set_style('ticks', rc={'figure.facecolor': 'none', 'axes.facecolor': 'none'})
# Request SVG as the inline figure format.
%config InlineBackend.figure_format = 'svg'
# Assigned after the seaborn calls; presumably overrides the 'none' figure face
# colour passed to set_style above -- TODO confirm this is the intended final value.
matplotlib.rcParams['figure.facecolor'] = 'white'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Experiment configuration, built in one literal.
# Top-level keys describe the run; 'payoffs' holds the four payoff values handed
# to the environment; 'params' holds one learning-rate / exploration / discount
# entry per agent (index 0 = agent one, index 1 = agent two).
config = dict(
    num_simuls=1,
    num_episodes=500,
    num_agents=2,
    num_actions=2,
    verbose=False,
    init_type='zeros',
    payoffs=dict(
        reward_payoff=2,
        tempta_payoff=3,
        sucker_payoff=0,
        punish_payoff=1,
    ),
    params=dict(
        alpha=np.array([0.1, 0.1]),
        eps=np.array([0.2, 0.2]),
        gamma=np.array([0.5, 0.5]),
    ),
)

# Example call, left disabled because run_extended_ipd_exp is defined in a later cell:
#q_traj_one, q_traj_two, rewards_seq, actions_seq = run_extended_ipd_exp(config)

In [None]:
def _epsilon_greedy(q_values: np.ndarray, eps: float, action_space) -> int:
    '''
    Pick an action index from a 1-D vector of Q-values, epsilon-greedily.

    With probability `eps` the action is drawn from `action_space.sample()`;
    otherwise one of the actions attaining the maximum Q-value is chosen
    uniformly at random (random tie-breaking).
    '''
    if np.random.random() < eps:  # Explore
        return int(action_space.sample())
    # Exploit. `q_values` must be 1-D here: on a 2-D slice, argwhere-style
    # index extraction would also yield column indices and corrupt the
    # candidate set.
    best_actions = np.flatnonzero(q_values == q_values.max())
    return int(np.random.choice(best_actions))


def run_extended_ipd_exp(config: Dict) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    '''
    Run an iterated prisoner's dilemma in which player two observes player one's action.

    Player one keeps one Q-value per own action. Player two keeps a Q-table
    indexed as [own action, action of player one] and chooses its action from
    the column that matches player one's current action.

    Args:
        config: Experiment settings. Keys read here:
            - config['payoffs'] with 'reward_payoff', 'tempta_payoff',
              'sucker_payoff', 'punish_payoff' (passed to PrisonersDilemmaEnv
              in that order),
            - config['num_episodes'], config['num_agents'] (two actions are
              stored per episode, so this is expected to be 2),
            - config['params'] with per-agent 'alpha', 'eps', 'gamma'
              (index 0 = player one, index 1 = player two).

    Returns:
        A tuple (q_traj_one, q_traj_two, rewards_seq, action_seq):
            - q_traj_one: (num_episodes, n_actions) Q-values of player one
              after each episode,
            - q_traj_two: (num_episodes, n_actions, n_actions) Q-table of
              player two after each episode,
            - rewards_seq: (num_episodes, num_agents) rewards per episode,
            - action_seq: (num_episodes, num_agents) integer actions per episode.
    '''
    payoffs = config['payoffs']
    game_env = PrisonersDilemmaEnv(
        payoffs['reward_payoff'],
        payoffs['tempta_payoff'],
        payoffs['sucker_payoff'],
        payoffs['punish_payoff'],
    )

    num_episodes = config['num_episodes']
    num_agents = config['num_agents']
    num_actions = game_env.action_space.n
    alpha = config['params']['alpha']
    eps = config['params']['eps']
    gamma = config['params']['gamma']

    # Q-tables: player one is indexed by own action only,
    # player two by (own action, action of player one).
    q_table_one = np.zeros(num_actions)
    q_table_two = np.zeros((num_actions, num_actions))

    q_traj_one = np.zeros((num_episodes, num_actions))
    q_traj_two = np.zeros((num_episodes, num_actions, num_actions))
    rewards_seq = np.zeros((num_episodes, num_agents))
    action_seq = np.zeros((num_episodes, num_agents), dtype=int)

    for episode_i in range(num_episodes):

        # Player one acts on its own Q-values.
        act_one = _epsilon_greedy(q_table_one, eps[0], game_env.action_space)

        # Player two conditions on player one's action. Indexing with the
        # scalar `act_one` yields a 1-D column, so the greedy candidates are
        # exactly the row indices (own actions) that attain the column maximum.
        act_two = _epsilon_greedy(q_table_two[:, act_one], eps[1], game_env.action_space)

        # Take a step.
        actions = np.array([act_one, act_two])
        _, rewards, _, _, _ = game_env.step(action=actions)

        # Q-learning updates. The bootstrap term is the maximum over the same
        # table (player one) / the same observed column (player two).
        q_table_one[act_one] += alpha[0] * (
            rewards[0] + gamma[0] * q_table_one.max() - q_table_one[act_one]
        )
        q_table_two[act_two, act_one] += alpha[1] * (
            rewards[1] + gamma[1] * q_table_two[:, act_one].max() - q_table_two[act_two, act_one]
        )

        # Store trajectories (snapshots of the tables after the update).
        rewards_seq[episode_i, :] = rewards
        action_seq[episode_i, :] = actions
        q_traj_one[episode_i, :] = q_table_one
        q_traj_two[episode_i, :, :] = q_table_two

    return (
        q_traj_one,
        q_traj_two,
        rewards_seq,
        action_seq,
    )