In [None]:
%load_ext autoreload
%autoreload 2

from typing import Dict, List, Tuple, Union
from gym import Space, spaces
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from prisoners_dilemma import utils
from prisoners_dilemma.env import PrisonersDilemmaEnv

sns.set('notebook', font_scale=1.1, rc={'figure.figsize': (7, 4)})
sns.set_style('ticks', rc={'figure.facecolor': 'none', 'axes.facecolor': 'none'})
%config InlineBackend.figure_format = 'svg'
matplotlib.rcParams['figure.facecolor'] = 'white'

### Run experiments [standard]

In [None]:
# Settings for the standard experiment: one simulation of 50 episodes with
# two agents choosing between two actions.
# NOTE(review): 'alpha', 'eps' and 'gamma' each hold one value per agent;
# presumably learning rate, exploration rate and discount factor -- confirm
# against prisoners_dilemma.utils.
config = {
    'num_simuls': 1,
    'num_episodes': 50,
    'num_agents': 2,
    'num_actions': 2,
    'verbose': False,
    'init_type': 'zeros',
    # Payoff values are ordered tempta (3) > reward (2) > punish (1) > sucker (0).
    'payoffs': {
        'reward_payoff': 2,
        'tempta_payoff': 3,
        'sucker_payoff': 0,
        'punish_payoff': 1,
    },
    'params': {
        'alpha': np.array([0.1, 0.1]),
        'eps': np.array([0.35, 0.35]),
        'gamma': np.array([0.5, 0.5]),
    },
}

# Run the experiment; it returns the Q-value trajectories of both agents plus
# the reward and action sequences (rewards_seq is not used further in this cell).
(
    q_traj_one,
    q_traj_two,
    rewards_seq,
    action_seq,
) = utils.run_standard_ipd_exp(config)

# Plot the Q-value trajectories together with the chosen actions.
utils.make_q_vals_fig_standard(
    config=config,
    q_traj_one=q_traj_one,
    q_traj_two=q_traj_two,
    action_seq=action_seq,
)

### Run experiments [with observations]

In [None]:
# Settings for the experiment with observations: 10 simulations of 1000
# episodes each. This rebinds `config`, replacing the one used above.
# NOTE(review): 'alpha', 'eps' and 'gamma' each hold one value per agent;
# presumably learning rate, exploration rate and discount factor -- confirm
# against prisoners_dilemma.utils.
config = {
    'num_simuls': 10,
    'num_episodes': 1000,
    'num_agents': 2,
    'num_actions': 2,
    'verbose': False,
    'init_type': 'zeros',
    'payoffs': {
        'reward_payoff': 2,
        'tempta_payoff': 3,
        'sucker_payoff': 0,
        'punish_payoff': 1,
    },
    'params': {
        'alpha': np.array([0.1, 0.1]),
        'eps': np.array([0.1, 0.1]),
        'gamma': np.array([0.5, 0.5]),
    },
}

# Pre-allocate the result arrays. Every array is indexed by
# (simulation, episode, ...); only the trailing axes differ.
leading_shape = (config["num_simuls"], config["num_episodes"])
q_traj_one = np.zeros(leading_shape + (config["num_actions"],))
# Agent two's Q-values carry two action-sized trailing axes.
q_traj_two = np.zeros(leading_shape + (config["num_actions"], config["num_actions"]))
rewards_seq = np.zeros(leading_shape + (config["num_agents"],))
action_seq = np.zeros(leading_shape + (config["num_agents"],), dtype=int)

# Run the simulations one at a time, writing each run's four results into
# the matching slice along the simulation axis.
for simul_i in range(config['num_simuls']):
    (
        q_traj_one[simul_i],
        q_traj_two[simul_i],
        rewards_seq[simul_i],
        action_seq[simul_i],
    ) = utils.run_extended_ipd_exp(config)

In [None]:
# Long-format index columns shared by both players' frames. The flattened
# (simulation, episode) slices below are simulation-major, so simulation ids
# are repeated per episode and episode ids are tiled per simulation.
sim_ids = np.repeat(np.arange(config['num_simuls']), config['num_episodes'])
episode_ids = np.tile(np.arange(config['num_episodes']), config['num_simuls'])
index_columns = {'Simulation': sim_ids, 'Episode': episode_ids}

# NOTE(review): the column names assume index 0 / 1 on the action axes mean
# D / C, and that q_traj_two's last two axes are (own action, conditioning
# action) -- confirm against utils.run_extended_ipd_exp.

# Player 1: one Q-value per action, plus that player's reward.
df_play_one = pd.DataFrame({
    **index_columns,
    'Q_D': q_traj_one[:, :, 0].flatten(),
    'Q_C': q_traj_one[:, :, 1].flatten(),
    'Rewards': rewards_seq[:, :, 0].flatten(),
})

# Player 2: one Q-value per pair of indices on the two trailing axes, plus
# that player's reward.
df_play_two = pd.DataFrame({
    **index_columns,
    'Q_D_cond_D': q_traj_two[:, :, 0, 0].flatten(),
    'Q_D_cond_C': q_traj_two[:, :, 0, 1].flatten(),
    'Q_C_cond_D': q_traj_two[:, :, 1, 0].flatten(),
    'Q_C_cond_C': q_traj_two[:, :, 1, 1].flatten(),
    'Rewards': rewards_seq[:, :, 1].flatten(),
})

In [None]:
# Re-apply the plotting theme with a wider 10x5 default figure size; the
# final rcParams write makes the figure background white again.
sns.set(
    context='notebook',
    font_scale=1.1,
    rc={'figure.figsize': (10, 5)},
)
sns.set_style(
    'ticks',
    rc={'figure.facecolor': 'none', 'axes.facecolor': 'none'},
)
matplotlib.rcParams.update({'figure.facecolor': 'white'})

# Two stacked panels (one per agent) sharing both axes.
fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True)
fig.suptitle(f'Q-values over s={config["num_simuls"]} simulations', y=1.05)

# One entry per panel: (axes, data frame, title, y-label, curves), where each
# curve is a (column, legend label) pair drawn in the listed order.
panels = (
    (
        axs[0],
        df_play_one,
        'Agent 1 (does not have observations)',
        'Q-values player 1',
        (
            ('Q_D', r'$Q^{1}_{D}$'),
            ('Q_C', r'$Q^{1}_C$'),
        ),
    ),
    (
        axs[1],
        df_play_two,
        'Agent 2 (actions conditioned on agent 1s action)',
        'Q-values player 2',
        (
            ('Q_D_cond_D', r'$Q^{2}_{D | D}$'),
            ('Q_D_cond_C', r'$Q^{2}_{D | C}$'),
            ('Q_C_cond_D', r'$Q^{2}_{C | D}$'),
            ('Q_C_cond_C', r'$Q^{2}_{C | C}$'),
        ),
    ),
)

for ax, frame, title, ylabel, curves in panels:
    # Each curve is drawn against the episode index with errorbar='sd'.
    for column, legend_label in curves:
        sns.lineplot(data=frame, x='Episode', y=column, errorbar='sd', label=legend_label, ax=ax)
    ax.legend(bbox_to_anchor=(1, 1))
    ax.set_title(title)
    ax.set_ylabel(ylabel)

sns.despine()

In [None]:
# Rewards of both players per episode, drawn on one explicitly created axes
# so the cell never depends on whichever axes pyplot currently treats as
# "current". No estimator/errorbar is passed, so seaborn's defaults decide
# how the per-simulation rows at each episode are aggregated.
fig, ax = plt.subplots()
sns.lineplot(data=df_play_one, x='Episode', y='Rewards', label='Play 1', ax=ax)
sns.lineplot(data=df_play_two, x='Episode', y='Rewards', label='Play 2', ax=ax)
# Title worded like the Q-value figure so the plot stands alone when skimmed.
ax.set_title(f'Rewards over s={config["num_simuls"]} simulations')
ax.legend(bbox_to_anchor=(1, 1))
sns.despine()