### Develop notebook


In [96]:
%load_ext autoreload
%autoreload 2

from typing import Dict, List, Tuple, Union
from gym import Space, spaces
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from prisoners_dilemma import algorithms
from prisoners_dilemma import utils
from prisoners_dilemma.env import PrisonersDilemmaEnv

sns.set('notebook', font_scale=1.1, rc={'figure.figsize': (7, 4)})
sns.set_style('ticks', rc={'figure.facecolor': 'none', 'axes.facecolor': 'none'})
matplotlib.rcParams['figure.facecolor'] = 'white'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [101]:
# Setup config
config = {
    'num_simuls': 10,
    'num_episodes': 1000,
    'num_agents': 2,
    'num_actions': 2,
    'verbose': False,
    'init_type':'zeros',
}

config['payoffs'] = {
    'reward_payoff': 2,
    'tempta_payoff': 3,
    'sucker_payoff': 0,
    'punish_payoff': 1,
}

config['params'] = {
    'alpha': np.array([0.1, 0.1]),
    'eps': np.array([0.15, 0.15]),
    'gamma': np.array([0.5, 0.5]),
}

In [102]:
# Create environment
game_env = PrisonersDilemmaEnv(
    config["payoffs"]["reward_payoff"],
    config["payoffs"]["tempta_payoff"],
    config["payoffs"]["sucker_payoff"],
    config["payoffs"]["punish_payoff"],
)

# Create agents 
# Agent one doesn't get to see the state
agent_one = algorithms.QlearningAgent(
    env=game_env,
    eps=config['params']['eps'][0],
    alpha=config['params']['alpha'][0],
    gamma=config['params']['gamma'][0],
    state=False,
    init_method='zeros',
    verbose=True,
)

# Agent two gets to see the action of the other
agent_two = algorithms.QlearningAgent(
    env=game_env,
    eps=config['params']['eps'][1],
    alpha=config['params']['alpha'][1],
    gamma=config['params']['gamma'][1],
    state=True,
    init_method='zeros',
    verbose=True,
)

# Run for a number of episodes
for episode_i in range(config['num_episodes']):
    
    # Get actions
    act_one = agent_one.get_action(obs=None)
    act_two = agent_two.get_action(obs=act_one)
    actions = np.concatenate([act_one, act_two], dtype=int)

    # Take a step
    _, rewards, _, _, _ = game_env.step(action=actions)

    # Learn: update Q-values
    agent_one.learn(obs=None, action=act_one, reward=rewards[0])
    agent_two.learn(obs=act_one, action=act_two, reward=rewards[1])

action taken: [1]
action taken: [1]
Q-table: 
 [0.  0.2]
Q-table: 
 [[0.  0. ]
 [0.  0.2]]
action taken: [0]
action taken: [1]
Q-table: 
 [0.31 0.2 ]
Q-table: 
 [[0.   0.  ]
 [0.01 0.2 ]]
action taken: [0]
action taken: [0]
Q-table: 
 [0.3945 0.2   ]
Q-table: 
 [[0.11 0.  ]
 [0.01 0.2 ]]
action taken: [0]
action taken: [0]
Q-table: 
 [0.474775 0.2     ]
Q-table: 
 [[0.209 0.   ]
 [0.01  0.2  ]]
action taken: [0]
action taken: [0]
Q-table: 
 [0.55103625 0.2       ]
Q-table: 
 [[0.29855 0.     ]
 [0.01    0.2    ]]
action taken: [0]
action taken: [0]
Q-table: 
 [0.62348444 0.2       ]
Q-table: 
 [[0.3836225 0.       ]
 [0.01      0.2      ]]
action taken: [0]
action taken: [0]
Q-table: 
 [0.69231022 0.2       ]
Q-table: 
 [[0.46444138 0.        ]
 [0.01       0.2       ]]
action taken: [0]
action taken: [0]
Q-table: 
 [0.7576947 0.2      ]
Q-table: 
 [[0.54121931 0.        ]
 [0.01       0.2       ]]
action taken: [0]
action taken: [0]
Q-table: 
 [0.81980997 0.2       ]
Q-table: 
 [[0.61

In [90]:
agent_one.total_reward, agent_two.total_reward

(1067, 1070)

In [91]:
agent_one.q_table.T, agent_two.q_table.T

(array([2.21189304, 1.12457577]),
 array([[3.90381329, 2.81633572],
        [5.80986836, 0.89751417]]))

In [92]:
# Defect, Cooperate
agent_one.q_table

array([2.21189304, 1.12457577])

In [94]:
agent_two.q_table[:, 0]

array([3.90381329, 2.81633572])

In [95]:
agent_two.q_table[:, 1]

array([5.80986836, 0.89751417])