### Development notebook: Q-learning agents in the Prisoner's Dilemma


In [1]:
%load_ext autoreload
%autoreload 2

from typing import Dict, List, Tuple, Union
from gym import Space, spaces
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from prisoners_dilemma import utils
from prisoners_dilemma.env import PrisonersDilemmaEnv

# Global plot styling for this notebook: seaborn "notebook" context with
# slightly enlarged fonts and a 7x4 default figure size.
sns.set(
    context='notebook',
    font_scale=1.1,
    rc={'figure.figsize': (7, 4)},
)

# "ticks" style with transparent axes; the figure facecolor requested here is
# replaced by the assignment that follows.
sns.set_style(
    'ticks',
    rc={
        'figure.facecolor': 'none',
        'axes.facecolor': 'none',
    },
)

# Final word on the figure background: plain white.
matplotlib.rcParams['figure.facecolor'] = 'white'

In [22]:
# Experiment configuration, gathered in one nested literal.
#   * top-level scalars: run sizes and switches
#   * 'payoffs': the four values handed to PrisonersDilemmaEnv
#   * 'params': learning hyper-parameters, one array entry per agent
#     (index 0 -> first agent, index 1 -> second agent)
config = {
    'num_simuls': 10,
    'num_episodes': 50,
    'num_agents': 2,
    'num_actions': 2,
    'verbose': False,
    'init_type': 'zeros',
    'payoffs': {
        'reward_payoff': 2,
        'tempta_payoff': 3,
        'sucker_payoff': 0,
        'punish_payoff': 1,
    },
    'params': {
        'alpha': np.array([0.1, 0.1]),  # step size
        'eps': np.array([0.1, 0.1]),    # exploration probability
        'gamma': np.array([0.5, 0.5]),  # weight on the max Q-value in the update
    },
}

In [23]:
class QlearningAgent:
    """Epsilon-greedy Q-learning agent with a stateless (1-D) Q-table.

    The Q-table holds one value per action (``env.action_space.n`` entries)
    and is not indexed by state. State-dependent learning is not implemented;
    ``learn`` raises ``NotImplementedError`` if a state is supplied.

    Attributes:
        env: Environment object; only ``env.action_space.n`` and
            ``env.action_space.sample()`` are used by the agent.
        eps: Probability of taking a random (exploratory) action.
        alpha: Step size of the Q-value update.
        gamma: Weight applied to the maximum Q-value in the update target.
        state: Stored as given; not used by any method of this class.
        q_table: 1-D array of Q-values, one per action.
        total_reward: Running sum of all rewards passed to ``learn``.
    """

    def __init__(
        self,
        env: object,
        eps: float,
        alpha: float,
        gamma: float,
        state: object = None,
        init_method: str = "zeros",
    ):
        """Store the hyper-parameters and initialise the Q-table.

        Args:
            env: Environment exposing ``action_space.n`` and
                ``action_space.sample()``.
            eps: Exploration probability.
            alpha: Step size.
            gamma: Weight on the maximum Q-value in the update target.
            state: Optional initial state; only stored.
            init_method: Q-table initialisation scheme (only ``"zeros"``).

        Raises:
            NotImplementedError: If ``init_method`` is not supported.
        """
        self.env = env
        self.eps = self._get_eps(eps)
        self.alpha = self._get_step_size(alpha)
        self.gamma = gamma
        self.state = state
        self.q_table = self.init_q_table(init_method, table_size=env.action_space.n)
        self.total_reward = 0

    def init_q_table(self, init_method: str, table_size: int = 2) -> np.ndarray:
        """Create the initial Q-table.

        Args:
            init_method: Initialisation scheme; only ``"zeros"`` is supported.
            table_size: Number of entries (one per action).

        Returns:
            A 1-D array of ``table_size`` zeros.

        Raises:
            NotImplementedError: If ``init_method`` is not ``"zeros"``.
        """
        if init_method == "zeros":
            return np.zeros(table_size)
        raise NotImplementedError("Initialization method not known.")

    def get_action(self, state: object = None) -> np.ndarray:
        """Choose an action epsilon-greedily.

        With probability ``eps`` the action is sampled from
        ``env.action_space``; otherwise one of the actions with the highest
        Q-value is picked uniformly at random (random tie-breaking).

        Args:
            state: Accepted for interface symmetry with ``learn`` but ignored,
                because the Q-table is not indexed by state.

        Returns:
            A one-element array containing the chosen action.
        """
        if np.random.random() < self._get_eps(self.eps):
            # Explore: wrap the sampled action so both branches return an array.
            return np.array([self.env.action_space.sample()])

        # Exploit: indices of every action tied for the maximum Q-value.
        greedy_actions = np.argwhere(self.q_table == self.q_table.max()).flatten()
        return np.random.choice(a=greedy_actions, size=(1,))

    def learn(self, action: int, reward: int, state: object = None) -> None:
        """Accumulate the reward and update the Q-value of ``action``.

        The update is
        ``Q[a] <- Q[a] + alpha * (reward + gamma * max(Q) - Q[a])``,
        where ``max(Q)`` is taken before the update.

        Args:
            action: Action chosen by the agent (an int or the one-element
                array returned by ``get_action``).
            reward: Reward obtained for that action.
            state: Must be ``None``; state-indexed updates are not implemented.

        Raises:
            NotImplementedError: If ``state`` is not ``None``. Raised before
                any attribute is modified, so a rejected call leaves the
                agent unchanged.
        """
        if state is not None:
            # TODO: index q-table by the state and action.
            # Fail loudly instead of silently skipping the update.
            raise NotImplementedError(
                "State-dependent Q-learning is not implemented."
            )

        self.total_reward += reward

        current_value = self.q_table[action]
        target = reward + self.gamma * np.max(self.q_table)
        self.q_table[action] = current_value + self.alpha * (target - current_value)

    def _get_eps(self, eps: float) -> float:
        """Get the exploration prob for an episode (currently returned unchanged)."""
        return eps

    def _get_step_size(self, alpha: float) -> float:
        """Get the step size (currently returned unchanged)."""
        return alpha

In [24]:
# Create environment from the configured payoffs (passed positionally).
game_env = PrisonersDilemmaEnv(
    config["payoffs"]["reward_payoff"],
    config["payoffs"]["tempta_payoff"],
    config["payoffs"]["sucker_payoff"],
    config["payoffs"]["punish_payoff"],
)

# Create agents: one per configured player, each reading its own entry of the
# per-agent hyper-parameter arrays. `init_type` is forwarded so the configured
# Q-table initialisation is used rather than the constructor default.
agents = [
    QlearningAgent(
        env=game_env,
        eps=config['params']['eps'][agent_i],
        alpha=config['params']['alpha'][agent_i],
        gamma=config['params']['gamma'][agent_i],
        state=None,
        init_method=config['init_type'],
    )
    for agent_i in range(config['num_agents'])
]

# The following cells inspect the two players by name.
agent_one, agent_two = agents

# Run for a number of episodes
for episode_i in range(config['num_episodes']):

    # Get actions: each agent returns a one-element array, so concatenating
    # them yields the joint action vector handed to the environment.
    act_one, act_two = [agent.get_action() for agent in agents]
    actions = np.concatenate([act_one, act_two])

    # Take a step; only the rewards (second element of the 5-tuple) are used.
    _, rewards, _, _, _ = game_env.step(action=actions)

    # Learn: update Q-values, pairing each agent with its own action and reward.
    for agent_i, (agent, act) in enumerate(zip(agents, (act_one, act_two))):
        agent.learn(state=None, action=act, reward=rewards[agent_i])

In [25]:
# Cumulative reward collected by each agent over the run (agent one first).
tuple(agent.total_reward for agent in (agent_one, agent_two))

(57, 51)

In [26]:
# Learned Q-values of each agent (agent one first), one entry per action.
tuple(agent.q_table.T for agent in (agent_one, agent_two))

(array([1.9659001 , 0.17175745]), array([1.96456682, 0.15487552]))