In [None]:
from absl import logging
import tensorflow.compat.v1 as tf

from open_spiel.python import policy
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import exploitability
from open_spiel.python.algorithms import nfsp
from open_spiel.python.pytorch import nfsp as nfsp_pt

class NFSPPolicies(policy.Policy):
  """Joint policy over a set of NFSP agents, used for evaluation.

  Holds one NFSP agent per player and answers `action_probabilities` queries
  by stepping the acting player's agent in a fixed mode.
  """

  def __init__(self, env, nfsp_policies, mode):
    """Initializes the joint policy.

    Args:
      env: environment exposing the underlying game as `env.game`.
      nfsp_policies: sequence of NFSP agents, indexed by player id.
      mode: agent mode activated (through `temp_mode_as`) while an agent is
        queried for its action probabilities.
    """
    game = env.game
    # Size everything from the agents supplied instead of assuming exactly
    # two players; for two agents this is identical to the previous [0, 1].
    num_players = len(nfsp_policies)
    player_ids = list(range(num_players))
    super(NFSPPolicies, self).__init__(game, player_ids)
    self._policies = nfsp_policies
    self._mode = mode
    # Observation buffer reused across calls; only the acting player's slots
    # are overwritten on each query.
    self._obs = {
        "info_state": [None] * num_players,
        "legal_actions": [None] * num_players,
    }

  def action_probabilities(self, state, player_id=None):
    """Returns a dict mapping each legal action to its probability.

    Args:
      state: game state; the acting player is taken from
        `state.current_player()`.
      player_id: accepted but not used by this implementation.

    Returns:
      `{action: probability}` for the legal actions of the current player, as
      reported by that player's agent stepped in evaluation mode.
    """
    cur_player = state.current_player()
    legal_actions = state.legal_actions(cur_player)

    self._obs["current_player"] = cur_player
    self._obs["info_state"][cur_player] = (
        state.information_state_tensor(cur_player))
    self._obs["legal_actions"][cur_player] = legal_actions

    # Only the observations are populated; the remaining TimeStep fields are
    # passed as None.
    info_state = rl_environment.TimeStep(
        observations=self._obs, rewards=None, discounts=None, step_type=None)

    # Query the agent under the configured mode for the duration of the step.
    with self._policies[cur_player].temp_mode_as(self._mode):
      p = self._policies[cur_player].step(info_state, is_evaluation=True).probs
    prob_dict = {action: p[action] for action in legal_actions}
    return prob_dict


def tf_main(game,
            env_config,
            num_train_episodes,
            eval_every,
            hidden_layers_sizes,
            replay_buffer_capacity,
            reservoir_buffer_capacity,
            anticipatory_param):
  """Trains TensorFlow NFSP agents in self-play and records exploitability.

  Args:
    game: game name passed to `rl_environment.Environment`.
    env_config: dict of keyword arguments forwarded to the environment
      constructor (e.g. `{"players": 2}`).
    num_train_episodes: number of self-play episodes to run; also passed to
      the agents as `epsilon_decay_duration`.
    eval_every: an evaluation is run whenever `(ep + 1) % eval_every == 0`.
    hidden_layers_sizes: iterable of layer sizes; each entry is cast to int.
    replay_buffer_capacity: passed to the agents as `replay_buffer_capacity`.
    reservoir_buffer_capacity: passed positionally to `nfsp.NFSP`.
    anticipatory_param: passed positionally to `nfsp.NFSP`.

  Returns:
    List of exploitability values of the average policy, one per evaluation,
    in evaluation order.
  """
  # Use the argument rather than a notebook-level global so the function does
  # not depend on hidden kernel state.
  env = rl_environment.Environment(game, **env_config)
  num_players = env.num_players
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  hidden_layers_sizes = [int(size) for size in hidden_layers_sizes]
  kwargs = {
      "replay_buffer_capacity": replay_buffer_capacity,
      "epsilon_decay_duration": num_train_episodes,
      "epsilon_start": 0.06,
      "epsilon_end": 0.001,
  }
  expl_list = []
  with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  reservoir_buffer_capacity, anticipatory_param,
                  **kwargs) for idx in range(num_players)
    ]
    expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    sess.run(tf.global_variables_initializer())
    for ep in range(num_train_episodes):
      # Periodic evaluation of the joint average policy.
      if (ep + 1) % eval_every == 0:
        losses = [agent.loss for agent in agents]
        print("Losses: %s" %losses)
        expl = exploitability.exploitability(env.game, expl_policies_avg)
        expl_list.append(expl)
        print("[%s] Exploitability AVG %s" %(ep + 1, expl))
        print("_____________________________________________")

      # Play one episode; only the current player acts at each step.
      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)
  return expl_list
        
def pt_main(game,
            env_config,
            num_train_episodes,
            eval_every,
            hidden_layers_sizes,
            replay_buffer_capacity,
            reservoir_buffer_capacity,
            anticipatory_param):
  """Trains PyTorch NFSP agents in self-play and records exploitability.

  Args:
    game: game name passed to `rl_environment.Environment`.
    env_config: dict of keyword arguments forwarded to the environment
      constructor (e.g. `{"players": 2}`).
    num_train_episodes: number of self-play episodes to run; also passed to
      the agents as `epsilon_decay_duration`.
    eval_every: an evaluation is run whenever `(ep + 1) % eval_every == 0`.
    hidden_layers_sizes: iterable of layer sizes; each entry is cast to int.
    replay_buffer_capacity: passed to the agents as `replay_buffer_capacity`.
    reservoir_buffer_capacity: passed positionally to `nfsp_pt.NFSP`.
    anticipatory_param: passed positionally to `nfsp_pt.NFSP`.

  Returns:
    List of exploitability values of the average policy, one per evaluation,
    in evaluation order.
  """
  # Use the argument rather than a notebook-level global so the function does
  # not depend on hidden kernel state.
  env = rl_environment.Environment(game, **env_config)
  num_players = env.num_players
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  hidden_layers_sizes = [int(size) for size in hidden_layers_sizes]
  kwargs = {
      "replay_buffer_capacity": replay_buffer_capacity,
      "epsilon_decay_duration": num_train_episodes,
      "epsilon_start": 0.06,
      "epsilon_end": 0.001,
  }
  expl_list = []
  agents = [
      nfsp_pt.NFSP(idx, info_state_size, num_actions, hidden_layers_sizes,
                   reservoir_buffer_capacity, anticipatory_param,
                   **kwargs) for idx in range(num_players)
  ]
  expl_policies_avg = NFSPPolicies(env, agents, nfsp_pt.MODE.average_policy)
  for ep in range(num_train_episodes):
    # Periodic evaluation of the joint average policy.
    if (ep + 1) % eval_every == 0:
      # NOTE(review): assumes `agent.loss` is tensor-like with `.item()` by
      # the first evaluation -- confirm against the installed nfsp_pt version.
      losses = [agent.loss.item() for agent in agents]
      print("Losses: %s" %losses)
      expl = exploitability.exploitability(env.game, expl_policies_avg)
      expl_list.append(expl)
      print("[%s] Exploitability AVG %s" %(ep + 1, expl))
      print("_____________________________________________")

    # Play one episode; only the current player acts at each step.
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      agent_output = agents[player_id].step(time_step)
      action_list = [agent_output.action]
      time_step = env.step(action_list)

    # Episode is over, step all agents with final info state.
    for agent in agents:
      agent.step(time_step)
  return expl_list

In [None]:
# Kuhn poker experiment settings, shared by the TensorFlow and PyTorch runs.
game = "kuhn_poker"
num_players = 2
env_configs = dict(players=num_players)

# Training schedule: total self-play episodes and evaluation period.
num_train_episodes = 3 * 10**6
eval_every = 10**4

# Network and buffer sizes handed to the NFSP agents.
hidden_layers_sizes = [128]
replay_buffer_capacity = 2 * 10**5
reservoir_buffer_capacity = 2 * 10**6
anticipatory_param = 0.1

In [None]:
# TensorFlow NFSP run on the Kuhn poker settings; yields the exploitability
# values collected at each evaluation.
tf_kuhn_result = tf_main(
    game=game,
    env_config=env_configs,
    num_train_episodes=num_train_episodes,
    eval_every=eval_every,
    hidden_layers_sizes=hidden_layers_sizes,
    replay_buffer_capacity=replay_buffer_capacity,
    reservoir_buffer_capacity=reservoir_buffer_capacity,
    anticipatory_param=anticipatory_param,
)

In [None]:
# PyTorch NFSP run on the Kuhn poker settings; yields the exploitability
# values collected at each evaluation.
pt_kuhn_result = pt_main(
    game=game,
    env_config=env_configs,
    num_train_episodes=num_train_episodes,
    eval_every=eval_every,
    hidden_layers_sizes=hidden_layers_sizes,
    replay_buffer_capacity=replay_buffer_capacity,
    reservoir_buffer_capacity=reservoir_buffer_capacity,
    anticipatory_param=anticipatory_param,
)

In [None]:
import matplotlib.pyplot as plt

# tf_main/pt_main append one value each time (ep + 1) % eval_every == 0, so the
# i-th entry corresponds to episode (i + 1) * eval_every. The previous fixed
# step of 1000 did not match the configured evaluation period.
# Assumes this cell runs while `eval_every` still holds the Kuhn setting.
x = [(i + 1) * eval_every for i in range(len(tf_kuhn_result))]

plt.plot(x, tf_kuhn_result, label='tensorflow')
plt.plot(x, pt_kuhn_result, label='pytorch')
plt.title('Kuhn Poker')
plt.xlabel('Episodes')
plt.ylabel('Exploitability')
plt.legend()
plt.show()

In [None]:
# Leduc poker experiment settings, shared by the TensorFlow and PyTorch runs.
game = "leduc_poker"
num_players = 2
env_configs = dict(players=num_players)

# Training schedule: total self-play episodes and evaluation period.
num_train_episodes = 3 * 10**6
eval_every = 10**5

# Network and buffer sizes handed to the NFSP agents.
hidden_layers_sizes = [128]
replay_buffer_capacity = 2 * 10**5
reservoir_buffer_capacity = 2 * 10**6
anticipatory_param = 0.1

In [None]:
# TensorFlow NFSP run on the Leduc poker settings; yields the exploitability
# values collected at each evaluation.
tf_leduc_result = tf_main(
    game=game,
    env_config=env_configs,
    num_train_episodes=num_train_episodes,
    eval_every=eval_every,
    hidden_layers_sizes=hidden_layers_sizes,
    replay_buffer_capacity=replay_buffer_capacity,
    reservoir_buffer_capacity=reservoir_buffer_capacity,
    anticipatory_param=anticipatory_param,
)

In [None]:
# PyTorch NFSP run on the Leduc poker settings; yields the exploitability
# values collected at each evaluation.
pt_leduc_result = pt_main(
    game=game,
    env_config=env_configs,
    num_train_episodes=num_train_episodes,
    eval_every=eval_every,
    hidden_layers_sizes=hidden_layers_sizes,
    replay_buffer_capacity=replay_buffer_capacity,
    reservoir_buffer_capacity=reservoir_buffer_capacity,
    anticipatory_param=anticipatory_param,
)

In [None]:
# tf_main/pt_main append one value each time (ep + 1) % eval_every == 0, so the
# i-th entry corresponds to episode (i + 1) * eval_every. The previous fixed
# step of 10000 did not match the configured evaluation period.
x = [(i + 1) * eval_every for i in range(len(tf_leduc_result))]

plt.plot(x, tf_leduc_result, label='tensorflow')
plt.plot(x, pt_leduc_result, label='pytorch')
plt.title('Leduc Poker')
plt.xlabel('Episodes')
plt.ylabel('Exploitability')
plt.legend()
plt.show()