Commit
Fix bugs and add comments in discounted_cfr
ai-gamer committed Oct 14, 2019
1 parent b1e3c89 commit fcc3f7e
Showing 1 changed file with 85 additions and 38 deletions.
open_spiel/python/algorithms/discounted_cfr.py
@@ -1,23 +1,49 @@
"""Discounted CFR and Linear CFR algorithms.
This implements Discounted CFR and Linear CFR, from Noam Brown and Tuomas
Sandholm, 2019, "Solving Imperfect-Information Games via Discounted Regret
Minimization".
See https://arxiv.org/abs/1809.04040.
Linear CFR (LCFR), is identical to CFR, except on iteration `t` the updates to
the regrets and average strategies are given weight `t`.
Discounted CFR(alpha, beta, gamma) is defined by, at iteration `t`:
- multiplying the positive accumulated regrets by 1(t^alpha / (t^aplha + 1))
- multiplying the negative accumulated regrets by 1(t^beta / (t^beta + 1))
- multiplying the contribution to the average strategt by (t / (t + 1))^gamma
WARNING: This was contributed on Github, and the OpenSpiel team is not aware it
has been verified we can reproduce the paper results.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-from open_spiel.python.algorithms import cfr

import numpy as np

-class _DCFRSolver(cfr._CFRSolver):
-  def __init__(self, game, initialize_cumulative_values, alternating_updates,
-               linear_averaging, regret_matching_plus, alpha, beta, gamma):
-    super(_DCFRSolver,self).__init__(game, initialize_cumulative_values, alternating_updates,linear_averaging, regret_matching_plus)
+from open_spiel.python.algorithms import cfr
+
+
+class _DCFRSolver(cfr._CFRSolver):  # pylint: disable=protected-access
+  """Discounted CFR."""
+
+  def __init__(self, game, alternating_updates, linear_averaging,
+               regret_matching_plus, alpha, beta, gamma):
+    super(_DCFRSolver, self).__init__(game, alternating_updates,
+                                      linear_averaging, regret_matching_plus)
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma

-  def _compute_counterfactual_regret_for_player(self, state,
+  def _compute_counterfactual_regret_for_player(self, state, policies,
                                                reach_probabilities, player):
    """Increments the cumulative regrets and policy for `player`.

    Args:
      state: The initial game state to analyze from.
+      policies: Unused. To be compatible with the `_CFRSolver` signature.
      reach_probabilities: The probability for each player of reaching `state`
        as a numpy array [prob for player 0, for player 1,..., for chance].
        `player_reach_probabilities[player]` will work in all cases.
@@ -39,12 +65,11 @@ def _compute_counterfactual_regret_for_player(self, state,
        new_reach_probabilities = reach_probabilities.copy()
        new_reach_probabilities[-1] *= action_prob
        state_value += action_prob * self._compute_counterfactual_regret_for_player(
-            new_state, new_reach_probabilities, player)
+            new_state, policies, new_reach_probabilities, player)
      return state_value

    current_player = state.current_player()
    info_state = state.information_state(current_player)
-    legal_actions = state.legal_actions(current_player)

    # No need to continue on this history branch as no update will be performed
    # for any player.
@@ -62,14 +87,21 @@ def _compute_counterfactual_regret_for_player(self, state,
    # state. Therefore, the utilities are cached.
    children_utilities = {}

-    info_state_policy = self._compute_policy_or_get_it_from_cache(
-        info_state, legal_actions)
-    for action, action_prob in info_state_policy.items():
+    info_state_node = self._info_state_nodes[info_state]
+    if policies is None:
+      info_state_policy = self._get_infostate_policy(info_state)
+    else:
+      info_state_policy = policies[current_player](info_state)
+    for action in state.legal_actions():
+      action_prob = info_state_policy.get(action, 0.)
      new_state = state.child(action)
      new_reach_probabilities = reach_probabilities.copy()
      new_reach_probabilities[current_player] *= action_prob
      child_utility = self._compute_counterfactual_regret_for_player(
-          new_state, reach_probabilities=new_reach_probabilities, player=player)
+          new_state,
+          policies=policies,
+          reach_probabilities=new_reach_probabilities,
+          player=player)

      state_value += action_prob * child_utility
      children_utilities[action] = child_utility
@@ -84,54 +116,69 @@ def _compute_counterfactual_regret_for_player(self, state,

    reach_prob = reach_probabilities[current_player]
    counterfactual_reach_prob = (
        np.prod(reach_probabilities[:current_player]) *
        np.prod(reach_probabilities[current_player + 1:]))
    state_value_for_player = state_value[current_player]

    for action, action_prob in info_state_policy.items():
      cfr_regret = counterfactual_reach_prob * (
          children_utilities[action][current_player] - state_value_for_player)

-      info_state_node = self._info_state_nodes[info_state]
      info_state_node.cumulative_regret[action] += cfr_regret
-      # Multiplying accumulated positive and negative regrets by different
-      # discounts (alpha and beta).
-      if info_state_node.cumulative_regret[action] >= 0:
-        info_state_node.cumulative_regret[action] *= (
-            self._iteration**self.alpha / (self._iteration**self.alpha + 1))
-      else:
-        info_state_node.cumulative_regret[action] *= (
-            self._iteration**self.beta / (self._iteration**self.beta + 1))

      if self._linear_averaging:
-        info_state_node.cumulative_policy[
-            action] += reach_prob * action_prob
-        info_state_node.cumulative_policy[
-            action] *= ((self._iteration / (self._iteration + 1))**self.gamma)
+        # Applying different weights of contribution to the average strategy:
+        # the iteration-t contribution is weighted by t^gamma.
+        info_state_node.cumulative_policy[action] += (
+            reach_prob * action_prob * self._iteration**self.gamma)
      else:
        info_state_node.cumulative_policy[action] += reach_prob * action_prob

    return state_value


+  def evaluate_and_update_policy(self):
+    """Performs a single step of policy evaluation and policy improvement."""
+    self._iteration += 1
+    if self._alternating_updates:
+      for player in range(self._game.num_players()):
+        self._compute_counterfactual_regret_for_player(
+            self._root_node,
+            policies=None,
+            reach_probabilities=np.ones(self._game.num_players() + 1),
+            player=player)
+        for info_key, info_state in self._info_state_nodes.items():
+          # Index 16 of the info-set key holds the current player, so this
+          # restricts the discounting to the current player's info states.
+          if int(info_key[16]) == player:
+            for action in info_state.cumulative_regret.keys():
+              if info_state.cumulative_regret[action] >= 0:
+                info_state.cumulative_regret[action] *= (
+                    self._iteration**self.alpha /
+                    (self._iteration**self.alpha + 1))
+              else:
+                info_state.cumulative_regret[action] *= (
+                    self._iteration**self.beta /
+                    (self._iteration**self.beta + 1))
+        cfr._update_current_policy(self._current_policy, self._info_state_nodes)
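Why discounting once per iteration is the right fix: multiplying the
accumulated sum by t / (t + 1) after iteration `t` telescopes, leaving the
iteration-i contribution with weight i / (T + 1) after T iterations, i.e. it
is LCFR's linear weighting up to a constant factor. A standalone numeric
sketch of that identity (illustrative only, not part of the commit):

contributions = [5.0, 2.0, 7.0, 1.0]  # made-up per-iteration regret updates

discounted = 0.0
for t, r in enumerate(contributions, start=1):
  discounted += r
  discounted *= t / (t + 1)  # t**alpha / (t**alpha + 1) with alpha = 1

linear = sum(t * r for t, r in enumerate(contributions, start=1))
num_iterations = len(contributions)
assert abs(discounted - linear / (num_iterations + 1)) < 1e-12

Applying the same multiplier at every info-set visit, rather than once per
iteration, compounds it many times per iteration; that is the behavior this
commit moves away from.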



class DCFRSolver(_DCFRSolver):

-  def __init__(self, game, alpha=3/2, beta=0, gamma=2):
+  def __init__(self, game, alpha=3 / 2, beta=0, gamma=2):
    super(DCFRSolver, self).__init__(
        game,
-        initialize_cumulative_values=True,
        regret_matching_plus=False,
        alternating_updates=True,
        linear_averaging=True,
        alpha=alpha,
        beta=beta,
        gamma=gamma)


class LCFRSolver(_DCFRSolver):

  def __init__(self, game):
    super(LCFRSolver, self).__init__(
        game,
-        initialize_cumulative_values=True,
        regret_matching_plus=False,
        alternating_updates=True,
        linear_averaging=True,
        alpha=1,
        beta=1,
        gamma=1)
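For context, a minimal usage sketch (assuming the standard OpenSpiel entry
points; `kuhn_poker` is only an illustrative choice, and note that the
`info_key[16]` indexing above ties this version to games whose
information-state strings have that exact layout):

import pyspiel

from open_spiel.python.algorithms import discounted_cfr
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
solver = discounted_cfr.DCFRSolver(game)
for _ in range(200):
  solver.evaluate_and_update_policy()

# Exploitability of the average policy should decrease as iterations grow.
print(exploitability.exploitability(game, solver.average_policy()))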
