In [1]:
import numpy as np
import random
import copy
from tqdm import tqdm

class Node:
	def __init__(self, num_actions, env):
		self.regret_sum = np.zeros(num_actions)
		self.strategy = np.zeros(num_actions)
		self.strategy_sum = np.zeros(num_actions)
		self.num_actions = num_actions
		self.number = 1

		# game information
		self.env = env
	
	def get_strategy(self):
		normalizing_sum = 0
		prob = np.random.dirichlet(alpha=[1] * self.num_actions)
		for a in range(self.num_actions):
			if self.regret_sum[a] > 0:
				self.strategy[a] = self.regret_sum[a]
			else:
				self.strategy[a] = 0
			normalizing_sum += self.strategy[a]

		for a in range(self.num_actions):
			if normalizing_sum > 0:
				self.strategy[a] /= normalizing_sum
			else:
				self.strategy[a] = prob[a]

		return self.strategy

	def get_average_strategy(self):
		avg_strategy = np.zeros(self.num_actions)
		normalizing_sum = 0
		for action in range(self.num_actions):
			normalizing_sum += self.strategy_sum[action]
		for action in range(self.num_actions):
			if normalizing_sum > 0:
				avg_strategy[action] = self.strategy_sum[action] / normalizing_sum
			else:
				avg_strategy[action] = 1.0 / self.num_actions
		
		return avg_strategy



In [None]:
import sys
sys.path.append("..")
from GameModel.TempleOfHorror import TempleOfHorror


class TempleCFR():
	def __init__(self, iterations, nodes):
		self.iterations = iterations
		self.nodes = nodes
		self.env = TempleOfHorror(3)
		self.env_aux = self.env
		self.acting_player = random.randint(0,self.env.N-1)


	def cfr_iterations_external(self):
		utility = np.zeros(self.env.N)
		for t in tqdm(range(1, self.iterations + 1)):
			observation = self.env.reset() 
			for player in range(self.env.N): # Players
				infoSet = self.env.create_state(player, observation)

				#random.shuffle(self.cards)
				utility[player] += self.external_cfr(str(infoSet),  player,  self.acting_player, t)
				#print(player, utility[player])

		print('Average game value: {}'.format(utility[0]/(self.iterations)))
		#for i in sorted(self.nodes):
			#print(i, self.nodes[i].get_average_strategy())
				
	  

	def external_cfr(self, infoSet, learning_player, acting_player, t):
		#print('THIS IS ITERATION', t)
		#print("agent playing", acting_player)


		#infoset = str(cards[acting_player]) + str(history) # infoset are card acting player can see and history
		if infoSet not in self.nodes:
			self.nodes[infoSet] = Node(len(self.env.action_spaces[f"agent_{acting_player}"]), copy.deepcopy(self.env_aux))
		else:
			self.nodes[infoSet].number += 1	



		done, winner = self.nodes[infoSet].env.referee()

		# History is in a terminal state then calculate payments
		if done:			
			if self.nodes[infoSet].env.enc_player_role[f"agent_{learning_player}"] == winner:
				return 100
			else:
				return -100

		#print("state",  self.nodes[infoSet].env.observation_spaces)		
		#print("state",  self.nodes[infoSet].env.score)	

		# Here is where self play is done
		if acting_player == learning_player:
			action_space_length = len(self.nodes[infoSet].env.action_spaces[f"agent_{acting_player}"])
			utility = np.zeros(action_space_length) 
			node_utility = 0
			strategy = self.nodes[infoSet].get_strategy()

			for index, action in enumerate(self.nodes[infoSet].env.action_spaces[f"agent_{acting_player}"]):
				next_acting_player = action

				done, next_observation_spaces, _ = self.nodes[infoSet].env.step(action)
				nextInfoSet = self.nodes[infoSet].env.create_state(next_acting_player, next_observation_spaces)

				self.env_aux = copy.deepcopy(self.nodes[infoSet].env)
				utility[index] = self.external_cfr(str(nextInfoSet), learning_player, next_acting_player,t)
				
				node_utility += strategy[index] * utility[index]

			for action in range(action_space_length):
				regret = utility[action] - node_utility
				self.nodes[infoSet].regret_sum[action] = regret
				#print(regret)

			return node_utility

		else: #acting_player != learning_player sample strategy
			action_space_length = len(self.nodes[infoSet].env.action_spaces[f"agent_{acting_player}"])
			strategy = self.nodes[infoSet].get_strategy()
			utility = 0
			action = np.random.choice(self.nodes[infoSet].env.action_spaces[f"agent_{acting_player}"], p=strategy)

			next_acting_player = action
			#print("other")
			done, next_observation_spaces, _ = self.nodes[infoSet].env.step(action)
			nextInfoSet = self.nodes[infoSet].env.create_state(next_acting_player, next_observation_spaces)
			
			self.env_aux = copy.deepcopy(self.nodes[infoSet].env)
			utility = self.external_cfr(str(nextInfoSet), learning_player, next_acting_player,t)

		for index_action in range(action_space_length):
			self.nodes[infoSet].strategy_sum[index_action] += strategy[index_action]

			return utility




In [4]:
if __name__ == "__main__":
	k = TempleCFR(1000, {})
	k.cfr_iterations_external()

100%|██████████| 1000/1000 [00:01<00:00, 593.08it/s]

Average game value: 48.905885545657554



