self_play.py

from math import inf

import torch

from alpha_zero_agent import AlphaZeroAgent
from interfaces.game_engine import GameEngine
from interfaces.game_state import GameState
from interfaces.prediction_network import PredictionNetwork
from utils import CUDA, logger


class TrainingExample(object):
    """A single (state, MCTS policy, outcome) training example."""

    def __init__(self, game_state, action_probability_tensor):
        self._game_state = game_state
        self._action_probability_tensor = action_probability_tensor
        self._value = None

    def set_value(self, game_value):
        """Record the final game outcome from the perspective of the player to move."""
        assert self._value is None
        assert game_value in [0, GameState.PLAYER_ONE, GameState.PLAYER_TWO], "invalid value"
        if game_value == 0:
            # Drawn game.
            self._value = 0.0
        elif game_value == self._game_state.get_player():
            # The player to move in this state went on to win.
            self._value = 1.0
        else:
            self._value = -1.0

    def __str__(self):
        return str((self._game_state.convert_to_tensor(), self._action_probability_tensor, self._value))

    def __repr__(self):
        return str(self)

    def to_tensor_tuple(self):
        value_tensor = torch.empty(1, 1, dtype=torch.double)
        value_tensor[0, 0] = self._value
        if CUDA:
            return (self._game_state.convert_to_tensor().cuda(),
                    self._action_probability_tensor.cuda(),
                    value_tensor.cuda())
        return (self._game_state.convert_to_tensor(),
                self._action_probability_tensor,
                value_tensor)
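

# Illustration of the example lifecycle (the `state` and `policy` objects here
# are hypothetical stand-ins, not provided by this module): examples are
# created with an unknown outcome, which is back-filled once the game ends.
#
#     example = TrainingExample(state, policy)   # value still None
#     example.set_value(GameState.PLAYER_ONE)    # +1.0 if state's player is
#                                                # PLAYER_ONE, else -1.0
#     state_t, policy_t, value_t = example.to_tensor_tuple()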


class SelfPlay(object):
    def __init__(self, network, game_engine, num_simulation_iterations, training_augmentor=None, temperature=None):
        """
        :param network: a PredictionNetwork object
        :param game_engine: a GameEngine object
        :param num_simulation_iterations: number of MCTS simulation iterations to perform each turn
        :param training_augmentor: augments generated training examples with symmetries [optional]
        :param temperature: number of moves after which play switches to competitive (greedy) action selection [optional]
        """
        assert isinstance(network, PredictionNetwork)
        assert isinstance(game_engine, GameEngine)
        self.network = network
        self.game_engine = game_engine
        self._agent = AlphaZeroAgent(self.network, self.game_engine, num_simulation_iterations)
        self._training_augmentor = training_augmentor
        # With no temperature given, never switch to competitive play.
        self._temperature = temperature if temperature else inf

    def play(self):
        """
        Execute an entire self-play game.

        :return: a tuple of the identity of the winning player
                 (GameState.PLAYER_ONE or GameState.PLAYER_TWO, or 0 if the game is tied)
                 and the training examples generated by the self-play game, as a list of tensor tuples.
        """
        logger.verbose_debug("Starting self play game")
        training_examples = list()
        game_state = self.game_engine.create_new_game()
        num_moves = 0
        competitive = False
        while not game_state.game_over():
            logger.verbose_debug(f"\r\n{game_state}")
            next_action, mcts_probabilities_tensor = self._agent.choose_action(competitive=competitive,
                                                                               fetch_probabilities=True)
            training_examples.append(TrainingExample(game_state, mcts_probabilities_tensor))
            if self._training_augmentor:
                training_examples.extend(self._training_augmentor.augment(training_examples[-1]))
            logger.verbose_debug(f"Suggested action: {next_action}")
            game_state = game_state.do_action(next_action)
            num_moves += 1
            if num_moves == self._temperature:
                logger.verbose_debug(f"Switching to competitive at move: {num_moves}")
                competitive = True
        # Back-fill every training example with the final game outcome.
        game_score = game_state.get_game_score()
        for training_example in training_examples:
            training_example.set_value(game_score)
        logger.verbose_debug(f"\r\n{game_state}")
        logger.verbose_debug(f"Player: {game_state.get_player()}, game value: {game_score}")
        return game_score, [example.to_tensor_tuple() for example in training_examples]
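

if __name__ == "__main__":
    # Usage sketch (not part of the original module): how a training loop might
    # drive one self-play game. `my_network` and `my_engine` are hypothetical
    # stand-ins for concrete PredictionNetwork and GameEngine implementations,
    # which this file does not provide.
    my_network = ...  # a PredictionNetwork implementation, e.g. a trained model
    my_engine = ...   # a GameEngine implementation for the target game
    self_play = SelfPlay(my_network, my_engine, num_simulation_iterations=100,
                         temperature=10)
    winner, examples = self_play.play()
    print(f"Winner: {winner}, training examples generated: {len(examples)}")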