In [73]:
from copy import deepcopy
from math import sqrt, log
import gym, random
import numpy as np

In [74]:
# Side length of the Go board; handed to the environment as `size`.
BOARD_SIZE = 5
# Komi setting; handed to the environment as `komi`.
KOMI = 0
# Multiplier on the exploration term computed in Node.ucb.
UCB_CONSTANT = 2

In [75]:
# Smoke test: create an environment with the notebook's board settings, play
# one action, and print the reward the environment reports afterwards.
test_env = gym.make('gym_go:go-v0', size=BOARD_SIZE, komi=KOMI, reward_method='heuristic')
test_env.reset()
test_env.step(2)  # action index 2 -- presumably a point on the first board row; TODO confirm the action encoding
# Debugging aids, currently disabled: draw the board / list the valid action indices.
# test_env.render("terminal")
# print(np.argwhere(test_env.valid_moves()).flatten())
print(test_env.reward())

25.0


1. **Selection**: Start from root R and select successive child nodes until a leaf node L is reached. The root is the current game state and a leaf is any node that has a potential child from which no simulation (playout) has yet been initiated.
2. **Expansion**: Unless L ends the game decisively (e.g. win/loss/draw) for either player, create one (or more) child nodes and choose node C from one of them. Child nodes are any valid moves from the game position defined by L.
3. **Simulation**: Complete one random playout from node C. This step is sometimes also called playout or rollout. A playout may be as simple as choosing uniform random moves until the game is decided (for example in chess, the game is won, lost, or drawn).
4. **Backpropagation**: Use the result of the playout to update information in the nodes on the path from C to R.

In [76]:
from math import inf


class Node():
    """One game position in the Monte Carlo search tree.

    Attributes (all set in ``__init__``):
      env      -- environment object holding this node's position. The class
                  reads ``env.done`` and calls ``env.reward()``; rollouts call
                  ``step()`` and ``uniform_random_action()`` on a deep copy.
      parent   -- the Node this one was expanded from, or None for the root.
      children -- child Nodes; populated by whoever expands this node.
      trials   -- number of playout results recorded on this node.
      value    -- running SUM of those playout results (``ucb`` divides by
                  ``trials`` to obtain the mean).

    NOTE(review): playout results are accumulated exactly as ``env.reward()``
    returns them on every node of the path; nothing here changes sign
    depending on whose move it is. Confirm that this is the intended
    perspective for a two-player game.
    """

    def __init__(self, env, parent = None):
        self.env = env
        self.parent = parent
        self.children = []
        self.trials = 0
        self.value = 0

    def is_leaf_node(self):
        """Return True if the game is over here or no children exist yet."""
        return bool(self.env.done) or not self.children

    def get_max_ucb_child(self, total_trials):
        """Choose the child to descend into during selection.

        The first child (in insertion order) that has never recorded a playout
        is returned immediately; otherwise the child with the highest UCB
        score wins, the earliest one on ties.

        Args:
            total_trials: total number of playouts so far, forwarded to ``ucb``.

        Raises:
            IndexError: if this node has no children.
        """
        if not self.children:
            raise IndexError("get_max_ucb_child() called on a node without children")
        for child in self.children:
            if child.trials == 0:
                return child
        return max(self.children, key=lambda child: child.ucb(total_trials))

    def ucb(self, total_trials, exploration = None):
        """Return the UCB1 score: mean playout result plus exploration bonus.

        Args:
            total_trials: total number of playouts so far. Values below 1 are
                treated as 1 so the logarithm stays defined (bonus of zero).
            exploration: weight of the exploration bonus; defaults to the
                module-level ``UCB_CONSTANT`` when omitted.

        Returns:
            ``inf`` for a node without any recorded playout, otherwise
            ``value / trials + exploration * sqrt(log(total_trials) / trials)``.
        """
        if self.trials == 0:
            # An untried node must always be preferred over a tried one.
            return inf
        if exploration is None:
            exploration = UCB_CONSTANT
        # Exploitation uses the MEAN result; using the raw sum would make the
        # score grow with the visit count and swamp the exploration bonus.
        mean_value = self.value / self.trials
        log_total = log(max(total_trials, 1))
        return mean_value + exploration * sqrt(log_total / self.trials)

    def __backpropagate(self, value):
        """Record ``value`` on this node and on every ancestor up to the root."""
        node = self
        while node is not None:
            # The root (parent is None) is updated too, so its statistics
            # reflect every playout that passed through it.
            node.value += value
            node.trials += 1
            node = node.parent

    def rollout(self):
        """Play uniformly random actions to the end of the game and record the result.

        The playout runs on a deep copy, so ``self.env`` is left untouched.

        Returns:
            The value of ``reward()`` at the end of the playout, or of
            ``self.env.reward()`` when this node's game is already over.
        """
        if self.env.done:
            # NOTE(review): the reward of an already-finished position is
            # returned but not backpropagated. Kept as in the original because
            # recording it would change when MCTS.run stops; confirm intent.
            return self.env.reward()
        rollout_env = deepcopy(self.env)
        while not rollout_env.done:
            rollout_env.step(rollout_env.uniform_random_action())
        result = rollout_env.reward()
        self.__backpropagate(result)
        return result
        

In [77]:
class MCTS():
    """Search driver that grows a tree of ``Node`` objects from a fresh game.

    ``root`` wraps a newly reset environment; ``iterations`` counts how many
    rollouts ``run`` has triggered and is the total passed to the UCB
    selection step.
    """

    def __init__(self, size = BOARD_SIZE, komi = KOMI):
        self.iterations = 0
        root_env = gym.make('gym_go:go-v0', size=size, komi=komi, reward_method='heuristic')
        root_env.reset()
        self.root = Node(root_env)

    def _expand(self, node):
        """Attach one child to ``node`` for every currently valid action index."""
        legal_actions = np.argwhere(node.env.valid_moves()).flatten()
        for move in legal_actions:
            next_env = deepcopy(node.env)
            next_env.step(move)
            node.children.append(Node(next_env, node))

    def run(self):
        """Repeat select / expand / rollout until selection reaches a finished game.

        Each pass walks down from the root via ``get_max_ucb_child`` until it
        hits a leaf. A leaf that already has a recorded trial is expanded and
        one of its new children, chosen at random, is rolled out; a leaf with
        no trials is rolled out directly. After every rollout the walk
        restarts at the root and ``iterations`` is incremented.
        """
        # Marking the root as visited sends it down the expansion path on the
        # first pass instead of being rolled out itself.
        self.root.trials = 1
        node = self.root
        while not node.env.done:
            if not node.is_leaf_node():
                node = node.get_max_ucb_child(self.iterations)
                continue
            if node.trials != 0:
                self._expand(node)
                node = random.choice(node.children)
            node.rollout()
            node = self.root
            self.iterations += 1

In [78]:
# Build a searcher with the default board settings and start the search.
# run() takes no iteration budget: its loop exits only once selection lands on
# a node whose environment reports the game as done.
mcts = MCTS()
mcts.run()