In [124]:
import warnings
warnings.filterwarnings("ignore")
# imports
from math import sqrt, log
import gym
import copy
import numpy as np
from gym_go.gogame import turn

In [125]:
# Global constants
UCB_C = 2

BLACK = 0
WHITE = 1
TURN_CHNL = 2
INVD_CHNL = 3
PASS_CHNL = 4
DONE_CHNL = 5

# Monte Carlo Three Search

1. Selection
    - Taverse the tree to find greatest UCB-score
2. Expansion
    - If the selected leaf node has been visited before expand by adding weighted game action
3. Rollout
    - Simulate the game until end-condition from the expanded leaf
4. Back-propagation
    - Updating the value of each ancestor node of the expanded leaf


In [126]:
def get_legal_move(env):
    board_shape = env.state().shape[1:]
    pass_id = np.prod(board_shape)
    action = env.action_space.sample() # pick random action
    action2d = action // board_shape[0], action % board_shape[1], action
    while action2d[2] != pass_id and env.state()[3, action2d[0], action2d[1]] == 1:
        action = env.action_space.sample() # pick random action
        action2d = action // board_shape[0], action % board_shape[1], action
    return action2d[2]

In [127]:
class Node():
    def __init__(self, env, parent, action):
        self.env : gym.Env = env # This env will be altered by the other player
        self.value : int = 0 # Value estimate
        self.trials : int = 0 # Number of trials for this node
        self.parent : Node = parent # Parent node of this node
        self.children : list[Node] = [] # List of children of this node
        self.action : int = action # The step action made by this node
    
    # calculate a Upper Confidence Bound
    def ucb(self, total_trials):
        return self.value + ( UCB_C * sqrt(log(total_trials) / self.trials) )
    
    # Add a new node to a leaf node
    def expansion(self):
        if self.env.done:
            return
        board_shape = self.env.state().shape[1:]
        pass_id = np.prod(board_shape)
        for action in range(0, pass_id + 1):
            action2d = action // board_shape[0], action % board_shape[1]
            if action == pass_id or self.env.state()[3, action2d[0], action2d[1]] == 0:
                child_env = copy.deepcopy(self.env)
                child_env.step(action)
                self.children.append(Node(child_env, self, action))

    # Simulate game from current move until end-condition returning the score
    def rollout(self):
        if self.env.done:
            return self.env.reward()
        
        rollout_env = copy.deepcopy(self.env)
        rollout_result = 0
        done = False
        while not done:
            random_action = get_legal_move(rollout_env)
            _, reward, done, _ = rollout_env.step(random_action)
            rollout_result += reward
        return rollout_result

In [128]:
class Monte_Carlo_Tree_Search():
    def __init__(self):
        self.env : gym.Env = gym.make('gym_go:go-v0', size=3, komi=7, reward_method='heuristic')
        self.env.reset()
        self.number_of_trials : int = 0
        self.root = Node(self.env, None, None)
    
    # Update scores of all parent nodes after rollout
    def back_propagation(self, rollout_node: Node, rollout_result):
        current_node = rollout_node
        while current_node != None:
            current_node.trials += 1
            current_node.value += rollout_result
            current_node = current_node.parent
        self.number_of_trials += 1
    
    # find and return the leaf node with the highest UCB-score 
    def selection(self, starting_node: Node):
        selected_child = starting_node
        current_node = starting_node
        while len(current_node.children) > 0:
            selected_child = current_node.children[0]
            if selected_child.trials == 0:
                return selected_child
            current_best_ucb = current_node.children[0].ucb(self.number_of_trials)

            for child in current_node.children:
                if child.trials == 0:
                    return child

                child_ucb = child.ucb(self.number_of_trials)
                if turn(child.env.state()) == BLACK and child_ucb > current_best_ucb:
                    selected_child = child
                    current_best_ucb = child_ucb

                if turn(child.env.state()) == WHITE and child_ucb < current_best_ucb:
                    selected_child = child
                    current_best_ucb = child_ucb

            current_node = selected_child

        return selected_child

    def run(self):
        selected_node = self.root
        selected_node.expansion()
        selected_node = self.root.children[0]

        run = 0
        while not self.selection(self.root).env.done: # metode "not all leaf nodes done"?
            selected_node = self.selection(self.root)
            print("Run:", run, ": Selection", selected_node.action)

            if selected_node.trials > 0:
                print("Run:", run, ": Expansion")
                selected_node.expansion()
                selected_node = selected_node.children[0]

            rollout_result = selected_node.rollout()
            print("Run:", run, ": Rollout", rollout_result)
            
            self.back_propagation(selected_node, rollout_result)
            run += 1
        return selected_node

In [129]:
model = Monte_Carlo_Tree_Search()
best_node = model.run()

Run: 0 : Selection 0
Run: 0 : Rollout -45.0
Run: 1 : Selection 1
Run: 1 : Rollout -106.0
Run: 2 : Selection 2
Run: 2 : Rollout -56.0
Run: 3 : Selection 3
Run: 3 : Rollout -127.0
Run: 4 : Selection 4
Run: 4 : Rollout -72.0
Run: 5 : Selection 5
Run: 5 : Rollout -23.0
Run: 6 : Selection 6
Run: 6 : Rollout -60.0
Run: 7 : Selection 7
Run: 7 : Rollout -220.0
Run: 8 : Selection 8
Run: 8 : Rollout -170.0
Run: 9 : Selection 9
Run: 9 : Rollout -161.0
Run: 10 : Selection 7
Run: 10 : Expansion
Run: 10 : Rollout -15.0
Run: 11 : Selection 1
Run: 11 : Rollout -55.0
Run: 12 : Selection 2
Run: 12 : Rollout -49.0
Run: 13 : Selection 3
Run: 13 : Rollout -42.0
Run: 14 : Selection 4
Run: 14 : Rollout -82.0
Run: 15 : Selection 5
Run: 15 : Rollout -77.0
Run: 16 : Selection 6
Run: 16 : Rollout -83.0
Run: 17 : Selection 8
Run: 17 : Rollout -46.0
Run: 18 : Selection 9
Run: 18 : Rollout -90.0
Run: 19 : Selection 0
Run: 19 : Expansion
Run: 19 : Rollout -9.0
Run: 20 : Selection 2
Run: 20 : Rollout -39.0
Run: 21 : 

In [130]:
best_game : list[Node] = []
current_node = best_node
while current_node != None:
    best_game.append(current_node)
    current_node = current_node.parent

for node in reversed(best_game):
    node.env.render()

	0 1 2 
0	╔═╤═╗
1	╟─┼─╢
2	╚═╧═╝
	Turn: BLACK, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 0, White Area: 0

	0 1 2 
0	╔═╤═╗
1	╟─┼─╢
2	╚═○═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 9, White Area: 0

	0 1 2 
0	●═╤═╗
1	╟─┼─╢
2	╚═○═╝
	Turn: BLACK, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 1, White Area: 1

	0 1 2 
0	●═╤═╗
1	╟─┼─╢
2	○═○═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 2, White Area: 1

	0 1 2 
0	●═╤═╗
1	●─┼─╢
2	○═○═╝
	Turn: BLACK, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 2, White Area: 2

