In [1]:
# imports
from math import sqrt, log
import gym
import copy

In [2]:
# Global constants
# Exploration weight: Node.ucb multiplies the sqrt(log(total_trials) / trials)
# term by this factor, so larger values favour less-visited nodes.
UCB_C = 2

# Monte Carlo Tree Search

1. Selection
    - Traverse the tree, following the child with the greatest UCB score, until a leaf node is reached
2. Expansion
    - If the selected leaf node has been visited before, expand it by adding one child node per available game action
3. Rollout
    - Simulate the game from the expanded leaf with random moves until the end condition is reached
4. Back-propagation
    - Update the value and trial count of the expanded leaf and of each of its ancestor nodes


In [3]:
class Node():
    """One position in the Monte Carlo search tree.

    A node keeps its own environment instance (positioned at this node's
    game state), the running statistics used by the UCB formula, and links
    to its parent and children.
    """

    def __init__(self, env, state, parent):
        """Create a node.

        Args:
            env: Environment positioned at this node's game state.
            state: Observation describing the game state at this node.
            parent: Parent ``Node``, or ``None`` for the root.
        """
        self.env : gym.Env = env
        self.state = state # Gamestate
        self.value : int = 0 # Sum of the rollout results back-propagated through this node
        self.trials : int = 0 # Number of trials for this node
        self.parent : Node = parent # Parent node of this node
        self.children : list[Node] = [] # List of children of this node

    # calculate a Upper Confidence Bound
    def ucb(self, total_trials):
        """Return the UCB1 score of this node.

        Args:
            total_trials: Total number of trials run by the search so far.

        Returns:
            ``inf`` for a node that has never been tried (so it is always
            preferred), otherwise the mean rollout result plus the
            exploration bonus ``UCB_C * sqrt(log(total_trials) / trials)``.
        """
        if self.trials == 0:
            # Avoids the division by zero and makes unvisited nodes win every comparison.
            return float('inf')
        # ``value`` is a running sum, so divide by ``trials`` to get the mean;
        # otherwise the exploitation term grows without bound as visits accumulate.
        mean_value = self.value / self.trials
        # max(..., 1) guards against log(0) if no trial has been counted yet.
        exploration = UCB_C * sqrt(log(max(total_trials, 1)) / self.trials)
        return mean_value + exploration

    # Add a new node to a leaf node
    def expansion(self):
        """Add one child per action of the environment's discrete action space.

        Each child receives its own deep copy of the environment advanced by
        that action. Calling this on an already expanded node is a no-op so
        that children are never duplicated.
        """
        if self.children:
            return
        # A gym Discrete space is not iterable; enumerate its ``n`` actions instead.
        # NOTE(review): assumes every action is accepted by ``step`` in this
        # state; if the environment rejects illegal moves, filter them here.
        for action in range(self.env.action_space.n):
            child_env = copy.deepcopy(self.env)
            # ``step`` returns a tuple whose first element is the new state.
            child_state = child_env.step(action)[0]
            self.children.append(Node(child_env, child_state, self))

    # Simulate game from current move until end-condition returning the score
    def rollout(self):
        """Play random actions on a copy of the environment until it reports done.

        Returns:
            The sum of the rewards collected during the simulated game.
        """
        # Work on a copy so the node's own environment stays at this position.
        rollout_env = copy.deepcopy(self.env)
        rollout_result = 0
        done = False
        while not done:
            random_action = rollout_env.action_space.sample()
            _, reward, done, _ = rollout_env.step(random_action)
            rollout_result += reward
        return rollout_result

class Monte_Carlo_Tree_Search():
    """Monte Carlo Tree Search driver for a 3x3 ``gym_go`` Go environment.

    Holds the environment, the root of the search tree and the total number
    of completed trials (used by the UCB formula).
    """

    def __init__(self):
        """Create the Go environment and a root node at its initial state."""
        self.env = gym.make('gym_go:go-v0', size=3, komi=0, reward_method='real')
        self.number_of_trials = 0
        self.root = Node(self.env, self.env.reset(), None)

    # Update scores of all parent nodes after rollout
    def back_propagation(self, rollout_node: "Node", rollout_result):
        """Add a rollout result to ``rollout_node`` and all of its ancestors.

        Args:
            rollout_node: Node the rollout was started from.
            rollout_result: Score returned by the rollout.
        """
        current_node = rollout_node
        # Walk up the parent links; the root's parent is None.
        while current_node is not None:
            current_node.trials += 1
            current_node.value += rollout_result
            current_node = current_node.parent
        self.number_of_trials += 1

    # find and return the leaf node with the highest UCB-score
    def selection(self, current_node: "Node"):
        """Descend from ``current_node`` to the node to try next.

        At every level an unvisited child is returned immediately; otherwise
        the child with the highest UCB score is followed until a node
        without children is reached.

        Args:
            current_node: Node to start the descent from.

        Returns:
            The selected node (``current_node`` itself if it has no children).
        """
        node = current_node
        while node.children:
            best_child = None
            # Start below every possible score so negative UCB values can win too.
            best_ucb = float('-inf')
            for child in node.children:
                if child.trials == 0:
                    # Unvisited children are tried before any comparison of scores.
                    return child
                child_ucb = child.ucb(self.number_of_trials)
                if best_child is None or child_ucb > best_ucb:
                    best_child, best_ucb = child, child_ucb
            node = best_child
        return node

    def run(self):
        """Run select / expand / rollout / back-propagate until a finished game is selected.

        The loop stops when plane 5 of the selected node's state is entirely
        non-zero -- presumably the environment's game-over marker; confirm
        against the gym_go state layout.
        """
        self.env.reset()
        finished = False
        while not finished:
            selected_node = self.selection(self.root)

            # A leaf that was already tried gets expanded; continue with its first child.
            if selected_node.trials != 0:
                selected_node.expansion()
                if selected_node.children:
                    selected_node = selected_node.children[0]

            if selected_node.state[5].all():
                finished = True
                continue

            rollout_result = selected_node.rollout()
            self.back_propagation(selected_node, rollout_result)

In [4]:
# Build the search object (this creates the Go environment and the root node)
# and show the root's state, i.e. the observation returned by env.reset().
model = Monte_Carlo_Tree_Search()
print(model.root.state)

[[[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]]


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
